Module: MarkdownServer::Helpers::FetchHelpers

Defined in:
lib/markdown_server/helpers/fetch_helpers.rb

Constant Summary collapse

CSS_TTL =

1 hour

3600
FETCH_MAX_BYTES =
512_000
FETCH_TIMEOUT =
5
ALLOWED_HTML =

Tags kept as-is (attributes stripped)

%w[p h1 h2 h3 h4 h5 h6 blockquote ul ol li
pre br hr strong b em i sup sub code
table tr td th].to_set
BLOCK_HTML =

Block containers — replaced with a newline (content kept)

%w[div section aside figure figcaption
thead tbody tfoot].to_set
STRIP_FULL =

Elements removed completely, including their content

%w[script style nav header footer form input
button select textarea svg iframe noscript].to_set
@@css_cache =

CSS cache for external stylesheets (keyed by absolute URL)

{}

Instance Method Summary collapse

Instance Method Details

#fetch_css(url_str) ⇒ Object

── HTTP Fetching ─────────────────────────────────────────────────────



24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
# File 'lib/markdown_server/helpers/fetch_helpers.rb', line 24

# Fetches an external stylesheet over HTTP(S), caching successful responses
# in @@css_cache (keyed by the URL string) for CSS_TTL seconds.
#
# @param url_str [String] absolute URL of the stylesheet
# @return [String, nil] the stylesheet text as valid UTF-8; nil when the URL
#   is not http/https or the response is not a success. If anything raises,
#   the previously cached body for this URL (possibly stale) is returned,
#   or nil when there is none.
def fetch_css(url_str)
  cached = @@css_cache[url_str]
  return cached[:body] if cached && (Time.now - cached[:at]) < CSS_TTL

  uri = URI.parse(url_str)
  return nil unless uri.is_a?(URI::HTTP) || uri.is_a?(URI::HTTPS)
  http = Net::HTTP.new(uri.host, uri.port)
  http.use_ssl = (uri.scheme == "https")
  http.open_timeout = FETCH_TIMEOUT
  http.read_timeout = FETCH_TIMEOUT
  req = Net::HTTP::Get.new(uri.request_uri)
  req["Accept"] = "text/css"
  resp = http.request(req)

  body = nil
  if resp.is_a?(Net::HTTPSuccess)
    # Net::HTTP hands back a binary-tagged (ASCII-8BIT) body by default.
    # Transcoding that straight to UTF-8 replaces every byte >= 0x80, which
    # corrupts any non-ASCII character in the stylesheet. Re-tag the bytes as
    # UTF-8 (as fetch_follow_redirects does) and only scrub sequences that
    # are genuinely invalid. `dup` avoids mutating the response's own string
    # and copes with the frozen "" produced by nil.to_s for empty bodies.
    body = resp.body.to_s.dup.force_encoding(Encoding::UTF_8).scrub
  end
  @@css_cache[url_str] = { body: body, at: Time.now } if body
  body
rescue
  # Best effort: on any failure fall back to whatever was cached, even if stale.
  @@css_cache.dig(url_str, :body)
end

#fetch_external_page(url_str) ⇒ Object



44
45
46
47
48
49
50
# File 'lib/markdown_server/helpers/fetch_helpers.rb', line 44

# Downloads an external page, following up to 5 redirects.
#
# @param url_str [String] URL to fetch
# @return [String, nil] whatever fetch_follow_redirects returns; nil when the
#   URL does not parse as http/https or when anything raises
def fetch_external_page(url_str)
  target = URI.parse(url_str)
  case target
  when URI::HTTP, URI::HTTPS
    fetch_follow_redirects(target, 5)
  end
rescue StandardError
  nil
end

#fetch_follow_redirects(uri, limit) ⇒ Object



52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
# File 'lib/markdown_server/helpers/fetch_helpers.rb', line 52

# Performs a GET against +uri+ with browser-like headers, following HTTP
# redirects recursively until +limit+ is used up.
#
# @param uri [URI::HTTP, URI::HTTPS] location to request
# @param limit [Integer] remaining number of requests allowed
# @return [String, nil] the body (cut to FETCH_MAX_BYTES bytes, then made
#   valid UTF-8 with "?" replacements) for a successful response whose
#   Content-Type mentions "html" or "text"; nil for everything else,
#   including an exhausted limit and any raised error
def fetch_follow_redirects(uri, limit)
  return nil if limit <= 0

  client = Net::HTTP.new(uri.host, uri.port).tap do |conn|
    conn.use_ssl = (uri.scheme == "https")
    conn.open_timeout = FETCH_TIMEOUT
    conn.read_timeout = FETCH_TIMEOUT
  end

  request = Net::HTTP::Get.new(uri.request_uri)
  {
    "User-Agent" => "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
    "Accept" => "text/html,application/xhtml+xml;q=0.9,*/*;q=0.8",
    "Accept-Language" => "en-US,en;q=0.5"
  }.each { |header, value| request[header] = value }

  response = client.request(request)

  if response.is_a?(Net::HTTPSuccess)
    content_type = response["content-type"].to_s
    return nil unless content_type.match?(/html|text/i)

    # Cut on raw bytes first, then re-tag as UTF-8; a character split by the
    # cut is turned into "?" by the encode call below.
    clipped = response.body.to_s.b[0, FETCH_MAX_BYTES].force_encoding("utf-8")
    clipped.encode("utf-8", invalid: :replace, undef: :replace, replace: "?")
  elsif response.is_a?(Net::HTTPRedirection)
    destination =
      begin
        URI.parse(response["Location"].to_s)
      rescue StandardError
        nil
      end
    return nil unless destination

    # Relative Location headers are resolved against the current URI.
    destination = uri + destination unless destination.absolute?
    return nil unless destination.is_a?(URI::HTTP) || destination.is_a?(URI::HTTPS)

    fetch_follow_redirects(destination, limit - 1)
  end
rescue StandardError
  nil
end

#inject_assets_for_html_path?(_relative_path) ⇒ Boolean

── Asset Injection ───────────────────────────────────────────────────

Returns:

  • (Boolean)


114
115
116
# File 'lib/markdown_server/helpers/fetch_helpers.rb', line 114

# Reports whether assets should be injected for the HTML file at the given
# path. This implementation ignores the argument and always answers true.
# NOTE(review): presumably a hook meant to be overridden — confirm against callers.
#
# @param _relative_path [Object] unused
# @return [Boolean] always true
def inject_assets_for_html_path?(_relative_path)
  true
end

#inject_markdownr_assets(html_content) ⇒ Object



118
119
120
121
122
123
124
125
126
127
128
129
130
131
# File 'lib/markdown_server/helpers/fetch_helpers.rb', line 118

# Passes the HTML through every configured plugin's #transform_html, then
# injects the popup configuration <script> followed by the raw contents of
# views/popup_assets.erb (the file is read as-is, not rendered).
#
# The injected markup goes immediately before the first closing </body> or
# </html> tag (matched case-insensitively); when neither tag is present it is
# appended to the end of the document.
#
# @param html_content [String] HTML document or fragment
# @return [String] the HTML with the popup assets injected
def inject_markdownr_assets(html_content)
  transformed = settings.plugins.reduce(html_content) do |html, plugin|
    plugin.transform_html(html, self)
  end

  config_fields = [
    "localMd:#{settings.popup_local_md}",
    "localHtml:#{settings.popup_local_html}",
    "external:#{settings.popup_external}",
    "externalDomains:#{settings.popup_external_domains.to_json}"
  ]
  config_tag = "<script>var __popupConfig = {#{config_fields.join(',')}};</script>\n"
  payload = config_tag + File.read(File.join(settings.views, "popup_assets.erb"))

  closing_tag = transformed.match(/<\/(body|html)>/i)
  return transformed + payload unless closing_tag

  # Splice the payload in front of the matched closing tag, keeping the tag's
  # original letter case.
  "#{closing_tag.pre_match}#{payload}</#{closing_tag[1]}>#{closing_tag.post_match}"
end

#page_html(raw, base_url = nil) ⇒ Object



91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
# File 'lib/markdown_server/helpers/fetch_helpers.rb', line 91

# Reduces a raw HTML document to a sanitized excerpt of its main content.
#
# Steps, in order:
#   1. remove every STRIP_FULL element together with its content, HTML
#      comments, and a few site-specific boilerplate runs;
#   2. keep the inside of the first <article>, else <main>, else <body>,
#      else the whole document;
#   3. hand that to sanitize_html_tags, drop everything before the first
#      bare <h1>-<h6> tag, and decode entities via decode_html_entities;
#   4. after a "Read full chapter" link, discard text up to the next
#      copyright marker;
#   5. cut the result to at most 10,000 characters.
#
# @param raw [String] full HTML source (not modified; a copy is edited)
# @param base_url [String, nil] forwarded unchanged to sanitize_html_tags
# @return [String] cleaned HTML, at most 10,000 characters long
def page_html(raw, base_url = nil)
  work = raw.dup

  STRIP_FULL.each do |tag|
    work.gsub!(/<#{tag}[^>]*>.*?<\/#{tag}>/im, " ")
  end

  [
    /<!--.*?-->/m,
    /Bible\s+Gateway\s+Recommends[\s\S]*?View\s+more\s+titles/i,
    /trusted\s+resources\s+beside\s+every\s+verse[\s\S]*?Your\s+Content/i,
    /Log\s+In\s*\/\s*Sign\s+Up[\s\S]*?Your\s+Content/i
  ].each { |noise| work.gsub!(noise, " ") }

  # First matching container wins, tried from most to least specific.
  region = nil
  %w[article main body].each do |container|
    hit = work.match(/<#{container}[^>]*>(.*?)<\/#{container}>/im)
    next unless hit

    region = hit[1]
    break
  end
  region ||= work

  cleaned = sanitize_html_tags(region, base_url)
  cleaned.sub!(/\A[\s\S]*?(?=<h[1-6]>)/i, "")
  cleaned = decode_html_entities(cleaned)
  cleaned.gsub!(/(<a[^>]*>Read\s+full\s+chapter<\/a>)[\s\S]*?(?=©|Copyright\b)/i, "\\1\n")

  if cleaned.length > 10_000
    cleaned[0, 10_000]
  else
    cleaned
  end
end

#page_title(html) ⇒ Object

── HTML Extraction & Sanitization ────────────────────────────────────



84
85
86
87
88
89
# File 'lib/markdown_server/helpers/fetch_helpers.rb', line 84

# Extracts the text of the first <title> element.
#
# Nested tags are removed, then character references are decoded in a single
# pass so already-decoded text is never decoded a second time (for example
# "&amp;lt;" yields "&lt;", not "<"). Named references amp, lt, gt, quot, apos
# and nbsp (rendered as a plain space) are decoded case-insensitively, as are
# decimal and hexadecimal numeric references. Anything else that looks like
# a reference — unknown names, control characters, invalid code points — is
# dropped. The result is stripped of surrounding whitespace.
#
# @param html [String] HTML source
# @return [String] the decoded title, or "" when there is no <title> element
def page_title(html)
  found = html.match(/<title[^>]*>(.*?)<\/title>/im)
  return "" unless found

  named = { "amp" => "&", "lt" => "<", "gt" => ">", "quot" => '"', "apos" => "'", "nbsp" => " " }
  text = found[1].gsub(/<[^>]+>/, "")

  decoded = text.gsub(/&#?\w+;/) do |entity|
    ref = entity[1..-2] # strip the leading "&" and trailing ";"
    next named.fetch(ref.downcase, "") unless ref.start_with?("#")

    code =
      if ref.match?(/\A#x\h+\z/i)
        ref[2..-1].to_i(16)
      elsif ref.match?(/\A#\d+\z/)
        ref[1..-1].to_i
      end
    next "" unless code && code >= 0x20

    char =
      begin
        code.chr(Encoding::UTF_8)
      rescue RangeError
        "" # surrogate or out-of-range code point
      end
    # Drop the character instead of raising when the title's encoding cannot
    # hold it (e.g. a binary-tagged string that contains high bytes).
    Encoding.compatible?(text, char) ? char : ""
  end

  decoded.strip
end