Module: MarkdownServer::Helpers::FetchHelpers
- Defined in:
- lib/markdown_server/helpers/fetch_helpers.rb
Constant Summary collapse
- CSS_TTL =
1 hour
3600
- FETCH_MAX_BYTES =
512_000
- FETCH_TIMEOUT =
5
- ALLOWED_HTML =
Tags kept as-is (attributes stripped)
%w[p h1 h2 h3 h4 h5 h6 blockquote ul ol li pre br hr strong b em i sup sub code table tr td th].to_set
- BLOCK_HTML =
Block containers — replaced with a newline (content kept)
%w[div section aside figure figcaption thead tbody tfoot].to_set
- STRIP_FULL =
Elements removed completely, including their content
%w[script style nav header footer form input button select textarea svg iframe noscript].to_set
- @@css_cache =
CSS cache for external stylesheets (keyed by absolute URL)
{}
Instance Method Summary collapse
-
#fetch_css(url_str) ⇒ Object
── HTTP Fetching ─────────────────────────────────────────────────────.
- #fetch_external_page(url_str) ⇒ Object
- #fetch_follow_redirects(uri, limit) ⇒ Object
-
#inject_assets_for_html_path?(_relative_path) ⇒ Boolean
── Asset Injection ───────────────────────────────────────────────────.
- #inject_markdownr_assets(html_content) ⇒ Object
- #page_html(raw, base_url = nil) ⇒ Object
-
#page_title(html) ⇒ Object
── HTML Extraction & Sanitization ────────────────────────────────────.
Instance Method Details
#fetch_css(url_str) ⇒ Object
── HTTP Fetching ─────────────────────────────────────────────────────
24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 |
# File 'lib/markdown_server/helpers/fetch_helpers.rb', line 24
#
# Fetches an external stylesheet and returns its text as UTF-8.
#
# Successful responses are stored in @@css_cache (keyed by the URL string) and
# reused while they are younger than CSS_TTL seconds. If anything raises
# during the fetch, the previously cached body for that URL (if any) is
# returned instead.
#
# @param url_str [String] absolute http(s) URL of the stylesheet
# @return [String, nil] stylesheet text; nil when the URL is not http(s), the
#   response is not a success, or the fetch failed with nothing cached
def fetch_css(url_str)
  cached = @@css_cache[url_str]
  return cached[:body] if cached && (Time.now - cached[:at]) < CSS_TTL

  uri = URI.parse(url_str)
  # URI::HTTPS is a subclass of URI::HTTP, so one check covers both schemes.
  return nil unless uri.is_a?(URI::HTTP)

  http = Net::HTTP.new(uri.host, uri.port)
  http.use_ssl = (uri.scheme == "https")
  http.open_timeout = FETCH_TIMEOUT
  http.read_timeout = FETCH_TIMEOUT

  req = Net::HTTP::Get.new(uri.request_uri)
  req["Accept"] = "text/css"
  resp = http.request(req)
  return nil unless resp.is_a?(Net::HTTPSuccess)

  # Net::HTTP returns the body as a binary (ASCII-8BIT) string. Transcoding
  # binary to UTF-8 replaces every non-ASCII byte, which mangles valid UTF-8
  # stylesheets; relabel the bytes as UTF-8 and scrub only what is actually
  # invalid. `dup` because nil.to_s can be a frozen string.
  body = resp.body.to_s.dup.force_encoding(Encoding::UTF_8).scrub
  @@css_cache[url_str] = { body: body, at: Time.now }
  body
rescue StandardError
  # Best effort: serve a stale cached copy (or nil) when the fetch fails.
  @@css_cache.dig(url_str, :body)
end
#fetch_external_page(url_str) ⇒ Object
44 45 46 47 48 49 50 |
# File 'lib/markdown_server/helpers/fetch_helpers.rb', line 44
#
# Fetches an external page, allowing up to five hops through
# fetch_follow_redirects.
#
# @param url_str [String] URL to fetch
# @return [Object, nil] whatever fetch_follow_redirects returns; nil when the
#   URL cannot be parsed, is not http(s), has no host, or anything raises
def fetch_external_page(url_str)
  uri = URI.parse(url_str)
  # URI::HTTPS is a subclass of URI::HTTP, so one check covers both schemes.
  return nil unless uri.is_a?(URI::HTTP)
  # A URL such as "http:///path" parses but names no server to contact.
  return nil if uri.host.to_s.empty?

  fetch_follow_redirects(uri, 5)
rescue StandardError
  nil
end
#fetch_follow_redirects(uri, limit) ⇒ Object
52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 |
# File 'lib/markdown_server/helpers/fetch_helpers.rb', line 52
#
# Performs a GET for +uri+, following HTTP redirects, and returns the response
# body as a UTF-8 string capped at FETCH_MAX_BYTES bytes.
#
# The body is streamed and reading stops as soon as the cap is reached, so an
# oversized response is not downloaded in full.
#
# @param uri [URI::HTTP] absolute http(s) URI to request
# @param limit [Integer] remaining number of requests that may be made
# @return [String, nil] page text (invalid bytes replaced by "?"); nil when the
#   limit is exhausted, the response is neither a success nor a usable
#   redirect, the content type does not mention html/text, or anything raises
def fetch_follow_redirects(uri, limit)
  return nil if limit <= 0

  http = Net::HTTP.new(uri.host, uri.port)
  http.use_ssl = (uri.scheme == "https")
  http.open_timeout = FETCH_TIMEOUT
  http.read_timeout = FETCH_TIMEOUT

  req = Net::HTTP::Get.new(uri.request_uri)
  req["User-Agent"] = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
  req["Accept"] = "text/html,application/xhtml+xml;q=0.9,*/*;q=0.8"
  req["Accept-Language"] = "en-US,en;q=0.5"

  page = nil
  location = nil
  http.request(req) do |resp|
    case resp
    when Net::HTTPSuccess
      if resp["content-type"].to_s.match?(/html|text/i)
        page = String.new # binary buffer for the raw bytes
        resp.read_body do |chunk|
          page << chunk
          break if page.bytesize >= FETCH_MAX_BYTES
        end
      end
    when Net::HTTPRedirection
      location = resp["Location"].to_s
    end
    # Leave the request block explicitly: on a normal block return Net::HTTP
    # reads whatever is left of the body, which would defeat the size cap.
    break
  end

  if page
    # The last chunk may overshoot the cap, and the cut may split a multibyte
    # character; scrub turns any such fragment into "?".
    page.byteslice(0, FETCH_MAX_BYTES).force_encoding(Encoding::UTF_8).scrub("?")
  elsif location
    # A redirect without a target would only re-request the same URL.
    return nil if location.empty?

    # URI#+ resolves relative targets against the current URI and returns
    # absolute targets unchanged; an unparsable target raises and yields nil.
    target = uri + location
    return nil unless target.is_a?(URI::HTTP)

    fetch_follow_redirects(target, limit - 1)
  end
rescue StandardError
  nil
end
#inject_assets_for_html_path?(_relative_path) ⇒ Boolean
── Asset Injection ───────────────────────────────────────────────────
114 115 116 |
# File 'lib/markdown_server/helpers/fetch_helpers.rb', line 114
#
# Decides whether assets should be injected for the given HTML path.
# This implementation ignores the path and always answers yes.
#
# @param _relative_path [Object] unused
# @return [Boolean] always true
def inject_assets_for_html_path?(_relative_path)
  true
end
#inject_markdownr_assets(html_content) ⇒ Object
118 119 120 121 122 123 124 125 126 127 128 129 130 131 |
# File 'lib/markdown_server/helpers/fetch_helpers.rb', line 118
#
# Runs the HTML through every configured plugin, then injects the popup
# configuration script followed by the contents of views/popup_assets.erb.
# The assets go immediately before the first closing </body> or </html> tag,
# or at the very end when the document has neither.
#
# @param html_content [String] HTML document to decorate
# @return [String] the HTML with the popup assets inserted
def inject_markdownr_assets(html_content)
  transformed = settings.plugins.reduce(html_content) do |html, plugin|
    plugin.transform_html(html, self)
  end

  config_fields = [
    "localMd:#{settings.popup_local_md}",
    "localHtml:#{settings.popup_local_html}",
    "external:#{settings.popup_external}",
    "externalDomains:#{settings.popup_external_domains.to_json}"
  ]
  snippet = "<script>var __popupConfig = {#{config_fields.join(',')}};</script>\n"
  snippet += File.read(File.join(settings.views, "popup_assets.erb"))

  closing_tag = /<\/(body|html)>/i
  return transformed + snippet unless transformed.match?(closing_tag)

  transformed.sub(closing_tag) { "#{snippet}</#{Regexp.last_match(1)}>" }
end
#page_html(raw, base_url = nil) ⇒ Object
91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 |
# Reduces a raw HTML document to a trimmed content fragment.
#
# Steps, in order:
#   1. Works on a copy of +raw+ and blanks out every STRIP_FULL element
#      together with its content, then HTML comments.
#   2. Blanks out three site-specific boilerplate runs (the "Bible Gateway
#      Recommends … View more titles" block and two "… Your Content" blocks).
#   3. Selects the inside of the first <article>, else <main>, else <body>,
#      else the whole document.
#   4. Passes that content and +base_url+ to a helper.
#      NOTE(review): this listing shows `out = (content, base_url)` with no
#      method name, which is not valid Ruby. The callee's name appears to
#      have been lost; confirm against the real file.
#   5. Drops everything before the first attribute-less <h1>–<h6> opening tag.
#   6. Decodes entities via decode_html_entities (defined elsewhere).
#   7. After a "Read full chapter" link, removes the text up to the next
#      "©" or "Copyright".
#   8. Truncates the result to 10,000 characters.
#
# @param raw [String] raw HTML document
# @param base_url [String, nil] forwarded to the helper in step 4
# @return [String] the processed fragment, at most 10,000 characters long
# File 'lib/markdown_server/helpers/fetch_helpers.rb', line 91 def page_html(raw, base_url = nil) w = raw.dup STRIP_FULL.each { |t| w.gsub!(/<#{t}[^>]*>.*?<\/#{t}>/im, " ") } w.gsub!(/<!--.*?-->/m, " ") w.gsub!(/Bible\s+Gateway\s+Recommends[\s\S]*?View\s+more\s+titles/i, " ") w.gsub!(/trusted\s+resources\s+beside\s+every\s+verse[\s\S]*?Your\s+Content/i, " ") w.gsub!(/Log\s+In\s*\/\s*Sign\s+Up[\s\S]*?Your\s+Content/i, " ") content = w.match(/<article[^>]*>(.*?)<\/article>/im)&.[](1) || w.match(/<main[^>]*>(.*?)<\/main>/im)&.[](1) || w.match(/<body[^>]*>(.*?)<\/body>/im)&.[](1) || w out = (content, base_url) out.sub!(/\A[\s\S]*?(?=<h[1-6]>)/i, "") out = decode_html_entities(out) out.gsub!(/(<a[^>]*>Read\s+full\s+chapter<\/a>)[\s\S]*?(?=©|Copyright\b)/i, "\\1\n") out.length > 10_000 ? out[0, 10_000] : out end |
#page_title(html) ⇒ Object
── HTML Extraction & Sanitization ────────────────────────────────────
84 85 86 87 88 89 |
# File 'lib/markdown_server/helpers/fetch_helpers.rb', line 84
#
# Extracts the text of the first <title> element.
#
# Nested markup is removed, the entities &amp; &lt; &gt; &quot; are decoded
# (case-insensitively), every other entity is dropped, and surrounding
# whitespace is trimmed.
#
# @param html [String, nil] raw HTML document
# @return [String] the cleaned title, or "" when there is no <title>
def page_title(html)
  raw_title = html.to_s[/<title[^>]*>(.*?)<\/title>/im, 1]
  return "" unless raw_title

  known_entities = { "amp" => "&", "lt" => "<", "gt" => ">", "quot" => '"' }
  without_tags = raw_title.gsub(/<[^>]+>/, "")
  # Decode in a single pass so already-decoded text is never decoded again
  # (e.g. "&amp;lt;" becomes the literal text "&lt;", not "<").
  decoded = without_tags.gsub(/&(#?\w+);/) do
    known_entities.fetch(Regexp.last_match(1).downcase, "")
  end
  decoded.strip
end