Class: Archaeo::UrlRewriter
- Inherits: Object
  - Object
    - Archaeo::UrlRewriter
- Defined in:
- lib/archaeo/url_rewriter.rb
Overview
Rewrites Wayback Machine archive URLs to local file paths.
Used for saving archived pages and their assets for offline browsing. Converts absolute archive URLs into relative paths rooted at a configurable local directory.
Supports HTML attributes, srcset, inline styles, CSS url(), JavaScript string URLs, and server-side extension handling.
Constant Summary
- URL_ATTRS =
%w[src href data-src data-url poster action].freeze
- CSS_URL_RE =
/url\(\s*['"]?([^'")\s]+)['"]?\s*\)/
- ARCHIVE_RE =
%r{https?://web\.archive\.org/web/\d+(?:id_)?/}
- JS_URL_RE =
/['"](https?:\/\/web\.archive\.org\/web\/\d+(?:id_)?\/[^'"]+)['"]/
- SERVER_EXTENSIONS =
%w[.php .asp .aspx .jsp .cgi .pl .py].freeze
Instance Method Summary
- #initialize(archive_prefix, local_prefix, rewrite_js: false, rewrite_absolute: false, server_extensions: false) ⇒ UrlRewriter (constructor)
  Returns a new instance of UrlRewriter.
- #rewrite(url) ⇒ Object
- #rewrite_batch(urls) ⇒ Object
- #rewrite_css(css_content) ⇒ Object
- #rewrite_html(html_content) ⇒ Object
- #rewrite_js(js_content) ⇒ Object
- #rewrite_srcset_attrs(doc) ⇒ Object
- #rewrite_url_attrs(doc) ⇒ Object
Constructor Details
#initialize(archive_prefix, local_prefix, rewrite_js: false, rewrite_absolute: false, server_extensions: false) ⇒ UrlRewriter
Returns a new instance of UrlRewriter.
22 23 24 25 26 27 28 29 30 |
# File 'lib/archaeo/url_rewriter.rb', line 22
#
# Stores the two prefixes (coerced to String) and the three option flags.
#
# @param archive_prefix [#to_s] prefix that marks a URL as rewritable
# @param local_prefix [#to_s] root that rewritten paths are joined onto
# @param rewrite_js [Boolean] stored as-is; #rewrite_js is a no-op unless truthy
# @param rewrite_absolute [Boolean] stored as-is; when truthy, URLs matching
#   ARCHIVE_RE are routed to rewrite_absolute_url
# @param server_extensions [Boolean] stored as-is (consumer not shown in this listing)
def initialize(archive_prefix, local_prefix,
               rewrite_js: false, rewrite_absolute: false, server_extensions: false)
  # to_s lets callers pass Symbols, Pathnames, nil, etc. for either prefix.
  @archive_prefix, @local_prefix = archive_prefix.to_s, local_prefix.to_s
  @rewrite_js = rewrite_js
  @rewrite_absolute = rewrite_absolute
  @server_extensions = server_extensions
end
Instance Method Details
#rewrite(url) ⇒ Object
32 33 34 35 36 37 38 39 40 41 |
# File 'lib/archaeo/url_rewriter.rb', line 32
#
# Maps a single URL onto the local prefix.
#
# When absolute rewriting is enabled and the URL matches ARCHIVE_RE, the work
# is delegated to rewrite_absolute_url (defined elsewhere in the class).
# Otherwise a URL that begins with the configured archive prefix has that
# prefix removed and the remainder joined onto the local prefix; any other URL
# is returned unchanged.
#
# @param url [String]
# @return [Object] the rewritten path, or +url+ itself when nothing applies
def rewrite(url)
  return rewrite_absolute_url(url) if @rewrite_absolute && url.match?(ARCHIVE_RE)
  return url unless url.start_with?(@archive_prefix)

  # The guard above guarantees the prefix sits at index 0, so stripping it
  # yields exactly the archive-relative remainder.
  remainder = url.delete_prefix(@archive_prefix)
  File.join(@local_prefix, remainder)
end
#rewrite_batch(urls) ⇒ Object
43 44 45 |
# File 'lib/archaeo/url_rewriter.rb', line 43
#
# Applies #rewrite to every element of +urls+, preserving order.
#
# @param urls [#map] collection of URLs
# @return [Array] one #rewrite result per input element
def rewrite_batch(urls)
  urls.map do |candidate|
    rewrite(candidate)
  end
end
#rewrite_css(css_content) ⇒ Object
67 68 69 70 71 72 73 74 75 76 |
# File 'lib/archaeo/url_rewriter.rb', line 67
#
# Rewrites the targets of CSS +url(...)+ references.
#
# Each CSS_URL_RE match whose target matches ARCHIVE_RE or begins with the
# archive prefix is replaced by a single-quoted +url('...')+ wrapping the
# result of #rewrite. Every other match is emitted exactly as it was found.
#
# @param css_content [String]
# @return [String] a new string with the substitutions applied
def rewrite_css(css_content)
  css_content.gsub(CSS_URL_RE) do |whole_match|
    target = Regexp.last_match(1)
    archived = target.match?(ARCHIVE_RE) || target.start_with?(@archive_prefix)
    archived ? "url('#{rewrite(target)}')" : whole_match
  end
end
#rewrite_html(html_content) ⇒ Object
47 48 49 50 51 52 53 54 |
# File 'lib/archaeo/url_rewriter.rb', line 47
#
# Parses +html_content+ with Nokogiri, runs the four rewrite passes over the
# parsed document in a fixed order (URL attributes, srcset attributes, inline
# style attributes, style elements), and serialises the result.
#
# @param html_content [String]
# @return [String] the document's +to_html+ output after all passes
def rewrite_html(html_content)
  Nokogiri::HTML(html_content).tap do |document|
    rewrite_url_attrs(document)
    rewrite_srcset_attrs(document)
    rewrite_inline_style_attrs(document)
    rewrite_style_elements(document)
  end.to_html
end
#rewrite_js(js_content) ⇒ Object
56 57 58 59 60 61 62 63 64 65 |
# File 'lib/archaeo/url_rewriter.rb', line 56
#
# Rewrites quoted archive URLs inside JavaScript source.
#
# Does nothing (returns the argument itself) unless the instance was built
# with a truthy +rewrite_js+ flag. Otherwise each JS_URL_RE match is replaced
# by the #rewrite result, wrapped on both sides in the match's opening quote
# character.
#
# @param js_content [String]
# @return [String]
def rewrite_js(js_content)
  return js_content unless @rewrite_js

  js_content.gsub(JS_URL_RE) do |literal|
    # First character of the match is the opening quote; it is reused to
    # close the literal as well.
    delimiter = literal[0]
    "#{delimiter}#{rewrite(Regexp.last_match(1))}#{delimiter}"
  end
end
#rewrite_srcset_attrs(doc) ⇒ Object
93 94 95 96 97 |
# File 'lib/archaeo/url_rewriter.rb', line 93
#
# Replaces the +srcset+ attribute of every element that has one with the
# result of rewrite_srcset (defined elsewhere in the class). Mutates +doc+.
#
# @param doc [Object] parsed document responding to +css+
# @return [Object] whatever +each+ returns for the matched node collection
def rewrite_srcset_attrs(doc)
  doc.css("[srcset]").each do |node|
    current = node["srcset"]
    node["srcset"] = rewrite_srcset(current)
  end
end
#rewrite_url_attrs(doc) ⇒ Object
78 79 80 81 82 83 84 85 86 87 88 89 90 91 |
# File 'lib/archaeo/url_rewriter.rb', line 78
#
# Rewrites URL-bearing attributes in place.
#
# For each attribute name in URL_ATTRS, visits every element carrying that
# attribute. A value matching ARCHIVE_RE is passed to rewrite_absolute_url
# when absolute rewriting is enabled; otherwise a value beginning with the
# archive prefix is passed to #rewrite. Values matching neither rule are left
# untouched. Mutates +doc+.
#
# @param doc [Object] parsed document responding to +css+
# @return [Array<String>] URL_ATTRS (the receiver of the outer +each+)
def rewrite_url_attrs(doc)
  URL_ATTRS.each do |attr_name|
    doc.css("[#{attr_name}]").each do |node|
      value = node[attr_name]
      next unless value

      if @rewrite_absolute && value.match?(ARCHIVE_RE)
        node[attr_name] = rewrite_absolute_url(value)
        next
      end

      node[attr_name] = rewrite(value) if value.start_with?(@archive_prefix)
    end
  end
end