Class: Archaeo::UrlRewriter

Inherits:
Object
  • Object
show all
Defined in:
lib/archaeo/url_rewriter.rb

Overview

Rewrites Wayback Machine archive URLs to local file paths.

Used for saving archived pages and their assets for offline browsing. Converts absolute archive URLs into relative paths rooted at a configurable local directory.

Constant Summary collapse

# HTML attribute names that rewrite_url_attrs inspects; a value is rewritten
# only when it starts with the archive prefix.
URL_ATTRS =
%w[src href data-src poster].freeze
# Matches a CSS url(...) token, with optional surrounding whitespace and an
# optional single or double quote on either side; capture group 1 is the URL.
CSS_URL_RE =
/url\(\s*['"]?([^'")\s]+)['"]?\s*\)/

Instance Method Summary collapse

Constructor Details

#initialize(archive_prefix, local_prefix) ⇒ UrlRewriter

Returns a new instance of UrlRewriter.



15
16
17
18
# File 'lib/archaeo/url_rewriter.rb', line 15

# Stores the two prefixes used to translate archive URLs into local paths.
#
# Both arguments are coerced with +to_s+ before being stored.
#
# @param archive_prefix [#to_s] leading part of an archive URL that #rewrite strips
# @param local_prefix [#to_s] local path that #rewrite joins the remainder onto
def initialize(archive_prefix, local_prefix)
  @archive_prefix, @local_prefix = archive_prefix.to_s, local_prefix.to_s
end

Instance Method Details

#rewrite(url) ⇒ Object



20
21
22
23
24
25
# File 'lib/archaeo/url_rewriter.rb', line 20

# Translates one archive URL into a path under the local prefix.
#
# A URL that does not begin with the archive prefix is handed back untouched.
# Otherwise the prefix is removed and the remainder is joined onto the local
# prefix with File.join.
#
# @param url [String] URL to translate
# @return [String] the local path, or +url+ itself when it lies outside the
#   archive prefix
def rewrite(url)
  if url.start_with?(@archive_prefix)
    # The guard above guarantees the prefix sits at the very start of the URL.
    remainder = url.delete_prefix(@archive_prefix)
    File.join(@local_prefix, remainder)
  else
    url
  end
end

#rewrite_batch(urls) ⇒ Object



27
28
29
# File 'lib/archaeo/url_rewriter.rb', line 27

# Applies #rewrite to every entry of a collection.
#
# @param urls [Enumerable<String>] URLs to translate
# @return [Array] the result of #rewrite for each entry, in the same order
def rewrite_batch(urls)
  urls.map(&method(:rewrite))
end

#rewrite_html(html_content) ⇒ Object



31
32
33
34
35
36
37
38
# File 'lib/archaeo/url_rewriter.rb', line 31

# Parses an HTML string, runs every rewriting pass over the parsed document,
# and serialises the result back to HTML.
#
# @param html_content [String] HTML markup handed to Nokogiri::HTML
# @return [Object] whatever +to_html+ returns for the rewritten document
def rewrite_html(html_content)
  document = Nokogiri::HTML(html_content)

  # Passes run in this fixed order; each one receives the same document.
  passes = %i[
    rewrite_url_attrs
    rewrite_srcset_attrs
    rewrite_inline_style_attrs
    rewrite_style_elements
  ]
  passes.each { |pass| send(pass, document) }

  document.to_html
end

#rewrite_srcset_attrs(doc) ⇒ Object



51
52
53
54
55
# File 'lib/archaeo/url_rewriter.rb', line 51

# Replaces the +srcset+ attribute of every element that has one with the
# result of passing its current value through #rewrite_srcset.
#
# The document is modified in place.
#
# @param doc [#css] parsed document (#rewrite_html passes the Nokogiri::HTML result)
def rewrite_srcset_attrs(doc)
  doc.css("[srcset]").each { |node| node["srcset"] = rewrite_srcset(node["srcset"]) }
end

#rewrite_url_attrs(doc) ⇒ Object



40
41
42
43
44
45
46
47
48
49
# File 'lib/archaeo/url_rewriter.rb', line 40

# Rewrites URL-bearing attributes across the document.
#
# For each attribute name in URL_ATTRS, every element carrying that attribute
# is visited; the value is replaced with #rewrite's result only when it starts
# with the archive prefix. Other values (including nil) are left untouched.
# The document is modified in place.
#
# @param doc [#css] parsed document (#rewrite_html passes the Nokogiri::HTML result)
def rewrite_url_attrs(doc)
  URL_ATTRS.each do |attr_name|
    doc.css("[#{attr_name}]").each do |node|
      current = node[attr_name]
      node[attr_name] = rewrite(current) if current&.start_with?(@archive_prefix)
    end
  end
end