Class: Archaeo::UrlRewriter

Inherits:
Object
  • Object
show all
Defined in:
lib/archaeo/url_rewriter.rb

Overview

Rewrites Wayback Machine archive URLs to local file paths.

Used for saving archived pages and their assets for offline browsing. Converts absolute archive URLs into relative paths rooted at a configurable local directory.

Supports HTML attributes, srcset, inline styles, CSS url(), JavaScript string URLs, and server-side extension handling.

Constant Summary collapse

# HTML attribute names that #rewrite_url_attrs scans for archive URLs.
URL_ATTRS =
%w[src href data-src data-url poster action].freeze
# Matches a CSS `url(...)` token with optional single/double quotes and
# surrounding whitespace; capture group 1 is the bare URL text (no quotes,
# parentheses, or whitespace). Used by #rewrite_css.
CSS_URL_RE =
/url\(\s*['"]?([^'")\s]+)['"]?\s*\)/
# Matches the leading part of a Wayback Machine URL: http or https scheme,
# web.archive.org/web/, a run of digits, an optional "id_" suffix, then "/".
# The pattern is unanchored, so it matches anywhere inside a string.
ARCHIVE_RE =
%r{https?://web\.archive\.org/web/\d+(?:id_)?/}
# Matches a quoted string whose content starts with a Wayback Machine URL;
# capture group 1 is the URL without the surrounding quotes. The opening and
# closing quote characters are matched independently of each other.
# Used by #rewrite_js.
JS_URL_RE =
/['"](https?:\/\/web\.archive\.org\/web\/\d+(?:id_)?\/[^'"]+)['"]/
# File extensions of server-side scripts.
# NOTE(review): the consumer of this list is not visible in this section;
# presumably it backs the `server_extensions:` constructor option -- confirm.
SERVER_EXTENSIONS =
%w[.php .asp .aspx .jsp .cgi .pl .py].freeze

Instance Method Summary collapse

Constructor Details

#initialize(archive_prefix, local_prefix, rewrite_js: false, rewrite_absolute: false, server_extensions: false) ⇒ UrlRewriter

Returns a new instance of UrlRewriter.



22
23
24
25
26
27
28
29
30
# File 'lib/archaeo/url_rewriter.rb', line 22

# Builds a rewriter that maps URLs under +archive_prefix+ to paths under
# +local_prefix+.
#
# @param archive_prefix [#to_s] leading part of the archive URLs to rewrite;
#   stored as a String
# @param local_prefix [#to_s] local directory the rewritten paths are joined
#   onto; stored as a String
# @param rewrite_js [Boolean] when falsy, #rewrite_js returns its input
#   untouched
# @param rewrite_absolute [Boolean] when truthy, URLs matching ARCHIVE_RE are
#   handed to #rewrite_absolute_url instead of being prefix-stripped
# @param server_extensions [Boolean] stored as-is; its consumer is not part
#   of this section (presumably related to SERVER_EXTENSIONS -- confirm)
def initialize(archive_prefix, local_prefix,
               rewrite_js: false, rewrite_absolute: false,
               server_extensions: false)
  # Both prefixes are normalised to Strings so later start_with?/sub calls
  # work regardless of what the caller passed in.
  @archive_prefix, @local_prefix = archive_prefix.to_s, local_prefix.to_s
  @rewrite_js, @rewrite_absolute, @server_extensions =
    rewrite_js, rewrite_absolute, server_extensions
end

Instance Method Details

#rewrite(url) ⇒ Object



32
33
34
35
36
37
38
39
40
41
# File 'lib/archaeo/url_rewriter.rb', line 32

# Converts a single URL to its local counterpart.
#
# With +rewrite_absolute+ enabled, any URL matching ARCHIVE_RE is delegated
# to #rewrite_absolute_url. Otherwise a URL that begins with the configured
# archive prefix has that prefix removed and the remainder joined onto the
# local prefix. Every other URL is returned unchanged.
#
# @param url [String] URL to rewrite
# @return [String] rewritten path, or +url+ itself when nothing applies
def rewrite(url)
  return rewrite_absolute_url(url) if @rewrite_absolute && url.match?(ARCHIVE_RE)

  if url.start_with?(@archive_prefix)
    # The prefix is known to sit at index 0, so removing the first
    # occurrence strips exactly the leading prefix.
    File.join(@local_prefix, url.sub(@archive_prefix, ""))
  else
    url
  end
end

#rewrite_batch(urls) ⇒ Object



43
44
45
# File 'lib/archaeo/url_rewriter.rb', line 43

# Applies #rewrite to every entry of a collection.
#
# @param urls [Enumerable<String>] URLs to rewrite
# @return [Array<String>] the rewritten URLs, in the same order
def rewrite_batch(urls)
  urls.map do |candidate|
    rewrite(candidate)
  end
end

#rewrite_css(css_content) ⇒ Object



67
68
69
70
71
72
73
74
75
76
# File 'lib/archaeo/url_rewriter.rb', line 67

# Rewrites archive URLs found inside CSS +url(...)+ tokens.
#
# A token is replaced when its URL matches ARCHIVE_RE or begins with the
# configured archive prefix; the replacement is always emitted as
# +url('<rewritten>')+ with single quotes. All other tokens are left exactly
# as they appeared in the input.
#
# @param css_content [String] stylesheet text
# @return [String] a new string with qualifying url() tokens rewritten
def rewrite_css(css_content)
  css_content.gsub(CSS_URL_RE) do |token|
    target = Regexp.last_match(1)
    archived = target.match?(ARCHIVE_RE) || target.start_with?(@archive_prefix)
    archived ? "url('#{rewrite(target)}')" : token
  end
end

#rewrite_html(html_content) ⇒ Object



47
48
49
50
51
52
53
54
# File 'lib/archaeo/url_rewriter.rb', line 47

# Parses an HTML document, rewrites the URLs it contains, and serialises it
# again.
#
# The passes run in this order: URL-bearing attributes, +srcset+ attributes,
# inline +style+ attributes, then +<style>+ elements.
#
# @param html_content [String] HTML markup
# @return [String] the document as serialised by Nokogiri's +to_html+
def rewrite_html(html_content)
  Nokogiri::HTML(html_content).tap do |document|
    rewrite_url_attrs(document)
    rewrite_srcset_attrs(document)
    rewrite_inline_style_attrs(document)
    rewrite_style_elements(document)
  end.to_html
end

#rewrite_js(js_content) ⇒ Object



56
57
58
59
60
61
62
63
64
65
# File 'lib/archaeo/url_rewriter.rb', line 56

# Rewrites quoted Wayback Machine URLs inside JavaScript source.
#
# Does nothing (returns the input object itself) unless the rewriter was
# created with +rewrite_js+ enabled. Each string literal matched by
# JS_URL_RE has its URL passed through #rewrite and is re-emitted wrapped in
# the literal's opening quote character on both sides.
#
# @param js_content [String] JavaScript source text
# @return [String] the rewritten source, or +js_content+ when disabled
def rewrite_js(js_content)
  if @rewrite_js
    js_content.gsub(JS_URL_RE) do |literal|
      delimiter = literal[0]
      "#{delimiter}#{rewrite(Regexp.last_match(1))}#{delimiter}"
    end
  else
    js_content
  end
end

#rewrite_srcset_attrs(doc) ⇒ Object



93
94
95
96
97
# File 'lib/archaeo/url_rewriter.rb', line 93

# Replaces the +srcset+ attribute of every element that has one with the
# result of #rewrite_srcset. The document is modified in place.
#
# @param doc [#css] parsed document to update
def rewrite_srcset_attrs(doc)
  doc.css("[srcset]").each { |node| node["srcset"] = rewrite_srcset(node["srcset"]) }
end

#rewrite_url_attrs(doc) ⇒ Object



78
79
80
81
82
83
84
85
86
87
88
89
90
91
# File 'lib/archaeo/url_rewriter.rb', line 78

# Rewrites archive URLs held in the attributes listed in URL_ATTRS.
#
# For each attribute name, every element carrying that attribute is visited.
# With +rewrite_absolute+ enabled, a value matching ARCHIVE_RE is replaced by
# the result of #rewrite_absolute_url; otherwise a value beginning with the
# configured archive prefix is replaced by the result of #rewrite. Values
# that satisfy neither condition are not reassigned. The document is
# modified in place.
#
# @param doc [#css] parsed document to update
def rewrite_url_attrs(doc)
  URL_ATTRS.each do |name|
    doc.css("[#{name}]").each do |node|
      value = node[name]
      next unless value

      # The absolute-URL check takes precedence over the prefix check.
      if @rewrite_absolute && value.match?(ARCHIVE_RE)
        node[name] = rewrite_absolute_url(value)
        next
      end

      node[name] = rewrite(value) if value.start_with?(@archive_prefix)
    end
  end
end