Module: URLRewrite

Included in:
WaybackMachineDownloader
Defined in:
lib/wayback_machine_downloader/url_rewrite.rb

Constant Summary collapse

SERVER_SIDE_EXTS =

Server-side file extensions that should work locally.

%w[.php .asp .aspx .jsp .cgi .pl .py].freeze

Instance Method Summary collapse

Instance Method Details

#rewrite_css_urls(content) ⇒ Object



25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
# File 'lib/wayback_machine_downloader/url_rewrite.rb', line 25

# Rewrites absolute `url(...)` references in CSS so they use local paths.
#
# Two passes run over +content+, in place:
# 1. Wayback Machine snapshot URLs of the form
#    `http(s)://web.archive.org/web/<digits>[id_]/http(s)://host/rest`.
# 2. Any remaining absolute `http(s)://host/rest` URL, whatever the host.
#
# In both passes the scheme and host are dropped, the rest of the URL is
# passed to +normalize_path_for_local+, and the reference is re-emitted as
# `url("<result>")` with double quotes.
#
# @param content [String] CSS source; mutated in place via +gsub!+
# @return [String] the same +content+ object
def rewrite_css_urls(content)
  # The host segment stops at a quote or ")" as well as "/", so a host-only
  # reference such as url(http://example.com) cannot run on into a later
  # url(...) and swallow the CSS in between.
  wayback_url = %r{url\(\s*["']?https?://web\.archive\.org/web/\d+(?:id_)?/https?://[^/"')]+([^"')]*?)["']?\s*\)}i
  absolute_url = %r{url\(\s*["']?https?://[^/"')]+([^"')]*?)["']?\s*\)}i

  # Snapshot URLs go first: the generic pattern would otherwise treat
  # web.archive.org as the host.
  [wayback_url, absolute_url].each do |pattern|
    content.gsub!(pattern) do
      "url(\"#{normalize_path_for_local(Regexp.last_match(1))}\")"
    end
  end

  content
end

#rewrite_html_attr_urls(content) ⇒ Object



7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
# File 'lib/wayback_machine_downloader/url_rewrite.rb', line 7

# Rewrites absolute URLs in HTML link attributes so they use local paths.
#
# Only quoted `href`, `src`, `action`, `data-src` and `data-url` attributes
# preceded by whitespace are considered. Two passes run over +content+,
# in place:
# 1. Wayback Machine snapshot URLs of the form
#    `http(s)://web.archive.org/web/<digits>[id_]/http(s)://host/rest`.
# 2. Any remaining absolute `http(s)://host/rest` URL. The host is not
#    compared against anything, so URLs on any domain are rewritten.
#
# In both passes the scheme and host are dropped and the rest of the URL is
# replaced by the result of +normalize_path_for_local+; the attribute name
# and its quotes are kept as written.
#
# @param content [String] HTML source; mutated in place via +gsub!+
# @return [String] the same +content+ object
def rewrite_html_attr_urls(content)
  # The host segment stops at a quote as well as "/", so a host-only value
  # such as href="http://example.com" cannot run on past its closing quote
  # and swallow the markup up to the next quoted attribute.
  wayback_url = %r{(\s(?:href|src|action|data-src|data-url)=["'])https?://web\.archive\.org/web/\d+(?:id_)?/https?://[^/"']+([^"']*)(["'])}i
  absolute_url = %r{(\s(?:href|src|action|data-src|data-url)=["'])https?://[^/"']+([^"']*)(["'])}i

  # Snapshot URLs go first: the generic pattern would otherwise treat
  # web.archive.org as the host.
  [wayback_url, absolute_url].each do |pattern|
    content.gsub!(pattern) do
      prefix, path, suffix = Regexp.last_match.captures
      "#{prefix}#{normalize_path_for_local(path)}#{suffix}"
    end
  end

  content
end

#rewrite_js_urls(content) ⇒ Object



41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
# File 'lib/wayback_machine_downloader/url_rewrite.rb', line 41

# Rewrites absolute URLs inside quoted JavaScript strings to local paths.
#
# Two passes run over +content+, in place:
# 1. Wayback Machine snapshot URLs of the form
#    `http(s)://web.archive.org/web/<digits>[id_]/http(s)://host/rest`.
# 2. Any remaining quoted string that starts with `http(s)://host`,
#    whatever the host.
#
# In both passes the scheme and host are dropped and the rest of the string
# is replaced by the result of +normalize_path_for_local+; the surrounding
# quote characters are kept as written.
#
# @param content [String] JavaScript source; mutated in place via +gsub!+
# @return [String] the same +content+ object
def rewrite_js_urls(content)
  # The host segment stops at a quote as well as "/", so a host-only string
  # such as "http://example.com" cannot run on past its closing quote and
  # swallow the code up to the next quoted string. As a result the captured
  # rest is always empty or begins with "/".
  wayback_url = %r{(["'])https?://web\.archive\.org/web/\d+(?:id_)?/https?://[^/"']+([^"']*)(["'])}i
  absolute_url = %r{(["'])https?://[^/"']+([^"']*)(["'])}i

  # Snapshot URLs go first: the generic pattern would otherwise treat
  # web.archive.org as the host.
  [wayback_url, absolute_url].each do |pattern|
    content.gsub!(pattern) do
      opening, path, closing = Regexp.last_match.captures
      "#{opening}#{normalize_path_for_local(path)}#{closing}"
    end
  end

  content
end