Module: PageRequisites

Defined in:
lib/wayback_machine_downloader/page_requisites.rb

Constant Summary collapse

ASSET_REGEX =

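Regex to find links in href, src, data-src, data-url, url(), and srcset. The pattern itself matches any value; data: URIs, mailto: links, javascript: URLs, and anchors (#...) are filtered out afterwards by valid_asset?.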

/(?:href|src|data-src|data-url)\s*=\s*["']([^"']+)["']|url\(\s*["']?([^"'\)]+)["']?\s*\)|srcset\s*=\s*["']([^"']+)["']/i

Class Method Summary collapse

Class Method Details

.extract(html_content) ⇒ Object



6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
# File 'lib/wayback_machine_downloader/page_requisites.rb', line 6

# Collects the unique asset URLs referenced by a chunk of HTML/CSS text.
#
# ASSET_REGEX has three capture groups, exactly one of which is set per match:
# an attribute value (href/src/data-src/data-url), a CSS url(...) argument, or
# a srcset attribute value. Which group matched decides how the value is
# handled, so srcset lists are always split into their candidate URLs
# regardless of which descriptors (1x, 2x, 480w, ...) they carry.
#
# @param html_content [String, nil] text to scan; nil yields an empty result
# @return [Array<String>] URLs accepted by valid_asset?, de-duplicated, in
#   order of first appearance
def self.extract(html_content)
  return [] if html_content.nil?

  assets = []

  html_content.scan(ASSET_REGEX) do |attr_url, css_url, srcset|
    if srcset
      # srcset is a comma separated list such as "image.jpg 1x, image2.jpg 2x";
      # the URL is the first whitespace-delimited token of each candidate.
      srcset.split(',').each do |candidate|
        src_url = candidate.split.first
        # An empty segment (e.g. "a.jpg 1x, , b.jpg 2x") has no token at all.
        assets << src_url if src_url && valid_asset?(src_url)
      end
    else
      # The url(...) group can capture padding, e.g. "url( bg.png )".
      url = (attr_url || css_url)&.strip
      assets << url if url && valid_asset?(url)
    end
  end

  assets.uniq
end

.valid_asset?(url) ⇒ Boolean

Returns:

  • (Boolean)


28
29
30
31
32
# File 'lib/wayback_machine_downloader/page_requisites.rb', line 28

# Tells whether an extracted URL is worth treating as a downloadable asset.
#
# Blank values are rejected, as are data: URIs, mailto: links, javascript:
# URLs and in-page anchors (#...). The check is done on the trimmed value and
# ignores letter case, because URL schemes are case-insensitive and attribute
# values may carry surrounding whitespace.
#
# @param url [String, nil] candidate URL; nil is treated as blank
# @return [Boolean] true when the URL should be kept
def self.valid_asset?(url)
  candidate = url.to_s.strip
  return false if candidate.empty?

  !candidate.downcase.start_with?('data:', 'mailto:', '#', 'javascript:')
end