Module: PageRequisites
- Defined in:
- lib/wayback_machine_downloader/page_requisites.rb
Constant Summary collapse
- ASSET_REGEX =
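Regex that finds asset references in href, src, data-src and data-url attributes, in CSS url() values, and in srcset attributes. The pattern itself captures every such value; data: URIs, mailto: and javascript: links, and anchors are ignored afterwards by valid_asset?.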
/(?:href|src|data-src|data-url)\s*=\s*["']([^"']+)["']|url\(\s*["']?([^"'\)]+)["']?\s*\)|srcset\s*=\s*["']([^"']+)["']/i
Class Method Summary collapse
Class Method Details
.extract(html_content) ⇒ Object
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 |
# File 'lib/wayback_machine_downloader/page_requisites.rb', line 6

# Collects the asset URLs referenced by a chunk of markup.
#
# The content is scanned with ASSET_REGEX, whose three capture groups hold,
# in order: the value of an href/src/data-src/data-url attribute, the target
# of a CSS url(...), and the value of a srcset attribute. Exactly one group
# is set per match.
#
# A srcset value is a comma-separated list of "URL descriptor" candidates
# (e.g. "a.jpg 1x, b.jpg 2x" or "a.jpg 480w, b.jpg 800w"); only the URL of
# each candidate is kept. Which group matched decides whether a value is a
# srcset list, so ordinary href/src values containing commas are left intact.
#
# @param html_content [String] markup to scan
# @return [Array<String>] references accepted by valid_asset?, without
#   duplicates, in order of first appearance
def self.extract(html_content)
  assets = []

  html_content.scan(ASSET_REGEX) do |attribute_value, css_url, srcset|
    candidates =
      if srcset
        # Empty candidates (stray or doubled commas) produce nil and are dropped.
        # NOTE: URLs that themselves contain commas are still split here.
        srcset.split(',').map { |candidate| candidate.split.first }.compact
      else
        [attribute_value || css_url].compact
      end

    candidates.each { |url| assets << url if valid_asset?(url) }
  end

  assets.uniq
end
.valid_asset?(url) ⇒ Boolean
28 29 30 31 32 |
# File 'lib/wayback_machine_downloader/page_requisites.rb', line 28

# Tells whether an extracted reference should be kept as an asset.
#
# Rejected: nil, blank strings, and references that start with "data:",
# "mailto:", "javascript:" or "#" (a same-page anchor). The check is made on
# the whitespace-stripped value and ignores letter case, because attribute
# values may carry surrounding whitespace and URI schemes are
# case-insensitive.
#
# @param url [String, nil] candidate reference
# @return [Boolean] true when the reference is kept
def self.valid_asset?(url)
  return false if url.nil?

  candidate = url.strip
  return false if candidate.empty?

  !candidate.downcase.start_with?('data:', 'mailto:', '#', 'javascript:')
end