Class: WaybackMachineDownloader
- Inherits: Object
  (ancestry: Object → WaybackMachineDownloader)
- Includes:
- ArchiveAPI, SubdomainProcessor, URLRewrite
- Defined in:
- lib/wayback_machine_downloader.rb
Constant Summary collapse
- VERSION = "2.4.7"
- DEFAULT_TIMEOUT = 30
- MAX_RETRIES = 3
- RETRY_DELAY = 2
- RATE_LIMIT = 0.25
  Delay between requests in seconds
- CONNECTION_POOL_SIZE = 10
- MEMORY_BUFFER_SIZE = 16384
  16KB chunks
- STATE_CDX_FILENAME = ".cdx.json"
- STATE_DB_FILENAME = ".downloaded.txt"
Constants included from URLRewrite
Instance Attribute Summary collapse
-
#all ⇒ Object
Returns the value of attribute all.
-
#all_timestamps ⇒ Object
Returns the value of attribute all_timestamps.
-
#base_url ⇒ Object
Returns the value of attribute base_url.
-
#directory ⇒ Object
Returns the value of attribute directory.
-
#exact_url ⇒ Object
Returns the value of attribute exact_url.
-
#exclude_filter ⇒ Object
Returns the value of attribute exclude_filter.
-
#from_timestamp ⇒ Object
Returns the value of attribute from_timestamp.
-
#keep ⇒ Object
Returns the value of attribute keep.
-
#keep_duplicates ⇒ Object
Returns the value of attribute keep_duplicates.
-
#logger ⇒ Object
Returns the value of attribute logger.
-
#maximum_pages ⇒ Object
Returns the value of attribute maximum_pages.
-
#only_filter ⇒ Object
Returns the value of attribute only_filter.
-
#page_requisites ⇒ Object
Returns the value of attribute page_requisites.
-
#reset ⇒ Object
Returns the value of attribute reset.
-
#rewrite ⇒ Object
Returns the value of attribute rewrite.
-
#snapshot_at ⇒ Object
Returns the value of attribute snapshot_at.
-
#threads_count ⇒ Object
Returns the value of attribute threads_count.
-
#to_timestamp ⇒ Object
Returns the value of attribute to_timestamp.
Instance Method Summary collapse
- #append_to_db(file_id) ⇒ Object
- #backup_name ⇒ Object
- #backup_path ⇒ Object
- #cdx_path ⇒ Object
- #color(text, color_code) ⇒ Object
- #db_path ⇒ Object
- #download_file(file_remote_info, http) ⇒ Object
- #download_files ⇒ Object
- #file_list_by_timestamp ⇒ Object
- #file_queue ⇒ Object
- #get_all_snapshots_to_consider ⇒ Object
-
#get_composite_snapshot_file_list(target_timestamp) ⇒ Object
Get a composite snapshot file list for a specific timestamp.
- #get_file_list_all_timestamps ⇒ Object
- #get_file_list_by_timestamp ⇒ Object
-
#get_file_list_composite_snapshot(target_timestamp) ⇒ Object
Returns a list of files for the composite snapshot.
- #get_file_list_curated ⇒ Object
- #handle_reset ⇒ Object
-
#initialize(params) ⇒ WaybackMachineDownloader
constructor
A new instance of WaybackMachineDownloader.
- #list_files ⇒ Object
- #load_downloaded_ids ⇒ Object
-
#local_path_for_file_id(file_id) ⇒ Object
derive the local filesystem path for a sanitized `file_id` stored in the DB.
- #match_exclude_filter(file_url) ⇒ Object
- #match_only_filter(file_url) ⇒ Object
- #process_page_requisites(file_path, parent_remote_info) ⇒ Object
- #process_single_file(file_remote_info) ⇒ Object
- #processing_files(pool, files_to_process) ⇒ Object
- #rewrite_local_files ⇒ Object
- #rewrite_urls_to_relative(file_path) ⇒ Object
- #structure_dir_path(dir_path) ⇒ Object
-
#submit_download_job(file_remote_info) ⇒ Object
helper to submit jobs and increment the counter.
Methods included from URLRewrite
#rewrite_css_urls, #rewrite_html_attr_urls, #rewrite_js_urls
Methods included from SubdomainProcessor
Methods included from ArchiveAPI
#get_raw_list_from_api, #parameters_for_api
Constructor Details
#initialize(params) ⇒ WaybackMachineDownloader
Returns a new instance of WaybackMachineDownloader.
149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 |
# File 'lib/wayback_machine_downloader.rb', line 149 def initialize params validate_params(params) @base_url = params[:base_url]&.tidy_bytes @exact_url = params[:exact_url] if params[:directory] sanitized_dir = params[:directory].tidy_bytes @directory = File.expand_path(sanitized_dir) else @directory = nil end @all_timestamps = params[:all_timestamps] @from_timestamp = params[:from_timestamp].to_i @to_timestamp = params[:to_timestamp].to_i @only_filter = params[:only_filter] @exclude_filter = params[:exclude_filter] @all = params[:all] @keep_duplicates = params[:keep_duplicates] || false @maximum_pages = params[:maximum_pages] ? params[:maximum_pages].to_i : 100 @threads_count = [params[:threads_count].to_i, 1].max @rewritten = params[:rewritten] @reset = params[:reset] @keep = params[:keep] @timeout = params[:timeout] || DEFAULT_TIMEOUT @logger = setup_logger @failed_downloads = Concurrent::Array.new @connection_pool = ConnectionPool.new(CONNECTION_POOL_SIZE) @db_mutex = Mutex.new @rewrite = params[:rewrite] || false @recursive_subdomains = params[:recursive_subdomains] || false @subdomain_depth = params[:subdomain_depth] || 1 @snapshot_at = params[:snapshot_at] ? params[:snapshot_at].to_i : nil @max_retries = params[:max_retries] ?
params[:max_retries].to_i : MAX_RETRIES @page_requisites = params[:page_requisites] || false @pending_jobs = Concurrent::AtomicFixnum.new(0) # URL for rejecting invalid/unencoded wayback urls @url_regexp = /^(([A-Za-z][A-Za-z0-9+.-]*):((\/\/(((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=]))+)(:([0-9]*))?)(((\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)))|((\/(((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)+)(\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)?))|((((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)+)(\/((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)*))*)))(\?((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)|\/|\?)*)?(\#((([A-Za-z0-9._~-])|(%[ABCDEFabcdef0-9][ABCDEFabcdef0-9])|([!$&'('')'*+,;=])|:|@)|\/|\?)*)?)$/ handle_reset end |
Instance Attribute Details
#all ⇒ Object
Returns the value of attribute all.
144 145 146 |
# File 'lib/wayback_machine_downloader.rb', line 144 def all @all end |
#all_timestamps ⇒ Object
Returns the value of attribute all_timestamps.
144 145 146 |
# File 'lib/wayback_machine_downloader.rb', line 144 def all_timestamps @all_timestamps end |
#base_url ⇒ Object
Returns the value of attribute base_url.
144 145 146 |
# File 'lib/wayback_machine_downloader.rb', line 144 def base_url @base_url end |
#directory ⇒ Object
Returns the value of attribute directory.
144 145 146 |
# File 'lib/wayback_machine_downloader.rb', line 144 def directory @directory end |
#exact_url ⇒ Object
Returns the value of attribute exact_url.
144 145 146 |
# File 'lib/wayback_machine_downloader.rb', line 144 def exact_url @exact_url end |
#exclude_filter ⇒ Object
Returns the value of attribute exclude_filter.
144 145 146 |
# File 'lib/wayback_machine_downloader.rb', line 144 def exclude_filter @exclude_filter end |
#from_timestamp ⇒ Object
Returns the value of attribute from_timestamp.
144 145 146 |
# File 'lib/wayback_machine_downloader.rb', line 144 def from_timestamp @from_timestamp end |
#keep ⇒ Object
Returns the value of attribute keep.
144 145 146 |
# File 'lib/wayback_machine_downloader.rb', line 144 def keep @keep end |
#keep_duplicates ⇒ Object
Returns the value of attribute keep_duplicates.
144 145 146 |
# File 'lib/wayback_machine_downloader.rb', line 144 def keep_duplicates @keep_duplicates end |
#logger ⇒ Object
Returns the value of attribute logger.
144 145 146 |
# File 'lib/wayback_machine_downloader.rb', line 144 def logger @logger end |
#maximum_pages ⇒ Object
Returns the value of attribute maximum_pages.
144 145 146 |
# File 'lib/wayback_machine_downloader.rb', line 144 def maximum_pages @maximum_pages end |
#only_filter ⇒ Object
Returns the value of attribute only_filter.
144 145 146 |
# File 'lib/wayback_machine_downloader.rb', line 144 def only_filter @only_filter end |
#page_requisites ⇒ Object
Returns the value of attribute page_requisites.
144 145 146 |
# File 'lib/wayback_machine_downloader.rb', line 144 def page_requisites @page_requisites end |
#reset ⇒ Object
Returns the value of attribute reset.
144 145 146 |
# File 'lib/wayback_machine_downloader.rb', line 144 def reset @reset end |
#rewrite ⇒ Object
Returns the value of attribute rewrite.
144 145 146 |
# File 'lib/wayback_machine_downloader.rb', line 144 def rewrite @rewrite end |
#snapshot_at ⇒ Object
Returns the value of attribute snapshot_at.
144 145 146 |
# File 'lib/wayback_machine_downloader.rb', line 144 def snapshot_at @snapshot_at end |
#threads_count ⇒ Object
Returns the value of attribute threads_count.
144 145 146 |
# File 'lib/wayback_machine_downloader.rb', line 144 def threads_count @threads_count end |
#to_timestamp ⇒ Object
Returns the value of attribute to_timestamp.
144 145 146 |
# File 'lib/wayback_machine_downloader.rb', line 144 def to_timestamp @to_timestamp end |
Instance Method Details
#append_to_db(file_id) ⇒ Object
547 548 549 550 551 552 553 554 555 556 |
# File 'lib/wayback_machine_downloader.rb', line 547 def append_to_db(file_id) @db_mutex.synchronize do begin FileUtils.mkdir_p(File.dirname(db_path)) File.open(db_path, 'a') { |f| f.puts(file_id) } rescue => e @logger.error("Failed to append downloaded file ID #{file_id} to #{db_path}: #{e.message}") end end end |
#backup_name ⇒ Object
190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 |
# File 'lib/wayback_machine_downloader.rb', line 190 def backup_name url_to_process = @base_url url_to_process = url_to_process.chomp('/*') if url_to_process&.end_with?('/*') raw = if url_to_process.include?('//') url_to_process.split('/')[2] else url_to_process end # if it looks like a wildcard pattern, normalize to a safe host-ish name if raw&.start_with?('*.') raw = raw.sub(/\A\*\./, 'all-') end # sanitize for Windows (and safe cross-platform) to avoid ENOTDIR on mkdir (colon in host:port) if Gem.win_platform? raw = raw.gsub(/[:*?"<>|]/, '_') raw = raw.gsub(/[ .]+\z/, '') else # still good practice to strip path separators (and maybe '*' for POSIX too) raw = raw.gsub(/[\/:*?"<>|]/, '_') end raw = 'site' if raw.nil? || raw.empty? raw end |
#backup_path ⇒ Object
218 219 220 221 222 223 224 225 226 227 |
# File 'lib/wayback_machine_downloader.rb', line 218 def backup_path if @directory # because @directory is already an absolute path, we just ensure it exists @directory else # ensure the default path is absolute and normalized cwd = Dir.pwd File.(File.join(cwd, 'websites', backup_name)) end end |
#cdx_path ⇒ Object
229 230 231 |
# File 'lib/wayback_machine_downloader.rb', line 229 def cdx_path File.join(backup_path, STATE_CDX_FILENAME) end |
#color(text, color_code) ⇒ Object
1038 1039 1040 1041 1042 |
# File 'lib/wayback_machine_downloader.rb', line 1038 def color(text, color_code) return text if Gem.win_platform? && !ENV['ENABLE_ANSI'] codes = { red: 31, green: 32, yellow: 33, blue: 34, magenta: 35, cyan: 36, white: 37 } "\e[#{codes[color_code]}m#{text}\e[0m" end |
#db_path ⇒ Object
233 234 235 |
# File 'lib/wayback_machine_downloader.rb', line 233 def db_path File.join(backup_path, STATE_DB_FILENAME) end |
#download_file(file_remote_info, http) ⇒ Object
948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 |
# File 'lib/wayback_machine_downloader.rb', line 948 def download_file (file_remote_info, http) current_encoding = "".encoding file_url = file_remote_info[:file_url].encode(current_encoding) file_id = file_remote_info[:file_id] = file_remote_info[:timestamp] # sanitize file_id to ensure it is a valid path component raw_path_elements = file_id.split('/') sanitized_path_elements = raw_path_elements.map do |element| if Gem.win_platform? # for Windows, we need to sanitize path components to avoid invalid characters # this prevents issues with file names that contain characters not allowed in # Windows file systems. See # https://docs.microsoft.com/en-us/windows/win32/fileio/naming-a-file#naming-conventions element.gsub(/[:\*?"<>\|\&\=\/\\]/) { |match| '%' + match.ord.to_s(16).upcase } else element end end current_backup_path = backup_path if file_id == "" dir_path = current_backup_path file_path = File.join(dir_path, 'index.html') elsif file_url[-1] == '/' || (sanitized_path_elements.last && !sanitized_path_elements.last.include?('.')) # if file_id is a directory, we treat it as such dir_path = File.join(current_backup_path, *sanitized_path_elements) file_path = File.join(dir_path, 'index.html') else # if file_id is a file, we treat it as such filename = sanitized_path_elements.pop dir_path = File.join(current_backup_path, *sanitized_path_elements) file_path = File.join(dir_path, filename) end # check existence *before* download attempt # this handles cases where a file was created manually or by a previous partial run without a .db entry if File.exist? 
file_path return ["#{color("[EXISTS]", :cyan)} #{file_url} (#{@processed_file_count + 1}/#{@total_to_download})", file_path] end begin structure_dir_path dir_path status = download_with_retry(file_path, file_url, , http) case status when :saved if @rewrite && File.extname(file_path) =~ /\.(html?|css|js)$/i rewrite_urls_to_relative(file_path) end return ["#{color("[SAVED]", :green)} #{file_url} (#{@processed_file_count + 1}/#{@total_to_download})", file_path] when :skipped_not_found return ["#{color("[NOT FOUND]", :yellow)} #{file_url} (#{@processed_file_count + 1}/#{@total_to_download})", nil] else # ideally, this case should not be reached if download_with_retry behaves as expected. # ideally, this case should not be reached if download_with_retry behaves as expected. return ["#{color("[UNKNOWN]", :magenta)} #{file_url} (#{@processed_file_count + 1}/#{@total_to_download})", nil] end rescue StandardError => e msg = "#{color("[FAILED]", :red)} #{file_url} # #{e} (#{@processed_file_count + 1}/#{@total_to_download})" if File.exist?(file_path) and File.size(file_path) == 0 File.delete(file_path) msg += "\n#{file_path} was empty and was removed." end return [msg, nil] end end |
#download_files ⇒ Object
591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 |
# File 'lib/wayback_machine_downloader.rb', line 591 def download_files start_time = Time.now puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine archives." FileUtils.mkdir_p(backup_path) # Load the list of files to potentially download files_to_download = if files_to_download.empty? puts "No files found matching criteria." cleanup return end total_files = files_to_download.count puts "#{total_files} files found matching criteria." # Load IDs of already downloaded files downloaded_ids = load_downloaded_ids # We use a thread-safe Set to track what we have queued/downloaded in this session # to avoid infinite loops with page requisites @session_downloaded_ids = Concurrent::Set.new downloaded_ids.each { |id| @session_downloaded_ids.add(id) } files_to_process = files_to_download.reject do |file_info| downloaded_ids.include?(file_info[:file_id]) end remaining_count = files_to_process.count skipped_count = total_files - remaining_count if skipped_count > 0 puts "Found #{skipped_count} previously downloaded files, skipping them." end if remaining_count == 0 && !@page_requisites puts "All matching files have already been downloaded." cleanup return end puts "#{remaining_count} files to download." @processed_file_count = 0 @total_to_download = remaining_count @download_mutex = Mutex.new thread_count = [@threads_count, CONNECTION_POOL_SIZE].min @worker_pool = Concurrent::FixedThreadPool.new(thread_count) # initial batch files_to_process.each do |file_remote_info| @session_downloaded_ids.add(file_remote_info[:file_id]) submit_download_job(file_remote_info) end # print a header for the download phase puts "\n#{color("Processing downloads:", :white)}" $stdout.flush # wait for all jobs to finish loop do sleep 0.5 break if @pending_jobs.value == 0 end @worker_pool.shutdown @worker_pool.wait_for_termination end_time = Time.now puts "\nDownload finished in #{(end_time - start_time).round(2)}s." 
# process subdomains if enabled if @recursive_subdomains subdomain_start_time = Time.now process_subdomains subdomain_end_time = Time.now subdomain_time = (subdomain_end_time - subdomain_start_time).round(2) puts "Subdomain processing finished in #{subdomain_time}s." end puts "Results saved in #{backup_path}" cleanup end |
#file_list_by_timestamp ⇒ Object
1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 |
# File 'lib/wayback_machine_downloader.rb', line 1048 def file_list_by_timestamp if @snapshot_at @file_list_by_snapshot_at ||= get_composite_snapshot_file_list(@snapshot_at) elsif @all_timestamps file_list_curated = get_file_list_all_timestamps file_list_curated.map do |file_remote_info| file_remote_info[1][:file_id] = file_remote_info[0] file_remote_info[1] end else file_list_curated = get_file_list_curated file_list_curated = file_list_curated.sort_by { |_,v| v[:timestamp].to_s }.reverse file_list_curated.map do |file_remote_info| file_remote_info[1][:file_id] = file_remote_info[0] file_remote_info[1] end end end |
#file_queue ⇒ Object
1044 1045 1046 |
# File 'lib/wayback_machine_downloader.rb', line 1044 def file_queue @file_queue ||= file_list_by_timestamp.each_with_object(Queue.new) { |file_info, q| q << file_info } end |
#get_all_snapshots_to_consider ⇒ Object
272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 |
# File 'lib/wayback_machine_downloader.rb', line 272 def get_all_snapshots_to_consider if File.exist?(cdx_path) && !@reset puts "Loading snapshot list from #{cdx_path}" begin snapshot_list_to_consider = JSON.parse(File.read(cdx_path)) puts "Loaded #{snapshot_list_to_consider.length} snapshots from cache." puts return Concurrent::Array.new(snapshot_list_to_consider) rescue JSON::ParserError => e puts "Error reading snapshot cache file #{cdx_path}: #{e.}. Refetching..." FileUtils.rm_f(cdx_path) rescue => e puts "Error loading snapshot cache #{cdx_path}: #{e.}. Refetching..." FileUtils.rm_f(cdx_path) end end snapshot_list_to_consider = Concurrent::Array.new mutex = Mutex.new # if snapshot_at is set, limit CDX queries to snapshots at or before that timestamp original_to = @to_timestamp if @snapshot_at @to_timestamp = @snapshot_at end puts "Getting snapshot pages from Wayback Machine API..." # Fetch the initial set of snapshots, sequentially @connection_pool.with_connection do |connection| initial_list = get_raw_list_from_api(@base_url, 0, connection) initial_list ||= [] mutex.synchronize do snapshot_list_to_consider.concat(initial_list) print "." $stdout.flush end end # Fetch additional pages if the exact URL flag is not set and the first page wasn't empty unless @exact_url || snapshot_list_to_consider.empty? 
page_index = 1 batch_size = [@threads_count, 5].min continue_fetching = true fetch_pool = Concurrent::FixedThreadPool.new([@threads_count, 1].max) begin while continue_fetching && page_index < @maximum_pages # Determine the range of pages to fetch in this batch end_index = [page_index + batch_size, @maximum_pages].min current_batch = (page_index...end_index).to_a # Create futures for concurrent API calls futures = current_batch.map do |page| Concurrent::Future.execute(executor: fetch_pool) do result = nil @connection_pool.with_connection do |connection| result = get_raw_list_from_api(@base_url, page, connection) end result ||= [] [page, result] end end results = [] futures.each do |future| begin val = future.value # only append if valid if val && val.is_a?(Array) && val.first.is_a?(Integer) results << val end rescue => e puts "\nError fetching page #{future}: #{e.}" end end # Sort results by page number to maintain order results.sort_by! { |page, _| page } # Process results and check for empty pages results.each do |page, result| if result.nil? || result.empty? continue_fetching = false break else mutex.synchronize do snapshot_list_to_consider.concat(result) print "." $stdout.flush end end end page_index = end_index sleep(RATE_LIMIT) if continue_fetching end ensure fetch_pool.shutdown fetch_pool.wait_for_termination end end puts " found #{snapshot_list_to_consider.length} snapshots." # save the fetched list to the cache file begin FileUtils.mkdir_p(File.dirname(cdx_path)) File.write(cdx_path, JSON.pretty_generate(snapshot_list_to_consider.to_a)) # Convert Concurrent::Array back to Array for JSON puts "Saved snapshot list to #{cdx_path}" rescue => e puts "Error saving snapshot cache to #{cdx_path}: #{e.}" ensure # restore any previously set to-timestamp @to_timestamp = original_to end puts snapshot_list_to_consider end |
#get_composite_snapshot_file_list(target_timestamp) ⇒ Object
Get a composite snapshot file list for a specific timestamp
395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 |
# File 'lib/wayback_machine_downloader.rb', line 395 def get_composite_snapshot_file_list(target_timestamp) file_versions = {} get_all_snapshots_to_consider.each do |timestamp, file_url| next unless file_url.include?('/') next if timestamp.to_i > target_timestamp # allow empty path by treating missing tail as empty string raw_tail = file_url.split('/')[3..-1]&.join('/') || '' file_id = sanitize_and_prepare_id(raw_tail, file_url) next if file_id.nil? next if match_exclude_filter(file_url) next unless match_only_filter(file_url) if !file_versions[file_id] || file_versions[file_id][:timestamp].to_i < timestamp.to_i file_versions[file_id] = { file_url: file_url, timestamp: timestamp, file_id: file_id } end end file_versions.values end |
#get_file_list_all_timestamps ⇒ Object
453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 |
# File 'lib/wayback_machine_downloader.rb', line 453 def get_file_list_all_timestamps file_list_curated = Hash.new get_all_snapshots_to_consider.each do |timestamp, file_url| next unless file_url.include?('/') raw_tail = file_url.split('/')[3..-1]&.join('/') || '' file_id = sanitize_and_prepare_id(raw_tail, file_url) if file_id.nil? puts "Malformed file url, ignoring: #{file_url}" next end file_id_and_timestamp = [timestamp, file_id].join('/') file_id_and_timestamp = sanitize_and_prepare_id(file_id_and_timestamp, file_url) if file_id_and_timestamp.nil? puts "Malformed file id/timestamp combo, ignoring: #{file_url}" next end if file_id_and_timestamp.include?('<') || file_id_and_timestamp.include?('>') puts "Invalid characters in file_id after sanitization, ignoring: #{file_url}" else if match_exclude_filter(file_url) puts "File url matches exclude filter, ignoring: #{file_url}" elsif !match_only_filter(file_url) puts "File url doesn't match only filter, ignoring: #{file_url}" elsif file_list_curated[file_id_and_timestamp] # duplicate combo, ignore silently (verbose flag not shown here) else file_list_curated[file_id_and_timestamp] = { file_url: file_url, timestamp: timestamp } end end end puts "file_list_curated: " + file_list_curated.count.to_s file_list_curated end |
#get_file_list_by_timestamp ⇒ Object
491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 |
# File 'lib/wayback_machine_downloader.rb', line 491 def get_file_list_by_timestamp if @snapshot_at @file_list_by_snapshot_at ||= get_composite_snapshot_file_list(@snapshot_at) elsif @all_timestamps file_list_curated = get_file_list_all_timestamps file_list_curated.map do |file_remote_info| file_remote_info[1][:file_id] = file_remote_info[0] file_remote_info[1] end else file_list_curated = get_file_list_curated file_list_curated = file_list_curated.sort_by { |_,v| v[:timestamp].to_s }.reverse file_list_curated.map do |file_remote_info| file_remote_info[1][:file_id] = file_remote_info[0] file_remote_info[1] end end end |
#get_file_list_composite_snapshot(target_timestamp) ⇒ Object
Returns a list of files for the composite snapshot
416 417 418 419 420 |
# File 'lib/wayback_machine_downloader.rb', line 416 def get_file_list_composite_snapshot(target_timestamp) file_list = get_composite_snapshot_file_list(target_timestamp) # return a list sorted newest->oldest by timestamp file_list.sort_by { |v| v[:timestamp].to_s }.reverse end |
#get_file_list_curated ⇒ Object
422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 |
# File 'lib/wayback_machine_downloader.rb', line 422 def get_file_list_curated file_list_curated = Hash.new get_all_snapshots_to_consider.each do |timestamp, file_url| next unless file_url.include?('/') raw_tail = file_url.split('/')[3..-1]&.join('/') || '' file_id = sanitize_and_prepare_id(raw_tail, file_url) if file_id.nil? puts "Malformed file url, ignoring: #{file_url}" next end if file_id.include?('<') || file_id.include?('>') puts "Invalid characters in file_id after sanitization, ignoring: #{file_url}" else if match_exclude_filter(file_url) puts "File url matches exclude filter, ignoring: #{file_url}" elsif !match_only_filter(file_url) puts "File url doesn't match only filter, ignoring: #{file_url}" elsif file_list_curated[file_id] unless file_list_curated[file_id][:timestamp] > timestamp file_list_curated[file_id] = { file_url: file_url, timestamp: timestamp } end else file_list_curated[file_id] = { file_url: file_url, timestamp: timestamp } end end end file_list_curated end |
#handle_reset ⇒ Object
237 238 239 240 241 242 243 244 |
# File 'lib/wayback_machine_downloader.rb', line 237 def handle_reset if @reset puts "Resetting download state..." FileUtils.rm_f(cdx_path) FileUtils.rm_f(db_path) puts "Removed state files: #{cdx_path}, #{db_path}" end end |
#list_files ⇒ Object
510 511 512 513 514 515 516 517 518 519 520 521 522 |
# File 'lib/wayback_machine_downloader.rb', line 510 def list_files # retrieval produces its own output @orig_stdout = $stdout $stdout = $stderr files = get_file_list_by_timestamp $stdout = @orig_stdout puts "[" files[0...-1].each do |file| puts file.to_json + "," end puts files[-1].to_json puts "]" end |
#load_downloaded_ids ⇒ Object
524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 |
# File 'lib/wayback_machine_downloader.rb', line 524 def load_downloaded_ids downloaded_ids = Set.new if File.exist?(db_path) && !@reset puts "Loading list of already downloaded files from #{db_path}" begin File.foreach(db_path) do |line| id = line.strip # only trust DB entries that actually exist on disk; this helps when resuming path = local_path_for_file_id(id) if path && File.exist?(path) downloaded_ids.add(id) else puts "Found DB entry but file missing, will requeue: #{id}" if @logger && @logger.level == Logger::DEBUG end end rescue => e puts "Error reading downloaded files list #{db_path}: #{e.message}. Assuming no files downloaded." downloaded_ids.clear end end downloaded_ids end |
#local_path_for_file_id(file_id) ⇒ Object
derive the local filesystem path for a sanitized ‘file_id` stored in the DB
1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 |
# File 'lib/wayback_machine_downloader.rb', line 1018 def local_path_for_file_id(file_id) return nil if file_id.nil? current_backup_path = backup_path # file_id coming from DB is expected to already be sanitized raw_path_elements = file_id.split('/') if file_id == "" dir_path = current_backup_path return File.join(dir_path, 'index.html') elsif file_id[-1] == '/' || (raw_path_elements.last && !raw_path_elements.last.include?('.')) dir_path = File.join(current_backup_path, *raw_path_elements) return File.join(dir_path, 'index.html') else filename = raw_path_elements.pop dir_path = File.join(current_backup_path, *raw_path_elements) return File.join(dir_path, filename) end end |
#match_exclude_filter(file_url) ⇒ Object
259 260 261 262 263 264 265 266 267 268 269 270 |
# File 'lib/wayback_machine_downloader.rb', line 259 def match_exclude_filter file_url if @exclude_filter exclude_filter_regex = @exclude_filter.to_regex(detect: true) if exclude_filter_regex exclude_filter_regex =~ file_url else file_url.downcase.include? @exclude_filter.downcase end else false end end |
#match_only_filter(file_url) ⇒ Object
246 247 248 249 250 251 252 253 254 255 256 257 |
# File 'lib/wayback_machine_downloader.rb', line 246 def match_only_filter file_url if @only_filter only_filter_regex = @only_filter.to_regex(detect: true) if only_filter_regex only_filter_regex =~ file_url else file_url.downcase.include? @only_filter.downcase end else true end end |
#process_page_requisites(file_path, parent_remote_info) ⇒ Object
736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 |
# File 'lib/wayback_machine_downloader.rb', line 736 def process_page_requisites(file_path, parent_remote_info) return unless File.exist?(file_path) content = File.read(file_path) content = content.force_encoding('UTF-8').scrub assets = PageRequisites.extract(content) # prepare base URI for resolving relative paths parent_raw = parent_remote_info[:file_url] parent_raw = "http://#{parent_raw}" unless parent_raw.match?(/^https?:\/\//) begin base_uri = URI(parent_raw) # calculate the "root" host of the site we are downloading to compare later current_project_host = URI("http://" + @base_url.gsub(%r{^https?://}, '')).host rescue URI::InvalidURIError return end = parent_remote_info[:timestamp] assets.each do |asset_rel_url| begin # resolve full URL (handles relative paths like "../img/logo.png") resolved_uri = base_uri + asset_rel_url # detect if the asset URL is already a Wayback "web/<timestamp>/.../https://..." embed = if resolved_uri.path =~ %r{\A/web/([0-9]{4,})[^/]*/(https?://.+)\z} = $1 begin orig_uri = URI($2) resolved_uri = orig_uri = .to_i rescue URI::InvalidURIError # fall back to original resolved_uri and parent timestamp end end # filter out navigation links (pages) vs assets # skip if extension is empty or looks like an HTML page path = resolved_uri.path ext = File.extname(path).downcase if ext.empty? || ['.html', '.htm', '.php', '.asp', '.aspx'].include?(ext) next end # construct the original URL to query the Wayback API asset_wbm_url = if resolved_uri.scheme "#{resolved_uri.scheme}://#{resolved_uri.host}#{resolved_uri.path}" else "#{resolved_uri.host}#{resolved_uri.path}" end asset_wbm_url += "?#{resolved_uri.query}" if resolved_uri.query # attempt to find the best snapshot timestamp for this asset to increase hit rate begin @connection_pool.with_connection do |connection| snapshots = get_raw_list_from_api(asset_wbm_url, 0, connection) if snapshots && snapshots.any? 
# choose the most recent snapshot at or before the # parent page timestamp, else the latest available chosen = snapshots.select { |ts, _| ts.to_i <= .to_i }.max_by { |s| s[0].to_i } || snapshots.max_by { |s| s[0].to_i } = chosen[0].to_i if chosen end end rescue => e @logger.warn("Failed to query CDX for #{asset_wbm_url}: #{e.}") if @logger end # construct the local file ID # if the asset is on the SAME domain, strip the domain from the folder path # if it's on a DIFFERENT domain (e.g. cdn.jquery.com), keep the domain folder if resolved_uri.host == current_project_host # e.g. /static/css/style.css asset_file_id = resolved_uri.path asset_file_id = asset_file_id[1..-1] if asset_file_id.start_with?('/') else # e.g. cdn.google.com/jquery.js asset_file_id = asset_wbm_url end rescue URI::InvalidURIError, StandardError next end # sanitize and queue asset_id = sanitize_and_prepare_id(asset_file_id, asset_wbm_url) unless @session_downloaded_ids.include?(asset_id) @session_downloaded_ids.add(asset_id) new_file_info = { file_url: asset_wbm_url, timestamp: , file_id: asset_id } @download_mutex.synchronize do @total_to_download += 1 puts "Queued requisite: #{asset_file_id} (#{asset_wbm_url} @ #{})" end submit_download_job(new_file_info) end end end |
#process_single_file(file_remote_info) ⇒ Object
690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 |
# File 'lib/wayback_machine_downloader.rb', line 690 def process_single_file(file_remote_info) download_success = false downloaded_path = nil # fast-path for resumed runs: if file already exists locally, avoid HTTP work entirely existing_path = local_path_for_file_id(file_remote_info[:file_id]) if existing_path && File.exist?(existing_path) = "#{color("[EXISTS]", :cyan)} #{file_remote_info[:file_url]} (#{@processed_file_count + 1}/#{@total_to_download})" @download_mutex.synchronize do @processed_file_count += 1 if @processed_file_count < @total_to_download puts end append_to_db(file_remote_info[:file_id]) if @page_requisites && File.extname(existing_path) =~ /\.(html?|php|asp|aspx|jsp)$/i process_page_requisites(existing_path, file_remote_info) end return end @connection_pool.with_connection do |connection| , downloaded_path = download_file(file_remote_info, connection) if downloaded_path && File.exist?(downloaded_path) download_success = true end @download_mutex.synchronize do @processed_file_count += 1 if @processed_file_count < @total_to_download # only print if it's a "User" file or a requisite we found puts if end end if download_success append_to_db(file_remote_info[:file_id]) if @page_requisites && downloaded_path && File.extname(downloaded_path) =~ /\.(html?|php|asp|aspx|jsp)$/i process_page_requisites(downloaded_path, file_remote_info) end end rescue => e @logger.error("Error processing file #{file_remote_info[:file_url]}: #{e.}") end |
#processing_files(pool, files_to_process) ⇒ Object
558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 |
# File 'lib/wayback_machine_downloader.rb', line 558 def processing_files(pool, files_to_process) files_to_process.each do |file_remote_info| pool.post do download_success = false begin @connection_pool.with_connection do |connection| , downloaded_path = download_file(file_remote_info, connection) # consider the download successful if we have a downloaded path present if downloaded_path && File.exist?(downloaded_path) download_success = true end @download_mutex.synchronize do @processed_file_count += 1 # adjust progress message to reflect remaining files = .sub(/\(#{@processed_file_count}\/\d+\)/, "(#{@processed_file_count}/#{@total_to_download})") if puts if end end # sppend to DB only after successful download outside the connection block if download_success append_to_db(file_remote_info[:file_id]) end rescue => e @logger.error("Error processing file #{file_remote_info[:file_url]}: #{e.}") @download_mutex.synchronize do @processed_file_count += 1 end end sleep(RATE_LIMIT) end end end |
#rewrite_local_files ⇒ Object
881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 |
# File 'lib/wayback_machine_downloader.rb', line 881 def rewrite_local_files puts "Scanning #{backup_path} for files to rewrite..." files = Dir.glob(File.join(backup_path, "**/*.{html,htm,css,js,php,asp,aspx,jsp}")) puts "Found #{files.size} files. Rewriting links for local browsing..." pool = Concurrent::FixedThreadPool.new(@threads_count) progress = Concurrent::AtomicFixnum.new(0) files.each do |file_path| pool.post do rewrite_urls_to_relative(file_path) current = progress.increment print "\rProgress: #{current}/#{files.size}" if current % 100 == 0 end end pool.shutdown pool.wait_for_termination puts "\nFinished rewriting all files." end |
#rewrite_urls_to_relative(file_path) ⇒ Object
903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 |
# File 'lib/wayback_machine_downloader.rb', line 903 def rewrite_urls_to_relative(file_path) return unless File.exist?(file_path) file_ext = File.extname(file_path).downcase begin content = File.binread(file_path) # detect encoding for HTML files if file_ext == '.html' || file_ext == '.htm' || file_ext == '.php' || file_ext == '.asp' encoding = content.match(/<meta\s+charset=["']?([^"'>]+)/i)&.captures&.first || 'UTF-8' content.force_encoding(encoding) rescue content.force_encoding('UTF-8') else content.force_encoding('UTF-8') end # URLs in HTML attributes content = rewrite_html_attr_urls(content) # URLs in CSS content = rewrite_css_urls(content) # URLs in JavaScript content = rewrite_js_urls(content) # for URLs that start with a single slash, make them relative content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])\/([^"'\/][^"']*)(["'])/i) do prefix, path, suffix = $1, $2, $3 "#{prefix}./#{path}#{suffix}" end # for URLs in CSS that start with a single slash, make them relative content.gsub!(/url\(\s*["']?\/([^"'\)\/][^"'\)]*?)["']?\s*\)/i) do path = $1 "url(\"./#{path}\")" end # save the modified content back to the file File.binwrite(file_path, content) puts "Rewrote URLs in #{file_path} to be relative." rescue Errno::ENOENT => e @logger.warn("Error reading file #{file_path}: #{e.}") end end |
#structure_dir_path(dir_path) ⇒ Object
845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 |
# File 'lib/wayback_machine_downloader.rb', line 845 def structure_dir_path dir_path begin # check if it's already a directory; if not, try to create it FileUtils::mkdir_p dir_path unless File.directory? dir_path rescue Errno::EEXIST, Errno::ENOTDIR => e file_already_existing = nil check_path = dir_path # walk up the path to find the specific file that is blocking directory creation while check_path != "." && check_path != "/" if File.exist?(check_path) && !File.directory?(check_path) file_already_existing = check_path break end parent = File.dirname(check_path) break if parent == check_path check_path = parent end if file_already_existing file_already_existing_temporary = file_already_existing + '.temp' file_already_existing_permanent = file_already_existing + '/index.html' FileUtils::mv file_already_existing, file_already_existing_temporary FileUtils::mkdir_p file_already_existing FileUtils::mv file_already_existing_temporary, file_already_existing_permanent puts "#{file_already_existing} -> #{file_already_existing_permanent}" # retry the directory creation now that the path is clear structure_dir_path dir_path else raise "Unhandled directory restructure error: #{e.}" end end end |
#submit_download_job(file_remote_info) ⇒ Object
helper to submit jobs and increment the counter
679 680 681 682 683 684 685 686 687 688 |
# File 'lib/wayback_machine_downloader.rb', line 679 def submit_download_job(file_remote_info) @pending_jobs.increment @worker_pool.post do begin process_single_file(file_remote_info) ensure @pending_jobs.decrement end end end |