Class: Archaeo::BulkDownloader

Inherits:
Object
Defined in:
lib/archaeo/bulk_downloader.rb

Overview

Downloads all archived snapshots of a URL with resume support.

Queries the CDX API for matching snapshots, fetches each page, and saves content to disk. Progress is tracked in a state file for interrupted download recovery.

Instance Method Summary collapse

Constructor Details

#initialize(client: HttpClient.new, output_dir: "archive", cdx_api: nil, concurrency: 1, on_error: nil, rate_limiter: nil, path_sanitizer: nil) ⇒ BulkDownloader

Returns a new instance of BulkDownloader.



17
18
19
20
21
22
23
24
25
26
27
# File 'lib/archaeo/bulk_downloader.rb', line 17

# Builds a downloader with injectable collaborators.
#
# @param client [HttpClient] HTTP transport used to fetch pages
# @param output_dir [String] directory where snapshots are written
# @param cdx_api [Object, nil] CDX API client (nil uses the default lookup path)
# @param concurrency [Integer] worker count; coerced to an integer and floored at 1
# @param on_error [Proc, nil] callback invoked on per-snapshot failures
# @param rate_limiter [RateLimiter, nil] throttle; a fresh RateLimiter when nil
# @param path_sanitizer [PathSanitizer, nil] filename cleaner; a fresh PathSanitizer when nil
def initialize(client: HttpClient.new, output_dir: "archive",
               cdx_api: nil, concurrency: 1, on_error: nil,
               rate_limiter: nil, path_sanitizer: nil)
  # Clamp to a sane minimum so 0 or negative values still run one worker.
  workers = concurrency.to_i
  @concurrency = workers < 1 ? 1 : workers
  @client = client
  @output_dir = output_dir
  @cdx_api = cdx_api
  @on_error = on_error
  # Fall back to default collaborators only when none were injected.
  @rate_limiter = rate_limiter || RateLimiter.new
  @path_sanitizer = path_sanitizer || PathSanitizer.new
end

Instance Method Details

#download(url, from: nil, to: nil, resume: false, dry_run: false, all_timestamps: false, filter: nil, page_requisites: false, snapshot_at: nil, max_snapshots: nil, strategy: nil, &block) ⇒ Object



29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
# File 'lib/archaeo/bulk_downloader.rb', line 29

# Downloads every matching snapshot of +url+ and returns a summary.
#
# Pipeline: normalize the URL, query the CDX index, narrow the candidate
# set (filter, scheduling strategy, optional cap), then hand the batch to
# the download runner and summarize the outcome.
#
# @param url [String] target URL (normalized before lookup)
# @param from [String, nil] earliest timestamp bound for the CDX query
# @param to [String, nil] latest timestamp bound for the CDX query
# @param resume [Boolean] reuse prior progress from the state file
# @param dry_run [Boolean] when true, nothing is written to disk
# @param all_timestamps [Boolean] keep every capture instead of deduping
# @param filter [Object, nil] predicate applied via #apply_filter
# @param page_requisites [Boolean] also fetch page assets
# @param snapshot_at [String, nil] pick the capture closest to this time
# @param max_snapshots [Integer, nil] hard cap on snapshots processed
# @param strategy [Object, nil] ordering strategy for #schedule_snapshots
# @return [Object] summary built by #build_summary
def download(url, from: nil, to: nil, resume: false,
             dry_run: false, all_timestamps: false,
             filter: nil, page_requisites: false,
             snapshot_at: nil, max_snapshots: nil,
             strategy: nil, &block)
  # Monotonic clock so the elapsed time in the summary is immune to
  # wall-clock adjustments.
  started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)
  url = UrlNormalizer.normalize(url)
  FileUtils.mkdir_p(@output_dir) unless dry_run

  # Narrow the candidate set step by step: filter, schedule, then cap.
  batch = fetch_snapshots(url, from: from, to: to,
                               all_timestamps: all_timestamps,
                               snapshot_at: snapshot_at)
          .then { |snaps| apply_filter(snaps, filter) }
          .then { |snaps| schedule_snapshots(snaps, strategy) }
          .then { |snaps| max_snapshots ? snaps.first(max_snapshots) : snaps }

  downloaded, skipped, bytes, failed =
    run_download(batch, resume, dry_run, page_requisites, block)

  build_summary(started_at, batch.size, downloaded,
                skipped, bytes, failed: failed)
end