Class: Archaeo::BulkDownloader

Inherits:
Object
  • Object
show all
Defined in:
lib/archaeo/bulk_downloader.rb

Overview

Downloads all archived snapshots of a URL with resume support.

Queries the CDX API for matching snapshots, fetches each page, and saves its content to disk. Progress is recorded in a state file so that an interrupted download can be resumed.

Instance Method Summary collapse

Constructor Details

#initialize(client: HttpClient.new, output_dir: "archive") ⇒ BulkDownloader

Returns a new instance of BulkDownloader.



12
13
14
15
# File 'lib/archaeo/bulk_downloader.rb', line 12

# Stores the collaborators the downloader works with.
#
# @param client [Object] kept in @client; a fresh HttpClient is built when
#   the argument is omitted (presumably used for HTTP requests — confirm
#   against the rest of the class)
# @param output_dir [String] kept in @output_dir; #download creates this
#   directory and hands it to DownloadState. Defaults to "archive".
def initialize(client: HttpClient.new, output_dir: "archive")
  @output_dir = output_dir
  @client = client
end

Instance Method Details

#download(url, from: nil, to: nil, resume: false) ⇒ Object



17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
# File 'lib/archaeo/bulk_downloader.rb', line 17

# Fetches and saves every snapshot found for +url+, marking each one as
# completed in a DownloadState built on the output directory.
#
# @param url [String] address to look up; run through
#   UrlNormalizer.normalize before the snapshot query
# @param from [Object, nil] forwarded unchanged to fetch_snapshots
#   (presumably the start of a date range — confirm)
# @param to [Object, nil] forwarded unchanged to fetch_snapshots
#   (presumably the end of a date range — confirm)
# @param resume [Boolean] when truthy, snapshots whose timestamp the state
#   already reports as completed are skipped without fetching or yielding
# @yield [position, total, snapshot] after a snapshot has been saved and
#   marked completed; position is the snapshot's 1-based place in the full
#   list, so skipped snapshots leave gaps in the sequence
# @return [Object] the value returned by each_with_index on the snapshot
#   collection
def download(url, from: nil, to: nil, resume: false)
  normalized = UrlNormalizer.normalize(url)
  # The directory must exist before the state object is built on top of it.
  FileUtils.mkdir_p(@output_dir)
  progress = DownloadState.new(@output_dir)

  snapshots = fetch_snapshots(normalized, from: from, to: to)
  total = snapshots.size

  snapshots.each_with_index do |snapshot, offset|
    already_done = resume && progress.completed?(snapshot.timestamp)
    next if already_done

    fetch_and_save(snapshot)
    # Marked only after a successful save, so a failure leaves it pending.
    progress.mark_completed(snapshot.timestamp)
    next unless block_given?

    yield(offset + 1, total, snapshot)
  end
end