Class: Archaeo::CdxApi

Inherits:
Object
  • Object
show all
Defined in:
lib/archaeo/cdx_api.rb

Overview

Client for the Wayback Machine CDX Server API.

Query archived snapshots by URL, timestamp range, filters, and more. Returns Snapshot objects for each matching CDX record.

Constant Summary collapse

# URL of the Wayback Machine CDX search endpoint.
ENDPOINT =
"https://web.archive.org/cdx/search/cdx"
# Names of the CDX record fields (presumably in the order the server
# returns them -- TODO confirm against the parsing code).
ALL_FIELDS =
%w[
  urlkey timestamp original
  mimetype statuscode digest length
].freeze
# Recognised match type names (presumably the values accepted for the
# match_type option -- confirm in validate_options!).
MATCH_TYPES =
%w[exact prefix host domain].freeze
# Recognised sort order names (presumably the values accepted for the
# sort option -- confirm in validate_options!).
SORT_ORDERS =
%w[default closest reverse].freeze
# Default result limit; where it is applied is not visible in this excerpt.
DEFAULT_LIMIT =
25_000
# Maps Ruby-style option keys to the string names used for them
# (presumably CDX query parameter names, e.g. match_type -> "matchType").
SCALAR_PARAMS =
{
  from: "from",
  to: "to",
  match_type: "matchType",
  sort: "sort",
  limit: "limit",
  closest: "closest",
}.freeze

Instance Method Summary collapse

Constructor Details

#initialize(client: HttpClient.new) ⇒ CdxApi

Returns a new instance of CdxApi.



32
33
34
# File 'lib/archaeo/cdx_api.rb', line 32

# Creates a CDX API client.
#
# @param client [Object] HTTP client stored in @client; a new HttpClient
#   is built when the caller does not supply one
def initialize(client: HttpClient.new)
  @client = client
end

Instance Method Details

#after(url, timestamp:) ⇒ Object

Raises:

  • (NoSnapshotFound) — if no snapshot with a timestamp later than the given one is found



69
70
71
72
73
74
75
76
# File 'lib/archaeo/cdx_api.rb', line 69

# Returns the first snapshot whose timestamp is strictly later than
# +timestamp+.
#
# The query asks #snapshots for results sorted "closest" around the
# coerced timestamp and walks them in the order they are yielded.
#
# @param url [Object] URL passed through to #snapshots
# @param timestamp [Object] anything Timestamp.coerce accepts
# @return [Object] the first yielded snapshot with a later timestamp
# @raise [NoSnapshotFound] when no yielded snapshot is later
def after(url, timestamp:)
  ts = Timestamp.coerce(timestamp)
  candidates = snapshots(url, sort: "closest", closest: ts.to_s)
  later = candidates.find { |candidate| candidate.timestamp > ts }
  return later if later

  raise NoSnapshotFound, "No snapshot found after #{ts} for #{url}"
end

#before(url, timestamp:) ⇒ Object

Raises:

  • (NoSnapshotFound) — if no snapshot with a timestamp earlier than the given one is found



60
61
62
63
64
65
66
67
# File 'lib/archaeo/cdx_api.rb', line 60

# Returns the first snapshot whose timestamp is strictly earlier than
# +timestamp+.
#
# The query asks #snapshots for results sorted "closest" around the
# coerced timestamp and walks them in the order they are yielded.
#
# @param url [Object] URL passed through to #snapshots
# @param timestamp [Object] anything Timestamp.coerce accepts
# @return [Object] the first yielded snapshot with an earlier timestamp
# @raise [NoSnapshotFound] when no yielded snapshot is earlier
def before(url, timestamp:)
  ts = Timestamp.coerce(timestamp)
  candidates = snapshots(url, sort: "closest", closest: ts.to_s)
  earlier = candidates.find { |candidate| candidate.timestamp < ts }
  return earlier if earlier

  raise NoSnapshotFound, "No snapshot found before #{ts} for #{url}"
end

#near(url, timestamp:) ⇒ Object



44
45
46
47
48
49
50
# File 'lib/archaeo/cdx_api.rb', line 44

# Returns the first snapshot yielded for a "closest"-sorted query
# around +timestamp+, limited to a single result.
#
# @param url [Object] URL passed through to #snapshots
# @param timestamp [Object] anything Timestamp.coerce accepts
# @return [Object] the first snapshot yielded by the query
# @raise [NoSnapshotFound] when the query yields nothing
def near(url, timestamp:)
  ts = Timestamp.coerce(timestamp)
  query = { sort: "closest", closest: ts.to_s, limit: 1 }
  nearest = snapshots(url, **query).first
  return nearest if nearest

  raise NoSnapshotFound, "No snapshot found near #{ts} for #{url}"
end

#newest(url) ⇒ Object



56
57
58
# File 'lib/archaeo/cdx_api.rb', line 56

# Looks up the snapshot nearest to the current time, as reported by
# Timestamp.now, by delegating to #near.
#
# @param url [Object] URL passed through to #near
# @return [Object] whatever #near returns
def newest(url)
  current_time = Timestamp.now
  near(url, timestamp: current_time)
end

#oldest(url) ⇒ Object



52
53
54
# File 'lib/archaeo/cdx_api.rb', line 52

# Looks up the snapshot nearest to 1 January 1994 by delegating to
# #near. (Presumably that date predates every capture, so the nearest
# match is the earliest one -- confirm.)
#
# @param url [Object] URL passed through to #near
# @return [Object] whatever #near returns
def oldest(url)
  earliest_reference = Timestamp.new(year: 1994, month: 1, day: 1)
  near(url, timestamp: earliest_reference)
end

#snapshots(url, **options) ⇒ Object



36
37
38
39
40
41
42
# File 'lib/archaeo/cdx_api.rb', line 36

# Builds an Enumerator over the snapshots for +url+.
#
# Options are validated immediately; fetch_snapshots is only invoked
# once the returned Enumerator is iterated.
#
# @param url [Object] URL handed to fetch_snapshots
# @param options [Hash] query options, checked by validate_options!
# @return [Enumerator] yields whatever fetch_snapshots pushes to the yielder
def snapshots(url, **options)
  validate_options!(options)
  Enumerator.new { |yielder| fetch_snapshots(url, options, yielder) }
end