Class: Archaeo::CdxApi

Inherits:
Object
  • Object
show all
Defined in:
lib/archaeo/cdx_api.rb

Overview

Client for the Wayback Machine CDX Server API.

Supports all CDX features: field selection, filtering with regex, collapsing, resume-key pagination, page-based pagination, closest timestamp match, resolve revisits, and counters.

Constant Summary collapse

# Base URL that CDX queries are sent to; the query string is appended to it.
ENDPOINT =
"https://web.archive.org/cdx/search/cdx"
# Names of the CDX record fields.
# NOTE(review): presumably the complete field set the server can return — confirm.
ALL_FIELDS =
%w[
  urlkey timestamp original
  mimetype statuscode digest length
].freeze
# Values recognised for the match_type option.
# NOTE(review): presumably enforced during option validation — confirm.
MATCH_TYPES =
%w[exact prefix host domain].freeze
# Values recognised for the sort option.
# NOTE(review): presumably enforced during option validation — confirm.
SORT_ORDERS =
%w[default closest reverse].freeze
# NOTE(review): looks like the row limit applied when the caller gives none — confirm where it is used.
DEFAULT_LIMIT =
25_000
# Maps snake_case Ruby option keys to the camelCase query-parameter names
# used on the wire.
# NOTE(review): presumably consumed when scalar options are merged into the
# request parameters — confirm.
SCALAR_PARAMS =
{
  from: "from",
  to: "to",
  match_type: "matchType",
  sort: "sort",
  limit: "limit",
  closest: "closest",
  offset: "offset",
  page: "page",
  page_size: "pageSize",
  fast_latest: "fastLatest",
  resolve_revisits: "resolveRevisits",
  show_dupe_count: "showDupeCount",
  show_skip_count: "showSkipCount",
  last_skip_timestamp: "lastSkipTimestamp",
}.freeze

Instance Method Summary collapse

Constructor Details

#initialize(client: HttpClient.new) ⇒ CdxApi

Returns a new instance of CdxApi.



43
44
45
# File 'lib/archaeo/cdx_api.rb', line 43

# Creates a CDX API client.
#
# @param client [#get] HTTP transport used to issue requests; a new
#   HttpClient is built when the caller does not supply one.
def initialize(client: HttpClient.new)
  # Kept as an instance variable so request methods can reuse the transport.
  @client = client
end

Instance Method Details

#after(url, timestamp:) ⇒ Object

Raises:

  • (NoSnapshotFound) — if no snapshot with a timestamp later than the given one is found



93
94
95
96
97
98
99
100
# File 'lib/archaeo/cdx_api.rb', line 93

# Returns the first snapshot whose timestamp is strictly later than the
# given one. Snapshots are requested with sort "closest" relative to the
# coerced timestamp and scanned in the order they are yielded.
#
# @param url [String] URL to look up (passed through to #snapshots)
# @param timestamp [Object] any value accepted by Timestamp.coerce
# @return [Object] the first yielded snapshot with timestamp > the pivot
# @raise [NoSnapshotFound] when no yielded snapshot is later than the pivot
def after(url, timestamp:)
  pivot = Timestamp.coerce(timestamp)
  candidates = snapshots(url, sort: "closest", closest: pivot.to_s)

  # find stops at the first match, so enumeration ends as early as possible.
  later = candidates.find { |candidate| candidate.timestamp > pivot }
  return later if later

  raise NoSnapshotFound,
        "No snapshot found after #{pivot} for #{url}"
end

#before(url, timestamp:) ⇒ Object

Raises:

  • (NoSnapshotFound) — if no snapshot with a timestamp earlier than the given one is found



84
85
86
87
88
89
90
91
# File 'lib/archaeo/cdx_api.rb', line 84

# Returns the first snapshot whose timestamp is strictly earlier than the
# given one. Snapshots are requested with sort "closest" relative to the
# coerced timestamp and scanned in the order they are yielded.
#
# @param url [String] URL to look up (passed through to #snapshots)
# @param timestamp [Object] any value accepted by Timestamp.coerce
# @return [Object] the first yielded snapshot with timestamp < the pivot
# @raise [NoSnapshotFound] when no yielded snapshot is earlier than the pivot
def before(url, timestamp:)
  pivot = Timestamp.coerce(timestamp)
  candidates = snapshots(url, sort: "closest", closest: pivot.to_s)

  # find stops at the first match, so enumeration ends as early as possible.
  earlier = candidates.find { |candidate| candidate.timestamp < pivot }
  return earlier if earlier

  raise NoSnapshotFound,
        "No snapshot found before #{pivot} for #{url}"
end

#known_urls(domain, match_type: "domain") ⇒ Object

Returns all unique original URLs under a domain.



119
120
121
122
123
# File 'lib/archaeo/cdx_api.rb', line 119

# Lists the distinct original URLs captured under a domain.
#
# The query asks the server to collapse on "urlkey"; the result is
# additionally de-duplicated locally by original URL.
#
# @param domain [String] domain (or URL) to enumerate; normalized first
# @param match_type [String] match type forwarded to #snapshots
# @return [Array] unique original URLs, in first-seen order
def known_urls(domain, match_type: "domain")
  normalized = UrlNormalizer.normalize(domain)
  captures = snapshots(normalized,
                       match_type: match_type,
                       collapse: ["urlkey"])
  captures.map { |capture| capture.original_url }.uniq
end

#near(url, timestamp:) ⇒ Object



62
63
64
65
66
67
68
69
70
71
72
73
74
# File 'lib/archaeo/cdx_api.rb', line 62

# Fetches the single snapshot reported as closest to a timestamp.
#
# @param url [String] URL to look up; normalized before querying
# @param timestamp [Object] any value accepted by Timestamp.coerce
# @return [Object] the first snapshot of a limit-1 "closest" query
# @raise [NoSnapshotFound] when the query yields nothing
# @raise [BlockedSiteError] when the snapshot reports itself as blocked
def near(url, timestamp:)
  target = UrlNormalizer.normalize(url)
  moment = Timestamp.coerce(timestamp)
  closest = snapshots(target,
                      sort: "closest",
                      closest: moment.to_s,
                      limit: 1).first

  if closest.nil?
    raise NoSnapshotFound,
          "No snapshot found near #{moment} for #{target}"
  end
  raise BlockedSiteError, "Site is blocked: #{target}" if closest.blocked?

  closest
end

#newest(url) ⇒ Object



80
81
82
# File 'lib/archaeo/cdx_api.rb', line 80

# Returns the snapshot nearest to the current time, as reported by
# Timestamp.now; delegates to #near and raises whatever #near raises.
#
# @param url [String] URL to look up
# @return [Object] the result of #near for the current timestamp
def newest(url)
  right_now = Timestamp.now
  near(url, timestamp: right_now)
end

#num_pages(url, **options) ⇒ Object

Returns the number of pages for a paginated query.



103
104
105
106
107
108
109
110
111
112
113
114
115
116
# File 'lib/archaeo/cdx_api.rb', line 103

# Asks the CDX endpoint how many pages a paginated query spans.
#
# Sends the normalized URL with showNumPages=true plus any scalar options
# merged in by merge_scalar_params!, then parses the response body.
#
# @param url [String] URL to query; normalized first
# @param options [Hash] extra query options handed to merge_scalar_params!
# @return [Integer] the body parsed with String#to_i (0 if it has no
#   leading digits)
# @raise [Error] when the HTTP status is anything other than 200
def num_pages(url, **options)
  query = {
    "url" => UrlNormalizer.normalize(url),
    "showNumPages" => "true",
  }
  merge_scalar_params!(query, options)

  request_uri = "#{ENDPOINT}?#{URI.encode_www_form(query)}"
  response = @client.get(request_uri)

  if response.status != 200
    raise Error, "CDX API returned HTTP #{response.status}"
  end

  response.body.strip.to_i
end

#oldest(url) ⇒ Object



76
77
78
# File 'lib/archaeo/cdx_api.rb', line 76

# Returns the snapshot nearest to 1994-01-01; delegates to #near and raises
# whatever #near raises.
# NOTE(review): the 1994 date is presumably chosen to predate every
# capture, so the closest match is the earliest one — confirm.
#
# @param url [String] URL to look up
# @return [Object] the result of #near for the 1994-01-01 timestamp
def oldest(url)
  floor = Timestamp.new(year: 1994, month: 1, day: 1)
  near(url, timestamp: floor)
end

#snapshots(url, **options) ⇒ Object

Returns an Enumerator of Snapshot objects, auto-paginating via resume key unless an explicit page is requested.



49
50
51
52
53
54
55
56
57
58
59
60
# File 'lib/archaeo/cdx_api.rb', line 49

# Builds a lazy Enumerator over the snapshots for a URL.
#
# The URL is normalized and the options validated immediately; no fetch
# helper is called until the enumerator is consumed. When the options
# include :page, fetch_page is used; otherwise fetch_with_resume_key is.
#
# @param url [String] URL to query; normalized first
# @param options [Hash] query options, checked by validate_options!
# @return [Enumerator] yields whatever the fetch helper pushes to the yielder
def snapshots(url, **options)
  target = UrlNormalizer.normalize(url)
  validate_options!(options)

  Enumerator.new do |yielder|
    # An explicit page request bypasses resume-key pagination.
    next fetch_page(target, options, yielder) if options.key?(:page)

    fetch_with_resume_key(target, options, yielder)
  end
end