Class: Archaeo::CdxApi

Inherits:
Object
  • Object
show all
Defined in:
lib/archaeo/cdx_api.rb

Overview

Client for the Wayback Machine CDX Server API.

Supports all CDX features: field selection, filtering with regex, collapsing, resume-key pagination, page-based pagination, closest timestamp match, resolve revisits, and counters.

Constant Summary collapse

# URL that CDX query strings are appended to when building a request.
ENDPOINT =
"https://web.archive.org/cdx/search/cdx"
# Names of the CDX record fields known to this client.
ALL_FIELDS =
%w[
  urlkey timestamp original
  mimetype statuscode digest length
].freeze
# Match type names; presumably the accepted values for the :match_type
# option (the validation code is not visible in this section).
MATCH_TYPES =
%w[exact prefix host domain].freeze
# Sort order names; presumably the accepted values for the :sort option
# (the validation code is not visible in this section).
SORT_ORDERS =
%w[default closest reverse].freeze
# NOTE(review): where this limit is applied is not visible in this section.
DEFAULT_LIMIT =
25_000
# Maps Ruby-style option keys (symbols) to the camelCase query parameter
# names they correspond to in a CDX request.
SCALAR_PARAMS =
{
  from: "from",
  to: "to",
  match_type: "matchType",
  sort: "sort",
  limit: "limit",
  closest: "closest",
  offset: "offset",
  page: "page",
  page_size: "pageSize",
  fast_latest: "fastLatest",
  resolve_revisits: "resolveRevisits",
  show_dupe_count: "showDupeCount",
  show_skip_count: "showSkipCount",
  last_skip_timestamp: "lastSkipTimestamp",
}.freeze

Instance Method Summary collapse

Constructor Details

#initialize(client: HttpClient.new, cache_dir: nil) ⇒ CdxApi

Returns a new instance of CdxApi.



43
44
45
46
# File 'lib/archaeo/cdx_api.rb', line 43

# Builds a CDX API client.
#
# @param client [Object] HTTP client used for requests; a fresh HttpClient
#   is created when none is given
# @param cache_dir [Object, nil] when truthy, passed to CdxCache.new to set
#   up a cache; when nil/false no cache is used (presumably a directory
#   path -- confirm against CdxCache)
def initialize(client: HttpClient.new, cache_dir: nil)
  @cache = (CdxCache.new(cache_dir) if cache_dir)
  @client = client
end

Instance Method Details

#after(url, timestamp:) ⇒ Object

Raises:

  • (NoSnapshotFound) — if no snapshot with a timestamp later than the given one is found



107
108
109
110
111
112
113
114
# File 'lib/archaeo/cdx_api.rb', line 107

# Finds the first snapshot, in closest-to-timestamp order, whose timestamp
# is strictly later than the given one.
#
# @param url [String] URL to look up
# @param timestamp [Object] anything Timestamp.coerce accepts
# @return [Object] the matching snapshot
# @raise [NoSnapshotFound] when no yielded snapshot is later than timestamp
def after(url, timestamp:)
  ts = Timestamp.coerce(timestamp)
  candidates = snapshots(url, sort: "closest", closest: ts.to_s)
  # `find` stops at the first hit, so no further results are consumed.
  later = candidates.find { |snap| snap.timestamp > ts }
  return later if later

  raise NoSnapshotFound, "No snapshot found after #{ts} for #{url}"
end

#before(url, timestamp:) ⇒ Object

Raises:

  • (NoSnapshotFound) — if no snapshot with a timestamp earlier than the given one is found



98
99
100
101
102
103
104
105
# File 'lib/archaeo/cdx_api.rb', line 98

# Finds the first snapshot, in closest-to-timestamp order, whose timestamp
# is strictly earlier than the given one.
#
# @param url [String] URL to look up
# @param timestamp [Object] anything Timestamp.coerce accepts
# @return [Object] the matching snapshot
# @raise [NoSnapshotFound] when no yielded snapshot is earlier than timestamp
def before(url, timestamp:)
  ts = Timestamp.coerce(timestamp)
  candidates = snapshots(url, sort: "closest", closest: ts.to_s)
  # `find` stops at the first hit, so no further results are consumed.
  earlier = candidates.find { |snap| snap.timestamp < ts }
  return earlier if earlier

  raise NoSnapshotFound, "No snapshot found before #{ts} for #{url}"
end

#between(url, from:, to:, **options) ⇒ Object



116
117
118
119
120
121
# File 'lib/archaeo/cdx_api.rb', line 116

# Queries snapshots bounded by a from/to timestamp pair.
#
# @param url [String] URL to look up
# @param from [Object] lower bound; coerced with Timestamp.coerce
# @param to [Object] upper bound; coerced with Timestamp.coerce
# @param options [Hash] extra options forwarded unchanged to #snapshots
# @return [Object] whatever #snapshots returns for the bounded query
def between(url, from:, to:, **options)
  bounds = {
    from: Timestamp.coerce(from).to_s,
    to: Timestamp.coerce(to).to_s,
  }
  snapshots(url, **bounds, **options)
end

#composite_snapshot(url, timestamp:, collapse: []) ⇒ Object

Returns one snapshot per unique URL, picking the newest at or before the given timestamp for point-in-time site reconstruction.



63
64
65
66
67
68
69
70
71
72
73
74
# File 'lib/archaeo/cdx_api.rb', line 63

# Returns one snapshot per distinct original URL, keeping the first one
# yielded by a reverse-sorted query bounded above by the given timestamp.
#
# @param url [String] URL to look up
# @param timestamp [Object] upper bound; coerced with Timestamp.coerce
# @param collapse [Array] collapse entries; only sent when non-empty
# @return [Array] snapshots, one per unique original_url
def composite_snapshot(url, timestamp:, collapse: [])
  cutoff = Timestamp.coerce(timestamp)
  query = { to: cutoff.to_s, sort: "reverse" }
  query[:collapse] = collapse unless collapse.empty?

  # uniq keeps the first element seen for each key, i.e. the first snapshot
  # the reverse-sorted query yields for every original URL.
  snapshots(url, **query).uniq(&:original_url)
end

#count(url, **options) ⇒ Object



123
124
125
# File 'lib/archaeo/cdx_api.rb', line 123

# Counts the snapshots a query yields by iterating the whole result.
#
# @param url [String] URL to look up
# @param options [Hash] options forwarded unchanged to #snapshots
# @return [Integer] number of snapshots yielded
def count(url, **options)
  matches = snapshots(url, **options)
  matches.count
end

#known_urls(domain, match_type: "domain") ⇒ Object

Returns all unique original URLs under a domain.



162
163
164
165
166
# File 'lib/archaeo/cdx_api.rb', line 162

# Lists the distinct original URLs captured under a domain.
#
# @param domain [String] domain to query; normalized with UrlNormalizer
# @param match_type [String] match type sent with the query
# @return [Array] unique original_url values, in first-seen order
def known_urls(domain, match_type: "domain")
  normalized = UrlNormalizer.normalize(domain)
  captures = snapshots(normalized,
                       match_type: match_type,
                       collapse: ["urlkey"])
  captures.map(&:original_url).uniq
end

#near(url, timestamp:) ⇒ Object



76
77
78
79
80
81
82
83
84
85
86
87
88
# File 'lib/archaeo/cdx_api.rb', line 76

# Returns the single snapshot closest to the given timestamp.
#
# @param url [String] URL to look up; normalized with UrlNormalizer
# @param timestamp [Object] anything Timestamp.coerce accepts
# @return [Object] the closest snapshot
# @raise [BlockedSiteError] when the closest snapshot reports blocked?
# @raise [NoSnapshotFound] when the query yields nothing
def near(url, timestamp:)
  url = UrlNormalizer.normalize(url)
  ts = Timestamp.coerce(timestamp)
  closest = snapshots(url, sort: "closest",
                           closest: ts.to_s, limit: 1).first

  if closest.nil?
    raise NoSnapshotFound,
          "No snapshot found near #{ts} for #{url}"
  end
  raise BlockedSiteError, "Site is blocked: #{url}" if closest.blocked?

  closest
end

#newest(url) ⇒ Object



94
95
96
# File 'lib/archaeo/cdx_api.rb', line 94

# Returns the snapshot closest to the current time, as reported by
# Timestamp.now.
#
# @param url [String] URL to look up
# @return [Object] whatever #near returns
def newest(url)
  now = Timestamp.now
  near(url, timestamp: now)
end

#num_pages(url, **options) ⇒ Object

Returns the number of pages for a paginated query.



146
147
148
149
150
151
152
153
154
155
156
157
158
159
# File 'lib/archaeo/cdx_api.rb', line 146

# Asks the CDX endpoint how many pages a query spans.
#
# @param url [String] URL to look up; normalized with UrlNormalizer
# @param options [Hash] options handed to merge_scalar_params! to extend
#   the query parameters
# @return [Integer] the response body parsed with String#to_i
# @raise [Error] when the response status is not 200
def num_pages(url, **options)
  query = {
    "url" => UrlNormalizer.normalize(url),
    "showNumPages" => "true",
  }
  merge_scalar_params!(query, options)

  request_url = "#{ENDPOINT}?#{URI.encode_www_form(query)}"
  response = @client.get(request_url)
  raise Error, "CDX API returned HTTP #{response.status}" unless response.status == 200

  response.body.strip.to_i
end

#oldest(url) ⇒ Object



90
91
92
# File 'lib/archaeo/cdx_api.rb', line 90

# Returns the snapshot closest to 1994-01-01, used here as the reference
# point for finding the earliest capture.
#
# @param url [String] URL to look up
# @return [Object] whatever #near returns
def oldest(url)
  reference = Timestamp.new(year: 1994, month: 1, day: 1)
  near(url, timestamp: reference)
end

#snapshots(url, **options) ⇒ Object

Returns an Enumerator of Snapshot objects, auto-paginating via resume key unless an explicit page is requested.



50
51
52
53
54
55
56
57
58
59
# File 'lib/archaeo/cdx_api.rb', line 50

# Entry point for snapshot queries: normalizes the URL, validates the
# options, then serves the query from the cache or builds a live enumerator.
#
# @param url [String] URL to look up; normalized with UrlNormalizer
# @param options [Hash] query options, checked by validate_options!
# @return [Object] result of cached_snapshots when a cache is configured and
#   no :page key was passed; otherwise the result of build_enumerator
def snapshots(url, **options)
  url = UrlNormalizer.normalize(url)
  validate_options!(options)

  # An explicit :page request always bypasses the cache.
  use_cache = @cache && !options.key?(:page)
  use_cache ? cached_snapshots(url, options) : build_enumerator(url, options)
end

#timeline(url, from: nil, to: nil, bucket_size: :month, status: 200) ⇒ Object



134
135
136
137
138
139
140
141
142
143
# File 'lib/archaeo/cdx_api.rb', line 134

# Collects the snapshots for a URL and wraps them in a CdxTimeline.
#
# @param url [String] URL to look up
# @param from [Object, nil] optional lower bound; coerced when truthy
# @param to [Object, nil] optional upper bound; coerced when truthy
# @param bucket_size [Symbol] passed through to CdxTimeline.new
# @param status [Object, nil] when truthy, adds a CdxFilter.by_status
#   filter; pass nil to skip the status filter
# @return [CdxTimeline] timeline built from the fully materialized results
def timeline(url, from: nil, to: nil,
             bucket_size: :month, status: 200)
  # Only bounds that were actually supplied become query options.
  supplied = { from: from, to: to }.select { |_name, value| value }
  query = supplied.transform_values { |value| Timestamp.coerce(value).to_s }
  query[:filters] = [CdxFilter.by_status(status)] if status

  CdxTimeline.new(snapshots(url, **query).to_a, bucket_size: bucket_size)
end

#unique_snapshots(url, resolve_revisits: true, **options) ⇒ Object



127
128
129
130
131
132
# File 'lib/archaeo/cdx_api.rb', line 127

# Queries snapshots collapsed on the digest field, so consecutive captures
# with the same content digest are reported once.
#
# A caller-supplied :collapse option is appended after "digest" instead of
# replacing it; previously the splatted options overrode the digest
# collapse and the result was no longer digest-unique.
#
# @param url [String] URL to look up
# @param resolve_revisits [Boolean] forwarded as the :resolve_revisits option
# @param options [Hash] extra options forwarded to #snapshots; :collapse may
#   be a single entry or an array of entries
# @return [Object] whatever #snapshots returns for the collapsed query
def unique_snapshots(url, resolve_revisits: true, **options)
  # `options` is a fresh hash built from the keyword splat, so deleting
  # from it does not touch the caller's hash.
  extra_collapse = Array(options.delete(:collapse))
  collapse = (["digest"] + extra_collapse).uniq

  snapshots(url,
            collapse: collapse,
            resolve_revisits: resolve_revisits,
            **options)
end