Class: Archaeo::CdxApi
- Inherits:
-
Object
- Object
- Archaeo::CdxApi
- Defined in:
- lib/archaeo/cdx_api.rb
Overview
Client for the Wayback Machine CDX Server API.
Supports all CDX features: field selection, filtering with regex, collapsing, resume-key pagination, page-based pagination, closest timestamp match, resolve revisits, and counters.
Constant Summary collapse
- ENDPOINT =
"https://web.archive.org/cdx/search/cdx"
- ALL_FIELDS =
%w[ urlkey timestamp original mimetype statuscode digest length ].freeze
- MATCH_TYPES =
%w[exact prefix host domain].freeze
- SORT_ORDERS =
%w[default closest reverse].freeze
- DEFAULT_LIMIT =
25_000
- SCALAR_PARAMS =
{ from: "from", to: "to", match_type: "matchType", sort: "sort", limit: "limit", closest: "closest", offset: "offset", page: "page", page_size: "pageSize", fast_latest: "fastLatest", resolve_revisits: "resolveRevisits", show_dupe_count: "showDupeCount", show_skip_count: "showSkipCount", last_skip_timestamp: "lastSkipTimestamp", }.freeze
Instance Method Summary collapse
- #after(url, timestamp:) ⇒ Object
- #before(url, timestamp:) ⇒ Object
- #between(url, from:, to:, **options) ⇒ Object
-
#composite_snapshot(url, timestamp:, collapse: []) ⇒ Object
Returns one snapshot per unique URL, picking the newest at or before the given timestamp for point-in-time site reconstruction.
- #count(url, **options) ⇒ Object
-
#initialize(client: HttpClient.new, cache_dir: nil) ⇒ CdxApi
constructor
A new instance of CdxApi.
-
#known_urls(domain, match_type: "domain") ⇒ Object
Returns all unique original URLs under a domain.
- #near(url, timestamp:) ⇒ Object
- #newest(url) ⇒ Object
-
#num_pages(url, **options) ⇒ Object
Returns the number of pages for a paginated query.
- #oldest(url) ⇒ Object
-
#snapshots(url, **options) ⇒ Object
Returns an Enumerator of Snapshot objects, auto-paginating via resume key unless an explicit page is requested.
- #timeline(url, from: nil, to: nil, bucket_size: :month, status: 200) ⇒ Object
- #unique_snapshots(url, resolve_revisits: true, **options) ⇒ Object
Constructor Details
#initialize(client: HttpClient.new, cache_dir: nil) ⇒ CdxApi
Returns a new instance of CdxApi.
43 44 45 46 |
# File 'lib/archaeo/cdx_api.rb', line 43
#
# Builds a CDX API client.
#
# @param client [HttpClient] object stored as the HTTP client; a new
#   HttpClient is created when none is given
# @param cache_dir [Object, nil] when truthy, passed to CdxCache.new to
#   create the cache; when nil or false, no cache is used
def initialize(client: HttpClient.new, cache_dir: nil)
  @client = client
  @cache = cache_dir ? CdxCache.new(cache_dir) : nil
end
Instance Method Details
#after(url, timestamp:) ⇒ Object
107 108 109 110 111 112 113 114 |
# File 'lib/archaeo/cdx_api.rb', line 107
#
# Returns the first snapshot yielded by #snapshots (queried with
# sort: "closest" around the given timestamp) that is strictly later than
# that timestamp.
#
# NOTE(review): the `timestamp` identifiers (argument to Timestamp.coerce and
# the snapshot accessor) were dropped from the extracted listing and are
# restored here — confirm against the source file.
#
# @param url [String] URL to query
# @param timestamp [Object] anything accepted by Timestamp.coerce
# @return [Object] the first matching snapshot
# @raise [NoSnapshotFound] when no yielded snapshot is later than the timestamp
def after(url, timestamp:)
  ts = Timestamp.coerce(timestamp)
  snapshots(url, sort: "closest", closest: ts.to_s).each do |snap|
    # Early exit: hand back the first snapshot past the reference point.
    return snap if snap.timestamp > ts
  end
  raise NoSnapshotFound, "No snapshot found after #{ts} for #{url}"
end
#before(url, timestamp:) ⇒ Object
98 99 100 101 102 103 104 105 |
# File 'lib/archaeo/cdx_api.rb', line 98
#
# Returns the first snapshot yielded by #snapshots (queried with
# sort: "closest" around the given timestamp) that is strictly earlier than
# that timestamp.
#
# NOTE(review): the `timestamp` identifiers (argument to Timestamp.coerce and
# the snapshot accessor) were dropped from the extracted listing and are
# restored here — confirm against the source file.
#
# @param url [String] URL to query
# @param timestamp [Object] anything accepted by Timestamp.coerce
# @return [Object] the first matching snapshot
# @raise [NoSnapshotFound] when no yielded snapshot is earlier than the timestamp
def before(url, timestamp:)
  ts = Timestamp.coerce(timestamp)
  snapshots(url, sort: "closest", closest: ts.to_s).each do |snap|
    # Early exit: hand back the first snapshot before the reference point.
    return snap if snap.timestamp < ts
  end
  raise NoSnapshotFound, "No snapshot found before #{ts} for #{url}"
end
#between(url, from:, to:, **options) ⇒ Object
116 117 118 119 120 121 |
# File 'lib/archaeo/cdx_api.rb', line 116
#
# Queries #snapshots restricted to a from/to range.
#
# Both bounds are passed through Timestamp.coerce and converted with #to_s;
# any other keyword options are forwarded to #snapshots unchanged.
#
# @param url [String] URL to query
# @param from [Object] lower bound, anything accepted by Timestamp.coerce
# @param to [Object] upper bound, anything accepted by Timestamp.coerce
# @param options [Hash] extra keyword options forwarded to #snapshots
# @return [Object] whatever #snapshots returns
def between(url, from:, to:, **options)
  snapshots(url,
            from: Timestamp.coerce(from).to_s,
            to: Timestamp.coerce(to).to_s,
            **options)
end
#composite_snapshot(url, timestamp:, collapse: []) ⇒ Object
Returns one snapshot per unique URL, picking the newest at or before the given timestamp for point-in-time site reconstruction.
63 64 65 66 67 68 69 70 71 72 73 74 |
# File 'lib/archaeo/cdx_api.rb', line 63
#
# Returns one snapshot per unique original URL.
#
# Queries #snapshots with to: <timestamp> and sort: "reverse", then keeps
# only the first snapshot seen for each original_url.
#
# NOTE(review): the `timestamp` and `options` identifiers were dropped from
# the extracted listing and are restored here — confirm against the source.
#
# @param url [String] URL to query
# @param timestamp [Object] anything accepted by Timestamp.coerce
# @param collapse [Array] collapse fields; only sent when non-empty
# @return [Array] the retained snapshots, in first-seen order
def composite_snapshot(url, timestamp:, collapse: [])
  ts = Timestamp.coerce(timestamp)
  options = { to: ts.to_s, sort: "reverse" }
  options[:collapse] = collapse unless collapse.empty?

  seen = {}
  snapshots(url, **options).each do |snap|
    key = snap.original_url
    # First occurrence wins; later snapshots for the same URL are ignored.
    seen[key] = snap unless seen.key?(key)
  end
  seen.values
end
#count(url, **options) ⇒ Object
123 124 125 |
# File 'lib/archaeo/cdx_api.rb', line 123
#
# Counts the snapshots returned by #snapshots for the given query.
#
# @param url [String] URL to query
# @param options [Hash] keyword options forwarded to #snapshots
# @return [Integer] result of calling #count on what #snapshots returns
def count(url, **options)
  snapshots(url, **options).count
end
#known_urls(domain, match_type: "domain") ⇒ Object
Returns all unique original URLs under a domain.
162 163 164 165 166 |
# File 'lib/archaeo/cdx_api.rb', line 162
#
# Returns the unique original URLs found for a domain.
#
# The domain is normalized with UrlNormalizer.normalize, queried through
# #snapshots with collapse: ["urlkey"], and the resulting original URLs are
# de-duplicated.
#
# @param domain [String] domain to query
# @param match_type [String] match type forwarded to #snapshots
# @return [Array] unique original_url values
def known_urls(domain, match_type: "domain")
  domain = UrlNormalizer.normalize(domain)
  snapshots(domain, match_type: match_type,
                    collapse: ["urlkey"]).map(&:original_url).uniq
end
#near(url, timestamp:) ⇒ Object
76 77 78 79 80 81 82 83 84 85 86 87 88 |
# File 'lib/archaeo/cdx_api.rb', line 76
#
# Returns the first snapshot from a sort: "closest", limit: 1 query around
# the given timestamp.
#
# NOTE(review): the argument to Timestamp.coerce was dropped from the
# extracted listing and is restored here as `timestamp` — confirm against
# the source file.
#
# @param url [String] URL to query (normalized with UrlNormalizer.normalize)
# @param timestamp [Object] anything accepted by Timestamp.coerce
# @return [Object] the snapshot found
# @raise [BlockedSiteError] when the snapshot found reports blocked?
# @raise [NoSnapshotFound] when the query yields no snapshot
def near(url, timestamp:)
  url = UrlNormalizer.normalize(url)
  ts = Timestamp.coerce(timestamp)
  result = snapshots(url, sort: "closest", closest: ts.to_s,
                          limit: 1).first

  # Safe navigation: a missing result falls through to NoSnapshotFound below.
  if result&.blocked?
    raise BlockedSiteError, "Site is blocked: #{url}"
  end

  result || raise(NoSnapshotFound,
                  "No snapshot found near #{ts} for #{url}")
end
#newest(url) ⇒ Object
94 95 96 |
# File 'lib/archaeo/cdx_api.rb', line 94
#
# Returns the snapshot nearest to the current time, by delegating to #near
# with Timestamp.now.
#
# @param url [String] URL to query
# @return [Object] whatever #near returns
def newest(url)
  near(url, timestamp: Timestamp.now)
end
#num_pages(url, **options) ⇒ Object
Returns the number of pages for a paginated query.
146 147 148 149 150 151 152 153 154 155 156 157 158 159 |
# File 'lib/archaeo/cdx_api.rb', line 146
#
# Returns the number of pages for a paginated query.
#
# Sends a GET request to ENDPOINT with showNumPages=true (plus whatever
# merge_scalar_params! adds from the options) and parses the stripped
# response body as an integer.
#
# NOTE(review): the `options` identifier was dropped from the extracted
# listing and is restored here — confirm against the source file.
#
# @param url [String] URL to query (normalized with UrlNormalizer.normalize)
# @param options [Hash] keyword options handed to merge_scalar_params!
# @return [Integer] response body converted with String#to_i
# @raise [Error] when the response status is not 200
def num_pages(url, **options)
  url = UrlNormalizer.normalize(url)
  params = { "url" => url, "showNumPages" => "true" }
  merge_scalar_params!(params, options)

  response = @client.get(
    "#{ENDPOINT}?#{URI.encode_www_form(params)}",
  )
  unless response.status == 200
    raise Error, "CDX API returned HTTP #{response.status}"
  end

  response.body.strip.to_i
end
#oldest(url) ⇒ Object
90 91 92 |
# File 'lib/archaeo/cdx_api.rb', line 90
#
# Returns the snapshot nearest to 1994-01-01, by delegating to #near with
# that fixed timestamp.
#
# @param url [String] URL to query
# @return [Object] whatever #near returns
def oldest(url)
  near(url, timestamp: Timestamp.new(year: 1994, month: 1, day: 1))
end
#snapshots(url, **options) ⇒ Object
Returns an Enumerator of Snapshot objects, auto-paginating via resume key unless an explicit page is requested.
50 51 52 53 54 55 56 57 58 59 |
# Listing for #snapshots, kept exactly as extracted.
#
# What the listing shows: the URL is normalized with UrlNormalizer.normalize;
# when @cache is set and no :page key is present, the result of
# cached_snapshots is returned; otherwise build_enumerator is called.
#
# NOTE(review): the extraction dropped identifiers from this listing. The
# header above gives the signature as snapshots(url, **options), so the empty
# argument slots and the bare `.key?(:page)` receiver are presumably
# `options`. The bare `()` after the normalize call is a call whose method
# name was also lost and cannot be recovered from this page — restore it from
# lib/archaeo/cdx_api.rb (lines 50-59) before relying on this listing.
# File 'lib/archaeo/cdx_api.rb', line 50 def snapshots(url, **) url = UrlNormalizer.normalize(url) () if @cache && !.key?(:page) return cached_snapshots(url, ) end build_enumerator(url, ) end |
#timeline(url, from: nil, to: nil, bucket_size: :month, status: 200) ⇒ Object
134 135 136 137 138 139 140 141 142 143 |
# File 'lib/archaeo/cdx_api.rb', line 134
#
# Builds a CdxTimeline from the snapshots matching the query.
#
# Only the options that were actually supplied are sent: from/to are passed
# through Timestamp.coerce and #to_s, and a status filter built by
# CdxFilter.by_status is added unless status is nil or false.
#
# NOTE(review): the `options` identifier was dropped from the extracted
# listing and is restored here — confirm against the source file.
#
# @param url [String] URL to query
# @param from [Object, nil] optional lower bound for Timestamp.coerce
# @param to [Object, nil] optional upper bound for Timestamp.coerce
# @param bucket_size [Symbol] forwarded to CdxTimeline.new
# @param status [Object, nil] value handed to CdxFilter.by_status
# @return [CdxTimeline] timeline built from the collected snapshots
def timeline(url, from: nil, to: nil, bucket_size: :month, status: 200)
  options = {}
  options[:from] = Timestamp.coerce(from).to_s if from
  options[:to] = Timestamp.coerce(to).to_s if to
  options[:filters] = [CdxFilter.by_status(status)] if status

  # Materialize the enumeration before handing it to the timeline.
  snaps = snapshots(url, **options).to_a
  CdxTimeline.new(snaps, bucket_size: bucket_size)
end
#unique_snapshots(url, resolve_revisits: true, **options) ⇒ Object
127 128 129 130 131 132 |
# File 'lib/archaeo/cdx_api.rb', line 127
#
# Queries #snapshots with collapse: ["digest"].
#
# @param url [String] URL to query
# @param resolve_revisits [Boolean] forwarded to #snapshots as
#   resolve_revisits:
# @param options [Hash] extra keyword options forwarded to #snapshots
# @return [Object] whatever #snapshots returns
def unique_snapshots(url, resolve_revisits: true, **options)
  snapshots(url,
            collapse: ["digest"],
            resolve_revisits: resolve_revisits,
            **options)
end