6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
|
# File 'lib/wayback_machine_downloader/archive_api.rb', line 6
# Fetches one page of results from the Wayback Machine CDX API and returns
# the parsed JSON rows.
#
# Unless @exact_url is set, a URL that has neither a "*" wildcard nor a path
# component (judged after dropping the scheme, query and fragment) gets "/*"
# appended before it is sent.
#
# HTTP 429/500/502/503/504 responses, malformed JSON and any other
# StandardError raised while performing the request are retried up to
# @max_retries times (3 when unset), sleeping delay * attempt between tries.
# delay is WaybackMachineDownloader::RETRY_DELAY when that constant is
# defined, otherwise 2.
#
# @param url [String, nil] URL or URL pattern to query
# @param page_index [Object] forwarded unchanged to parameters_for_api
# @param http [#request] object used to perform the request
#   (presumably a started Net::HTTP session -- confirm against callers)
# @return [Array] parsed JSON with a leading ["timestamp", "original"] header
#   row removed; [] for an empty body, an unexpected status code, or once
#   all retries are exhausted
def get_raw_list_from_api(url, page_index, http)
  if url && !@exact_url
    normalized_url = url.to_s
    # Only "host[/path]" matters for the path check. `.to_s` covers the case
    # where splitting an empty string yields no elements at all.
    host_and_rest = normalized_url.sub(%r{\Ahttps?://}i, '').split(/[?#]/, 2).first.to_s
    has_wildcard = normalized_url.include?('*')
    has_path = host_and_rest.include?('/')
    url = "#{normalized_url}/*" if !has_wildcard && !has_path
  end

  request_url = URI("https://web.archive.org/cdx/search/cdx")
  params = [["output", "json"], ["url", url]] + parameters_for_api(page_index)
  request_url.query = URI.encode_www_form(params)

  retries = 0
  max_retries = @max_retries || 3
  delay = defined?(WaybackMachineDownloader::RETRY_DELAY) ? WaybackMachineDownloader::RETRY_DELAY : 2

  begin
    request = Net::HTTP::Get.new(request_url)
    request["User-Agent"] = "wmd-straw/#{WaybackMachineDownloader::VERSION}"
    request["Connection"] = "keep-alive"
    # Accept-Encoding is set explicitly, so the body is decoded by hand below.
    request["Accept-Encoding"] = "gzip"
    response = http.request(request)

    case response.code.to_i
    when 200
      raw = response.body.to_s
      # An absent/empty payload means "no rows"; never feed it to the gzip reader.
      return [] if raw.empty?

      # Content-coding tokens are case-insensitive.
      gzipped = response['content-encoding'].to_s.downcase == 'gzip'
      # The block form closes the reader once the payload has been read.
      decoded = gzipped ? Zlib::GzipReader.wrap(StringIO.new(raw), &:read) : raw
      # Strip on both paths so a whitespace-only body counts as empty.
      body = decoded.to_s.strip
      return [] if body.empty?

      begin
        json = JSON.parse(body)
        json.shift if json.first == ["timestamp", "original"]
        json
      rescue JSON::ParserError => e
        # Re-raised as a generic error so the retry logic below handles it.
        raise "Malformed JSON response: #{e.message}"
      end
    when 429, 500, 502, 503, 504
      # Treated as transient: raising hands control to the retry logic below.
      raise "Server error #{response.code}: #{response.message}"
    else
      warn "Unexpected API response #{response.code} for #{url}"
      []
    end
  rescue StandardError => e # also covers Net::OpenTimeout / Net::ReadTimeout
    if retries < max_retries
      retries += 1
      warn "Error talking to Wayback CDX API (#{e.class}: #{e.message}) for #{url}, retry #{retries}/#{max_retries}..."
      sleep(delay * retries) # back-off grows linearly with the attempt number
      retry
    else
      warn "Giving up on Wayback CDX API for #{url} after #{max_retries} attempts. (Last error: #{e.message})"
      []
    end
  end
end
|