Module: ArchiveAPI

Included in:
WaybackMachineDownloader
Defined in:
lib/wayback_machine_downloader/archive_api.rb

Instance Method Summary collapse

Instance Method Details

#get_raw_list_from_api(url, page_index, http) ⇒ Object



6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
# File 'lib/wayback_machine_downloader/archive_api.rb', line 6

# Fetches one page of snapshot rows for +url+ from the Wayback Machine CDX API.
#
# Unless @exact_url is set, a URL that has a host but neither a path nor a
# wildcard gets "/*" appended before it is sent. Anything that raises while
# talking to the API (timeouts, 429/5xx responses, undecodable or malformed
# bodies) is retried with a linearly growing delay; once the retry budget
# (@max_retries, default 3) is spent the failure is logged and [] is returned.
#
# @param url [String, nil] URL or URL pattern to query
# @param page_index [Integer, nil] forwarded to parameters_for_api
# @param http [#request] object that performs the request and returns the response
# @return [Array] the parsed JSON body (expected to be an array of rows) with a
#   leading ["timestamp", "original"] header row removed; [] for an empty body,
#   an unexpected status code, or exhausted retries
def get_raw_list_from_api(url, page_index, http)
  # Automatically append /* for host-only URLs
  # This is a workaround for an issue with the API and *some* domains.
  # See https://github.com/StrawberryMaster/wayback-machine-downloader/issues/6
  # But don't do this when exact_url flag is set, and never append twice
  if url && !@exact_url
    normalized_url = url.to_s
    # Drop the scheme, then any query/fragment. String#split returns [] for an
    # empty string, so coerce with to_s instead of calling include? on nil.
    host_and_rest = normalized_url
      .sub(%r{\Ahttps?://}i, '')
      .split(/[?#]/, 2)
      .first
      .to_s
    needs_wildcard = !host_and_rest.empty? &&
                     !host_and_rest.include?('/') &&
                     !normalized_url.include?('*')

    url = "#{normalized_url}/*" if needs_wildcard
  end

  request_url = URI("https://web.archive.org/cdx/search/cdx")
  params = [["output", "json"], ["url", url]] + parameters_for_api(page_index)
  request_url.query = URI.encode_www_form(params)

  retries = 0
  max_retries = @max_retries || 3
  # Fall back to 2 when the constant is not defined, without masking other errors.
  delay = defined?(WaybackMachineDownloader::RETRY_DELAY) ? WaybackMachineDownloader::RETRY_DELAY : 2

  begin
    request = Net::HTTP::Get.new(request_url)
    request["User-Agent"] = "wmd-straw/#{WaybackMachineDownloader::VERSION}"
    request["Connection"] = "keep-alive"
    request["Accept-Encoding"] = "gzip"
    response = http.request(request)

    case response.code.to_i
    when 200
      raw_body = response.body.to_s
      gzipped = response['content-encoding'].to_s.strip.casecmp?('gzip')
      if gzipped && !raw_body.empty?
        # The block form closes the reader once the payload has been read.
        raw_body = Zlib::GzipReader.wrap(StringIO.new(raw_body)) { |gz| gz.read }
      end
      # Strip in both branches so a whitespace-only payload counts as empty.
      body = raw_body.to_s.strip
      return [] if body.empty?
      begin
        json = JSON.parse(body)
        # check if the response contains the header ["timestamp", "original"]
        json.shift if json.first == ["timestamp", "original"]
        json
      rescue JSON::ParserError => e
        raise "Malformed JSON response: #{e.message}"
      end
    when 429, 500, 502, 503, 504
      # Raised so the rescue below treats these statuses as retryable.
      raise "Server error #{response.code}: #{response.message}"
    else
      warn "Unexpected API response #{response.code} for #{url}"
      []
    end
  rescue StandardError => e
    # Net::ReadTimeout and Net::OpenTimeout are StandardError subclasses, so
    # they are covered here as well.
    if retries < max_retries
      retries += 1
      warn "Error talking to Wayback CDX API (#{e.class}: #{e.message}) for #{url}, retry #{retries}/#{max_retries}..."
      sleep(delay * retries)
      retry
    else
      warn "Giving up on Wayback CDX API for #{url} after #{max_retries} attempts. (Last error: #{e.message})"
      []
    end
  end
end

#parameters_for_api(page_index) ⇒ Object



75
76
77
78
79
80
81
82
83
# File 'lib/wayback_machine_downloader/archive_api.rb', line 75

# Builds the list of [name, value] query pairs added to every CDX request.
#
# "fl" and "gzip" are always present. The remaining pairs depend on instance
# state: "collapse" is skipped when @keep_duplicates or @all_timestamps is set,
# "filter" is skipped when @all is set, and "from"/"to" are only sent for
# timestamps that are set and not 0.
#
# @param page_index [Integer, nil] page to request; no "page" pair when nil/false
# @return [Array<Array>] query pairs in a fixed order
def parameters_for_api(page_index)
  # Each optional entry evaluates to nil when its condition fails; compact
  # then drops those placeholders while keeping the order of the rest.
  [
    ["fl", "timestamp,original"],
    ["gzip", "true"],
    (["collapse", "digest"] unless @keep_duplicates || @all_timestamps),
    (["filter", "statuscode:2..|30[12378]"] unless @all),
    (["from", @from_timestamp.to_s] if @from_timestamp && @from_timestamp != 0),
    (["to", @to_timestamp.to_s] if @to_timestamp && @to_timestamp != 0),
    (["page", page_index] if page_index)
  ].compact
end