Class: Spidra::Resources::Scrape

Inherits:
Object
  • Object
show all
Defined in:
lib/spidra/resources/scrape.rb

Constant Summary collapse

# Status values that end polling in #run: when a job's "status" field is one
# of these, #run returns the status payload instead of polling again.
TERMINAL_STATUSES =
%w[completed failed cancelled].freeze

Instance Method Summary collapse

Constructor Details

#initialize(http) ⇒ Scrape

Returns a new instance of Scrape.



6
7
8
# File 'lib/spidra/resources/scrape.rb', line 6

# Store the HTTP client used for every request made by this resource.
#
# @param http [#get, #post] client object; #get and #submit call `get` and
#   `post` on it respectively
def initialize(http)
  @http = http
end

Instance Method Details

#get(job_id) ⇒ Object

Get the current status of a scrape job.



16
17
18
# File 'lib/spidra/resources/scrape.rb', line 16

# Look up a scrape job by its identifier.
#
# @param job_id [#to_s] identifier interpolated into the request path
# @return [Object] whatever the HTTP client's `get` returns for that path
def get(job_id)
  job_path = "/scrape/#{job_id}"
  @http.get(job_path)
end

#run(params, poll_interval: 3, timeout: 120) ⇒ Object

Submit a scrape job and poll until it reaches a terminal status (completed, failed, or cancelled) or the timeout elapses.

Parameters:

  • params (Hash)

    scrape parameters (urls, prompt, output, etc.)

  • poll_interval (Integer) (defaults to: 3)

    seconds between status checks (default: 3)

  • timeout (Integer) (defaults to: 120)

    seconds before giving up (default: 120)



25
26
27
28
29
30
31
32
33
34
35
36
37
# File 'lib/spidra/resources/scrape.rb', line 25

# Submit a scrape job and poll it until it reaches a terminal status or the
# timeout elapses.
#
# @param params [Hash] scrape parameters, passed unchanged to #submit
# @param poll_interval [Numeric] maximum seconds to wait between status checks
# @param timeout [Numeric] seconds before giving up
# @return [Hash] the status payload merged with "jobId" once its "status" is
#   one of TERMINAL_STATUSES; otherwise
#   { "status" => "timeout", "jobId" => job_id } when the deadline passes
def run(params, poll_interval: 3, timeout: 120)
  job_id = submit(params)["jobId"]
  # Use the monotonic clock so wall-clock adjustments (NTP, manual changes)
  # can neither shorten nor extend the timeout.
  deadline = Process.clock_gettime(Process::CLOCK_MONOTONIC) + timeout

  loop do
    status = get(job_id)
    return status.merge("jobId" => job_id) if TERMINAL_STATUSES.include?(status["status"])

    remaining = deadline - Process.clock_gettime(Process::CLOCK_MONOTONIC)
    return { "status" => "timeout", "jobId" => job_id } unless remaining.positive?

    # Never sleep past the deadline: a long poll_interval must not make the
    # call overshoot the requested timeout.
    sleep [poll_interval, remaining].min
  end
end

#submit(params) ⇒ Object

Submit a scrape job. Returns immediately with a jobId.



11
12
13
# File 'lib/spidra/resources/scrape.rb', line 11

# Create a scrape job without waiting for it to finish.
#
# @param params [Hash] scrape parameters, forwarded unchanged as the request
#   payload
# @return [Object] whatever the HTTP client's `post` returns; #run reads the
#   "jobId" key from it
def submit(params)
  endpoint = "/scrape"
  @http.post(endpoint, params)
end