Class: Spidra::Resources::Crawl

Inherits:
Object
  • Object
show all
Defined in:
lib/spidra/resources/crawl.rb

Constant Summary collapse

TERMINAL_STATUSES =
%w[completed failed cancelled].freeze

Instance Method Summary collapse

Constructor Details

#initialize(http) ⇒ Crawl

Returns a new instance of Crawl.



6
7
8
# File 'lib/spidra/resources/crawl.rb', line 6

# Build a Crawl resource around an HTTP transport.
#
# @param http [Object] transport used for all API calls; must respond to
#   #get and #post (presumably the client's internal HTTP wrapper — confirm)
def initialize(http)
  @http = http
end

Instance Method Details

#extract(job_id, transform_instruction) ⇒ Object

Re-extract data from an existing completed crawl with a new transform instruction. Only transformation credits are charged — no re-crawling happens.

Parameters:

  • job_id (String)

    the completed crawl job ID

  • transform_instruction (String)

    the new extraction prompt



44
45
46
# File 'lib/spidra/resources/crawl.rb', line 44

# Re-run extraction on a finished crawl with a different transform
# instruction. Only transformation credits are charged — no re-crawl.
#
# @param job_id [String] ID of the completed crawl job
# @param transform_instruction [String] the new extraction prompt
# @return [Object] the transport's response to POST /crawl/{job_id}/extract
def extract(job_id, transform_instruction)
  payload = { transformInstruction: transform_instruction }
  @http.post("/crawl/#{job_id}/extract", payload)
end

#get(job_id) ⇒ Object

Get the current status of a crawl job.



29
30
31
# File 'lib/spidra/resources/crawl.rb', line 29

# Fetch the current status of a crawl job.
#
# @param job_id [String] the crawl job ID
# @return [Object] the transport's response to GET /crawl/{job_id}
def get(job_id)
  endpoint = "/crawl/#{job_id}"
  @http.get(endpoint)
end

#history(page = 1, limit = 20) ⇒ Object

List past crawl jobs, newest first.

Parameters:

  • page (Integer) (defaults to: 1)

    page number (default: 1)

  • limit (Integer) (defaults to: 20)

    results per page (default: 20)



19
20
21
# File 'lib/spidra/resources/crawl.rb', line 19

# List past crawl jobs, newest first.
#
# @param page [Integer] page number (default: 1)
# @param limit [Integer] results per page (default: 20)
# @return [Object] the transport's response to GET /crawl/history
def history(page = 1, limit = 20)
  pagination = { page: page, limit: limit }
  # Double-splat keeps the call shape identical whether the transport
  # declares keyword parameters or a trailing options hash.
  @http.get("/crawl/history", **pagination)
end

#pages(job_id) ⇒ Object

Get signed S3 URLs to download the HTML and Markdown for each crawled page. URLs expire after 1 hour.



35
36
37
# File 'lib/spidra/resources/crawl.rb', line 35

# Fetch signed S3 URLs for the HTML and Markdown of each crawled page.
# Per the API docs, the URLs expire after one hour.
#
# @param job_id [String] the crawl job ID
# @return [Object] the transport's response to GET /crawl/{job_id}/pages
def pages(job_id)
  @http.get(format("/crawl/%s/pages", job_id))
end

#run(params, poll_interval: 3, timeout: 300) ⇒ Object

Submit a crawl job and wait for it to complete.

Parameters:

  • params (Hash)

    crawl parameters (base_url, crawl_instruction, etc.)

  • poll_interval (Integer) (defaults to: 3)

    seconds between status checks (default: 3)

  • timeout (Integer) (defaults to: 300)

    seconds before giving up (default: 300)



53
54
55
56
57
58
59
60
61
62
63
64
65
# File 'lib/spidra/resources/crawl.rb', line 53

# Submit a crawl job and block until it reaches a terminal status.
#
# Polls #get every +poll_interval+ seconds. The deadline is tracked with
# the monotonic clock rather than Time.now, so wall-clock adjustments
# (NTP corrections, manual changes) cannot shorten or extend the timeout.
#
# @param params [Hash] crawl parameters (base_url, crawl_instruction, etc.)
# @param poll_interval [Integer] seconds between status checks (default: 3)
# @param timeout [Integer] seconds before giving up (default: 300)
# @return [Hash] the terminal status merged with "jobId", or
#   { "status" => "timeout", "jobId" => ... } if the deadline passes
def run(params, poll_interval: 3, timeout: 300)
  response = submit(params)
  job_id   = response["jobId"]
  deadline = Process.clock_gettime(Process::CLOCK_MONOTONIC) + timeout

  loop do
    status = get(job_id)
    return status.merge("jobId" => job_id) if TERMINAL_STATUSES.include?(status["status"])
    # Timeout is checked after each poll so a job that finishes exactly at
    # the deadline still reports its real terminal status.
    return { "status" => "timeout", "jobId" => job_id } if Process.clock_gettime(Process::CLOCK_MONOTONIC) >= deadline

    sleep poll_interval
  end
end

#statsObject

Get total crawl job count for the authenticated account.



11
12
13
# File 'lib/spidra/resources/crawl.rb', line 11

# Fetch the total crawl job count for the authenticated account.
#
# @return [Object] the transport's response to GET /crawl/stats
def stats
  endpoint = "/crawl/stats"
  @http.get(endpoint)
end

#submit(params) ⇒ Object

Submit a crawl job. Returns immediately with a jobId.



24
25
26
# File 'lib/spidra/resources/crawl.rb', line 24

# Submit a crawl job without waiting for completion.
# Returns immediately; poll #get with the returned jobId.
#
# @param params [Hash] crawl parameters
# @return [Object] the transport's response to POST /crawl
def submit(params)
  crawl_endpoint = "/crawl"
  @http.post(crawl_endpoint, params)
end