Class: Spidra::Resources::Crawl

Inherits:
Object
  • Object
show all
Defined in:
lib/spidra/resources/crawl.rb

Constant Summary collapse

TERMINAL_STATUSES =
%w[completed failed cancelled].freeze

Instance Method Summary collapse

Constructor Details

#initialize(http) ⇒ Crawl

Returns a new instance of Crawl.



6
7
8
# File 'lib/spidra/resources/crawl.rb', line 6

# Build a Crawl resource around an HTTP transport.
#
# @param http [Object] transport used for all API calls; must respond to
#   #get and #post (presumably the client's internal HTTP wrapper — confirm)
def initialize(http)
  @http = http
end

Instance Method Details

#extract(job_id, transform_instruction) ⇒ Object

Re-extract data from an existing completed crawl with a new transform instruction. Only transformation credits are charged — no re-crawling happens.

Parameters:

  • job_id (String)

    the completed crawl job ID

  • transform_instruction (String)

    the new extraction prompt



44
45
46
# File 'lib/spidra/resources/crawl.rb', line 44

# Re-run extraction on a finished crawl with a different transform
# instruction. Only transformation credits are charged — no re-crawl.
#
# @param job_id [String] ID of the completed crawl job
# @param transform_instruction [String] the new extraction prompt
# @return [Object] the transport's response to POST /crawl/{job_id}/extract
def extract(job_id, transform_instruction)
  payload = { transformInstruction: transform_instruction }
  @http.post("/crawl/#{job_id}/extract", payload)
end

#get(job_id) ⇒ Object

Get the current status of a crawl job.



29
30
31
# File 'lib/spidra/resources/crawl.rb', line 29

# Fetch the current status of a crawl job.
#
# @param job_id [String] the crawl job ID
# @return [Object] the transport's response to GET /crawl/{job_id}
def get(job_id)
  endpoint = "/crawl/#{job_id}"
  @http.get(endpoint)
end

#history(page = 1, limit = 20) ⇒ Object

List past crawl jobs, newest first.

Parameters:

  • page (Integer) (defaults to: 1)

    page number (default: 1)

  • limit (Integer) (defaults to: 20)

    results per page (default: 20)



19
20
21
# File 'lib/spidra/resources/crawl.rb', line 19

# List past crawl jobs, newest first.
#
# @param page [Integer] page number (default: 1)
# @param limit [Integer] results per page (default: 20)
# @return [Object] the transport's response to GET /crawl/history
def history(page = 1, limit = 20)
  pagination = { page: page, limit: limit }
  # Double-splat keeps the call shape identical whether the transport
  # declares keyword parameters or a trailing options hash.
  @http.get("/crawl/history", **pagination)
end

#pages(job_id) ⇒ Object

Get signed S3 URLs to download the HTML and Markdown for each crawled page. URLs expire after 1 hour.



35
36
37
# File 'lib/spidra/resources/crawl.rb', line 35

# Fetch signed S3 URLs for the HTML and Markdown of each crawled page.
# Per the API docs, the URLs expire after one hour.
#
# @param job_id [String] the crawl job ID
# @return [Object] the transport's response to GET /crawl/{job_id}/pages
def pages(job_id)
  @http.get(format("/crawl/%s/pages", job_id))
end

#run(params, poll_interval: 3, timeout: 300) ⇒ Object

Submit a crawl job and wait for it to complete.

Parameters:

  • params (Hash)

    crawl parameters (base_url, crawl_instruction, etc.)

  • poll_interval (Integer) (defaults to: 3)

    seconds between status checks (default: 3)

  • timeout (Integer) (defaults to: 300)

    seconds before giving up (default: 300)



53
54
55
56
57
58
59
60
61
62
63
64
65
# File 'lib/spidra/resources/crawl.rb', line 53

# Submit a crawl job and block until it reaches a terminal status.
#
# Polls #get every +poll_interval+ seconds. The deadline is tracked with
# the monotonic clock rather than Time.now, so wall-clock adjustments
# (NTP corrections, manual changes) cannot shorten or extend the timeout.
#
# @param params [Hash] crawl parameters (base_url, crawl_instruction, etc.)
# @param poll_interval [Integer] seconds between status checks (default: 3)
# @param timeout [Integer] seconds before giving up (default: 300)
# @return [Hash] the terminal status merged with "jobId", or
#   { "status" => "timeout", "jobId" => ... } if the deadline passes
def run(params, poll_interval: 3, timeout: 300)
  response = submit(params)
  job_id   = response["jobId"]
  deadline = Process.clock_gettime(Process::CLOCK_MONOTONIC) + timeout

  loop do
    status = get(job_id)
    return status.merge("jobId" => job_id) if TERMINAL_STATUSES.include?(status["status"])
    # Timeout is checked after each poll so a job that finishes exactly at
    # the deadline still reports its real terminal status.
    return { "status" => "timeout", "jobId" => job_id } if Process.clock_gettime(Process::CLOCK_MONOTONIC) >= deadline

    sleep poll_interval
  end
end

#statsObject

Get total crawl job count for the authenticated account.



11
12
13
# File 'lib/spidra/resources/crawl.rb', line 11

# Fetch the total crawl job count for the authenticated account.
#
# @return [Object] the transport's response to GET /crawl/stats
def stats
  endpoint = "/crawl/stats"
  @http.get(endpoint)
end

#submit(params) ⇒ Object

Submit a crawl job. Returns immediately with a jobId.



24
25
26
# File 'lib/spidra/resources/crawl.rb', line 24

# Submit a crawl job without waiting for completion.
# Returns immediately; poll #get with the returned jobId.
#
# @param params [Hash] crawl parameters
# @return [Object] the transport's response to POST /crawl
def submit(params)
  crawl_endpoint = "/crawl"
  @http.post(crawl_endpoint, params)
end