Module: Scrapetor

Defined in: lib/scrapetor.rb,
lib/scrapetor/dom.rb,
lib/scrapetor/sax.rb,
lib/scrapetor/url.rb,
lib/scrapetor/form.rb,
lib/scrapetor/http.rb,
lib/scrapetor/node.rb,
lib/scrapetor/money.rb,
lib/scrapetor/xpath.rb,
lib/scrapetor/errors.rb,
lib/scrapetor/native.rb,
lib/scrapetor/robots.rb,
lib/scrapetor/schema.rb,
lib/scrapetor/stream.rb,
lib/scrapetor/builder.rb,
lib/scrapetor/cleaner.rb,
lib/scrapetor/fetcher.rb,
lib/scrapetor/session.rb,
lib/scrapetor/sitemap.rb,
lib/scrapetor/version.rb,
lib/scrapetor/document.rb,
lib/scrapetor/encoding.rb,
lib/scrapetor/entities.rb,
lib/scrapetor/node_set.rb,
lib/scrapetor/selector.rb,
lib/scrapetor/extractor.rb,
lib/scrapetor/microdata.rb,
lib/scrapetor/page_type.rb,
lib/scrapetor/text_node.rb,
lib/scrapetor/dom/parser.rb,
lib/scrapetor/native_dom.rb,
lib/scrapetor/pagination.rb,
lib/scrapetor/fingerprint.rb,
lib/scrapetor/comment_node.rb,
lib/scrapetor/dom/selectors.rb,
lib/scrapetor/structured_data.rb,
lib/scrapetor/persistent_cache.rb,
lib/scrapetor/template_registry.rb,
ext/scrapetor/native/scrapetor_native.c

Defined Under Namespace

Modules: Cleaner, Dom, Encoding, Entities, Extractor, Fetcher, Fingerprint, HTML, HTML5, HTTP, Microdata, Money, Native, PageType, Pagination, PersistentCache, RDFa, SAX, Selector, Sitemap, StructuredData, URL, XPath

Classes: Builder, CommentNode, Document, Error, ExtractionError, Form, Node, NodeSet, Robots, Schema, SchemaError, Session, Stream, TemplateRegistry, TextNode

Constant Summary collapse

VERSION =

"0.2.0"

Class Method Summary collapse

.default_parallel_threads(n_items) ⇒ Object
.each_page(start_url, **opts, &block) ⇒ Object

Top-level shorthand.
.extract(html, schema = nil, base_url: nil, &block) ⇒ Object
.extract_file(path, schema, base_url: nil) ⇒ Object

Run an extraction schema directly against a file or IO.
.extract_native(html, schema, base_url: nil) ⇒ Object

Force the native streaming path.
.extract_ruby(html, schema, base_url: nil) ⇒ Object

Force the Ruby reference path.
.fetch(url, **opts) ⇒ Object

Module-level shortcut.
.fetch_extract(url, schema, **opts) ⇒ Object
.fetch_http2(url, **opts) ⇒ Object

Top-level shorthand for the libcurl path.
.HTML(html, base_url = nil) ⇒ Object

`Scrapetor::HTML(html)` — capital-H convenience method.
.HTML5(*args, &block) ⇒ Object

`Scrapetor::HTML5(html)` — same parser, alternate name.
.parallel_parse(htmls, threads: nil) ⇒ Object

Parse N documents in parallel via native pthread workers, releasing the GVL for the duration.
.parse(html, base_url: nil, build_indexes: false) ⇒ Object

Parsing entry points.
.parse_file(path, base_url: nil) ⇒ Object
.parse_fragment(html, base_url: nil) ⇒ Object
.parse_html(html, base_url: nil) ⇒ Object
.parse_io(io, base_url: nil) ⇒ Object

Parse from an arbitrary IO-like (responds to `read`) or a file path.
.schema(&block) ⇒ Object

Extraction DSL.
.stream(source, outer:, fields: nil, chunk_size: Stream::DEFAULT_CHUNK, &block) ⇒ Object

Class Method Details

.default_parallel_threads(n_items) ⇒ `Object`

# File 'lib/scrapetor.rb', line 103

# Picks a default worker count for a batch of +n_items+ documents.
#
# The count is the smaller of +n_items+ and the processor count reported by
# Etc.nprocessors, but never less than 1. When the processor count cannot be
# determined, 4 is assumed.
#
# @param n_items [Integer] number of work items in the batch
# @return [Integer] worker count, always >= 1
def self.default_parallel_threads(n_items)
  cpu = begin
    require "etc"
    Etc.nprocessors
  rescue StandardError, LoadError, NotImplementedError
    # LoadError and NotImplementedError descend from ScriptError, not
    # StandardError, so they must be listed explicitly for the fallback
    # to apply when "etc" cannot be loaded or the platform lacks support.
    4
  end
  # Zero or negative worker counts are never usable; clamp to one worker.
  [[n_items, cpu].min, 1].max
end

.each_page(start_url, **opts, &block) ⇒ `Object`

Top-level shorthand.



106
107
108

# File 'lib/scrapetor/pagination.rb', line 106

# Top-level shorthand: forwards +start_url+, every keyword option and the
# given block unchanged to Pagination.each_page, and returns whatever that
# call returns.
def self.each_page(start_url, **opts, &block)
  Pagination.each_page(start_url, **opts, &block)
end

.extract(html, schema = nil, base_url: nil, &block) ⇒ `Object`



150
151
152

# File 'lib/scrapetor.rb', line 150

# Parses +html+ (resolving against +base_url+ when given) and runs the
# extraction on the resulting document.
#
# @param html the markup handed to parse
# @param schema optional schema passed through to the document's #extract
# @param base_url optional base URL passed through to parse
# @yield forwarded to the document's #extract
# @return whatever the document's #extract returns
def self.extract(html, schema = nil, base_url: nil, &block)
  document = parse(html, base_url: base_url)
  document.extract(schema, &block)
end

.extract_file(path, schema, base_url: nil) ⇒ `Object`

Run an extraction schema directly against a file or IO.



114
115
116

# File 'lib/scrapetor.rb', line 114

# Runs an extraction schema directly against a file or IO.
#
# @param path a filesystem path, or any object responding to +read+
#   (for example an open File or a StringIO)
# @param schema the schema passed through to extract
# @param base_url optional base URL passed through to extract
# @return whatever extract returns
def self.extract_file(path, schema, base_url: nil)
  # IO-like sources are read directly; anything else is treated as a path.
  html = path.respond_to?(:read) ? path.read : File.read(path)
  extract(html, schema, base_url: base_url)
end

.extract_native(html, schema, base_url: nil) ⇒ `Object`

Force the native streaming path. Raises if the schema can’t compile.

Raises:

(Error)

# File 'lib/scrapetor.rb', line 155

# Forces the native extraction path.
#
# @param html converted with #to_s before being handed to Native.extract
# @param schema compiled via Native.compile_descriptor
# @param base_url optional base URL, passed positionally to Native.extract
# @return whatever Native.extract returns
# @raise [Error] when Native.available? is false, or when
#   Native.compile_descriptor returns a falsy descriptor for +schema+
def self.extract_native(html, schema, base_url: nil)
  unless Native.available?
    raise Error, "native extension not loaded"
  end

  descriptor = Native.compile_descriptor(schema)
  if descriptor
    Native.extract(html.to_s, descriptor, base_url)
  else
    raise Error, "schema not native-compilable"
  end
end

.extract_ruby(html, schema, base_url: nil) ⇒ `Object`

Force the Ruby reference path. Useful for parity tests + benchmarks.

# File 'lib/scrapetor.rb', line 163

# Forces the Ruby reference extraction path: parses +html+ and hands the
# document, its backing object and +schema+ to Extractor.run.
#
# @param html the markup handed to parse
# @param schema the schema passed to Extractor.run
# @param base_url optional base URL passed through to parse
# @return whatever Extractor.run returns
def self.extract_ruby(html, schema, base_url: nil)
  document = parse(html, base_url: base_url)
  backing = document.backing
  Extractor.run(document, backing, schema)
end

.fetch(url, **opts) ⇒ `Object`

Module-level shortcut. Most users only want this.



107
108
109

# File 'lib/scrapetor/http.rb', line 107

# Module-level shortcut: forwards +url+ and all keyword options unchanged to
# HTTP.fetch and returns its result.
def self.fetch(url, **opts)
  HTTP.fetch(url, **opts)
end

.fetch_extract(url, schema, **opts) ⇒ `Object`



111
112
113

# File 'lib/scrapetor/http.rb', line 111

# Module-level shortcut: forwards +url+, +schema+ and all keyword options
# unchanged to HTTP.fetch_extract and returns its result.
def self.fetch_extract(url, schema, **opts)
  HTTP.fetch_extract(url, schema, **opts)
end

.fetch_http2(url, **opts) ⇒ `Object`

Top-level shorthand for the libcurl path. Distinct from Scrapetor.fetch (Net::HTTP) so callers can opt in to HTTP/2 and connection reuse where they are actually available.



387
388
389

# File 'lib/scrapetor/fetcher.rb', line 387

# Top-level shorthand for the Fetcher-based path: forwards +url+ and all
# keyword options unchanged to Fetcher.fetch (as opposed to Scrapetor.fetch,
# which goes through HTTP.fetch) and returns its result.
def self.fetch_http2(url, **opts)
  Fetcher.fetch(url, **opts)
end

.HTML(html, base_url = nil) ⇒ `Object`

`Scrapetor::HTML(html)` — capital-H convenience method.



61
62
63

# File 'lib/scrapetor.rb', line 61

# Capital-H convenience method. Unlike parse, +base_url+ is an optional
# positional argument here; it is passed on to parse as the +base_url:+
# keyword. No other parse options are forwarded.
def self.HTML(html, base_url = nil)
  parse(html, base_url: base_url)
end

.HTML5(*args, &block) ⇒ `Object`

`Scrapetor::HTML5(html)` — same parser, alternate name.



119
120
121

# File 'lib/scrapetor.rb', line 119

# `Scrapetor::HTML5(html)` — same parser as parse under an alternate name.
#
# Positional arguments, keyword options (such as +base_url:+ and
# +build_indexes:+) and the block are all forwarded to parse. Keywords are
# captured separately with +**opts+ because, on Ruby 3, a bare +*args+
# would turn them into a trailing positional Hash and parse would then
# reject the call with an ArgumentError.
#
# @return whatever parse returns
def self.HTML5(*args, **opts, &block)
  parse(*args, **opts, &block)
end

.parallel_parse(htmls, threads: nil) ⇒ `Object`

Parse N documents in parallel via native pthread workers, releasing the GVL for the duration. Returns Array<Scrapetor::Document> in the same order as the input. Skips the in-memory parse cache (which is GVL-bound); use single-document Scrapetor.parse for cache-friendly workloads.

Use this for batch jobs over distinct documents where parsing dominates: pre-warming a fixture corpus, indexing a crawl, A/B comparing parsed shapes. Falls through to a serial parse when only one document is provided.

# File 'lib/scrapetor.rb', line 92

# Parses several documents through Native::Document.parallel_parse and wraps
# each native result in a Document, preserving input order.
#
# An empty input returns an empty array without parsing; a single input
# goes through the ordinary parse entry point instead of the native batch.
#
# @param htmls one source or a collection of sources (coerced with Array())
# @param threads optional worker count; defaults to
#   default_parallel_threads for the batch size
# @return [Array] one entry per input, in input order
def self.parallel_parse(htmls, threads: nil)
  sources = Array(htmls)
  case sources.size
  when 0 then return []
  when 1 then return [parse(sources.first)]
  end

  worker_count = threads || default_parallel_threads(sources.size)
  parsed = Native::Document.parallel_parse(sources, worker_count)
  # Pair every native result with the source at the same position.
  parsed.zip(sources).map do |native, source|
    Document.new(source, native: native)
  end
end

.parse(html, base_url: nil, build_indexes: false) ⇒ `Object`

Parsing entry points.

# File 'lib/scrapetor.rb', line 44

# Builds a Document for +html+, consulting PersistentCache when it is
# enabled and +html+ is a non-empty String.
#
# On a cache hit the cached value is handed to Document.new as +native:+.
# On a miss the document is built normally and its backing native object is
# stored afterwards; any StandardError raised while storing is swallowed so
# that caching stays best-effort.
#
# @param html the source handed to Document.new
# @param base_url optional base URL passed to Document.new
# @param build_indexes passed to Document.new
# @return the Document that was built
def self.parse(html, base_url: nil, build_indexes: false)
  # Evaluated once before the lookup and once before the store.
  cacheable = lambda do
    PersistentCache.enabled? && html.is_a?(String) && !html.empty?
  end

  if cacheable.call
    hit = PersistentCache.load(html)
    if hit
      return Document.new(html, base_url: base_url,
                          build_indexes: build_indexes, native: hit)
    end
  end

  Document.new(html, base_url: base_url, build_indexes: build_indexes).tap do |fresh|
    next unless cacheable.call

    begin
      PersistentCache.store(html, fresh.backing.native)
    rescue StandardError
      nil
    end
  end
end

.parse_file(path, base_url: nil) ⇒ `Object`



78
79
80

# File 'lib/scrapetor.rb', line 78

# Reads the entire file at +path+ with File.read and parses its contents,
# passing +base_url+ through to parse.
def self.parse_file(path, base_url: nil)
  parse(File.read(path), base_url: base_url)
end

.parse_fragment(html, base_url: nil) ⇒ `Object`



69
70
71

# File 'lib/scrapetor.rb', line 69

# Alias-style entry point: calls parse with the same +html+ and +base_url+.
# No fragment-specific handling happens in this method itself.
def self.parse_fragment(html, base_url: nil)
  parse(html, base_url: base_url)
end

.parse_html(html, base_url: nil) ⇒ `Object`



65
66
67

# File 'lib/scrapetor.rb', line 65

# Alias-style entry point: calls parse with the same +html+ and +base_url+.
def self.parse_html(html, base_url: nil)
  parse(html, base_url: base_url)
end

.parse_io(io, base_url: nil) ⇒ `Object`

Parse from an arbitrary IO-like (responds to `read`) or a file path.



74
75
76

# File 'lib/scrapetor.rb', line 74

# Parses from an arbitrary IO-like object or from a file path.
#
# @param io any object responding to +read+ (File, StringIO, ...), or a
#   filesystem path whose contents are read with File.read
# @param base_url optional base URL passed through to parse
# @return whatever parse returns
def self.parse_io(io, base_url: nil)
  # A plain path does not respond to +read+, so fall back to File.read.
  html = io.respond_to?(:read) ? io.read : File.read(io)
  parse(html, base_url: base_url)
end

.schema(&block) ⇒ `Object`

Extraction DSL.



146
147
148

# File 'lib/scrapetor.rb', line 146

# Extraction DSL entry point: passes the given block to Schema.build and
# returns whatever that call returns.
def self.schema(&block)
  Schema.build(&block)
end

.stream(source, outer:, fields: nil, chunk_size: Stream::DEFAULT_CHUNK, &block) ⇒ `Object`

# File 'lib/scrapetor/stream.rb', line 107

# Streams over +source+ with a Stream, yielding through Stream#each.
#
# @param source an object responding to +read+, or a value that is wrapped
#   in a StringIO
# @param outer passed to Stream.new as +outer:+
# @param fields passed to Stream.new as +fields:+
# @param chunk_size passed to Stream.new as +chunk_size:+
# @yield forwarded to Stream#each
# @return whatever Stream#each returns
def self.stream(source, outer:, fields: nil, chunk_size: Stream::DEFAULT_CHUNK, &block)
  reader =
    if source.respond_to?(:read)
      source
    else
      StringIO.new(source)
    end
  streamer = Stream.new(reader, outer: outer, fields: fields, chunk_size: chunk_size)
  streamer.each(&block)
end

Module: Scrapetor

Defined Under Namespace

Constant Summary collapse

Class Method Summary collapse

Class Method Details

.default_parallel_threads(n_items) ⇒ Object

.each_page(start_url, **opts, &block) ⇒ Object

.extract(html, schema = nil, base_url: nil, &block) ⇒ Object

.extract_file(path, schema, base_url: nil) ⇒ Object

.extract_native(html, schema, base_url: nil) ⇒ Object

.extract_ruby(html, schema, base_url: nil) ⇒ Object

.fetch(url, **opts) ⇒ Object

.fetch_extract(url, schema, **opts) ⇒ Object

.fetch_http2(url, **opts) ⇒ Object

.HTML(html, base_url = nil) ⇒ Object

.HTML5(*args, &block) ⇒ Object

.parallel_parse(htmls, threads: nil) ⇒ Object

.parse(html, base_url: nil, build_indexes: false) ⇒ Object

.parse_file(path, base_url: nil) ⇒ Object

.parse_fragment(html, base_url: nil) ⇒ Object

.parse_html(html, base_url: nil) ⇒ Object

.parse_io(io, base_url: nil) ⇒ Object

.schema(&block) ⇒ Object

.stream(source, outer:, fields: nil, chunk_size: Stream::DEFAULT_CHUNK, &block) ⇒ Object