Module: Scrapetor
- Defined in:
- lib/scrapetor.rb,
lib/scrapetor/dom.rb,
lib/scrapetor/sax.rb,
lib/scrapetor/url.rb,
lib/scrapetor/form.rb,
lib/scrapetor/http.rb,
lib/scrapetor/node.rb,
lib/scrapetor/money.rb,
lib/scrapetor/xpath.rb,
lib/scrapetor/errors.rb,
lib/scrapetor/native.rb,
lib/scrapetor/robots.rb,
lib/scrapetor/schema.rb,
lib/scrapetor/stream.rb,
lib/scrapetor/builder.rb,
lib/scrapetor/cleaner.rb,
lib/scrapetor/fetcher.rb,
lib/scrapetor/session.rb,
lib/scrapetor/sitemap.rb,
lib/scrapetor/version.rb,
lib/scrapetor/document.rb,
lib/scrapetor/encoding.rb,
lib/scrapetor/entities.rb,
lib/scrapetor/node_set.rb,
lib/scrapetor/selector.rb,
lib/scrapetor/extractor.rb,
lib/scrapetor/microdata.rb,
lib/scrapetor/page_type.rb,
lib/scrapetor/text_node.rb,
lib/scrapetor/dom/parser.rb,
lib/scrapetor/native_dom.rb,
lib/scrapetor/pagination.rb,
lib/scrapetor/fingerprint.rb,
lib/scrapetor/comment_node.rb,
lib/scrapetor/dom/selectors.rb,
lib/scrapetor/structured_data.rb,
lib/scrapetor/persistent_cache.rb,
lib/scrapetor/template_registry.rb,
ext/scrapetor/native/scrapetor_native.c
Defined Under Namespace
Modules: Cleaner, Dom, Encoding, Entities, Extractor, Fetcher, Fingerprint, HTML, HTML5, HTTP, Microdata, Money, Native, PageType, Pagination, PersistentCache, RDFa, SAX, Selector, Sitemap, StructuredData, URL, XPath
Classes: Builder, CommentNode, Document, Error, ExtractionError, Form, Node, NodeSet, Robots, Schema, SchemaError, Session, Stream, TemplateRegistry, TextNode
Constant Summary collapse
- VERSION =
"0.2.0"
Class Method Summary collapse
- .default_parallel_threads(n_items) ⇒ Object
-
.each_page(start_url, **opts, &block) ⇒ Object
Top-level shorthand for Pagination.each_page.
- .extract(html, schema = nil, base_url: nil, &block) ⇒ Object
-
.extract_file(path, schema, base_url: nil) ⇒ Object
Run an extraction schema directly against a file or IO.
-
.extract_native(html, schema, base_url: nil) ⇒ Object
Force the native streaming path.
-
.extract_ruby(html, schema, base_url: nil) ⇒ Object
Force the Ruby reference path.
-
.fetch(url, **opts) ⇒ Object
Module-level shortcut for HTTP.fetch.
- .fetch_extract(url, schema, **opts) ⇒ Object
-
.fetch_http2(url, **opts) ⇒ Object
Top-level shorthand for the libcurl path.
-
.HTML(html, base_url = nil) ⇒ Object
`Scrapetor::HTML(html)` — capital-H convenience method.
-
.HTML5(*args, &block) ⇒ Object
`Scrapetor::HTML5(html)` — same parser, alternate name.
-
.parallel_parse(htmls, threads: nil) ⇒ Object
Parse N documents in parallel via native pthread workers, releasing the GVL for the duration.
-
.parse(html, base_url: nil, build_indexes: false) ⇒ Object
Parsing entry point: parses an HTML string into a Document.
- .parse_file(path, base_url: nil) ⇒ Object
- .parse_fragment(html, base_url: nil) ⇒ Object
- .parse_html(html, base_url: nil) ⇒ Object
-
.parse_io(io, base_url: nil) ⇒ Object
Parse from an arbitrary IO-like (responds to `read`) or a file path.
-
.schema(&block) ⇒ Object
Extraction DSL: builds a Schema from the given block.
- .stream(source, outer:, fields: nil, chunk_size: Stream::DEFAULT_CHUNK, &block) ⇒ Object
Class Method Details
.default_parallel_threads(n_items) ⇒ Object
103 104 105 106 107 108 109 110 111 |
# File 'lib/scrapetor.rb', line 103
# Picks a default worker count for a batch of +n_items+ documents: the
# smaller of the item count and the processor count reported by
# Etc.nprocessors.
#
# When the processor count cannot be determined, 4 is assumed. LoadError
# (the `etc` library cannot be required) and NotImplementedError both
# descend from ScriptError rather than StandardError, so they are listed
# explicitly; a bare `rescue StandardError` would let them propagate and
# the fallback would never be used.
#
# @param n_items [Integer] number of documents to be processed
# @return [Integer] the smaller of +n_items+ and the processor count (or 4)
def self.default_parallel_threads(n_items)
  cpu = begin
    require "etc" # loaded lazily: only needed when no thread count was given
    Etc.nprocessors
  rescue LoadError, NotImplementedError, StandardError
    4
  end
  [n_items, cpu].min
end
.each_page(start_url, **opts, &block) ⇒ Object
Top-level shorthand for Pagination.each_page.
106 107 108 |
# File 'lib/scrapetor/pagination.rb', line 106
# Module-level convenience wrapper around Pagination.each_page.
#
# @param start_url [Object] handed to Pagination.each_page unchanged
# @param opts [Hash] keyword options, forwarded verbatim
# @yield the block is passed straight through to Pagination.each_page
# @return [Object] whatever Pagination.each_page returns
def self.each_page(start_url, **opts, &block)
  Pagination.each_page(
    start_url,
    **opts,
    &block
  )
end
.extract(html, schema = nil, base_url: nil, &block) ⇒ Object
150 151 152 |
# File 'lib/scrapetor.rb', line 150
# Parses +html+ and runs an extraction against the resulting document in
# one call.
#
# @param html [Object] markup, passed to Scrapetor.parse
# @param schema [Object, nil] forwarded to the parsed document's #extract
# @param base_url [Object, nil] forwarded to Scrapetor.parse
# @yield forwarded to the parsed document's #extract
# @return [Object] whatever the document's #extract returns
def self.extract(html, schema = nil, base_url: nil, &block)
  document = parse(html, base_url: base_url)
  document.extract(schema, &block)
end
.extract_file(path, schema, base_url: nil) ⇒ Object
Run an extraction schema directly against a file or IO.
114 115 116 |
# File 'lib/scrapetor.rb', line 114
# Runs an extraction schema directly against a file or IO.
#
# The source is duck-typed: anything responding to +read+ (an IO, StringIO,
# Pathname, ...) is read via that method; anything else is treated as a
# filesystem path and read with File.read. Previously only the path form
# worked, although the method is documented as accepting an IO as well.
#
# @param path [String, #read] filesystem path or readable object
# @param schema [Object] forwarded to Scrapetor.extract
# @param base_url [Object, nil] forwarded to Scrapetor.extract
# @return [Object] whatever Scrapetor.extract returns
def self.extract_file(path, schema, base_url: nil)
  html = path.respond_to?(:read) ? path.read : File.read(path)
  extract(html, schema, base_url: base_url)
end
.extract_native(html, schema, base_url: nil) ⇒ Object
Force the native streaming path. Raises if the schema can’t compile.
155 156 157 158 159 160 |
# File 'lib/scrapetor.rb', line 155
# Forces extraction through the native path, with no fallback.
#
# @param html [#to_s] markup; converted with #to_s before being handed over
# @param schema [Object] compiled via Native.compile_descriptor
# @param base_url [Object, nil] passed positionally to Native.extract
# @return [Object] whatever Native.extract returns
# @raise [Error] when Native.available? is falsy, or when the schema does
#   not compile to a descriptor
def self.extract_native(html, schema, base_url: nil)
  raise Error, "native extension not loaded" unless Native.available?

  descriptor = Native.compile_descriptor(schema)
  if descriptor
    Native.extract(html.to_s, descriptor, base_url)
  else
    raise Error, "schema not native-compilable"
  end
end
.extract_ruby(html, schema, base_url: nil) ⇒ Object
Force the Ruby reference path. Useful for parity tests + benchmarks.
163 164 165 166 |
# File 'lib/scrapetor.rb', line 163
# Forces extraction through Extractor.run on a freshly parsed document
# (the Ruby reference path, per the accompanying documentation).
#
# @param html [Object] markup, passed to Scrapetor.parse
# @param schema [Object] forwarded to Extractor.run
# @param base_url [Object, nil] forwarded to Scrapetor.parse
# @return [Object] whatever Extractor.run returns
def self.extract_ruby(html, schema, base_url: nil)
  document = parse(html, base_url: base_url)
  backing = document.backing
  Extractor.run(document, backing, schema)
end
.fetch(url, **opts) ⇒ Object
Module-level shortcut for HTTP.fetch. Most users only want this.
107 108 109 |
# File 'lib/scrapetor/http.rb', line 107
# Module-level shortcut that delegates to HTTP.fetch.
#
# @param url [Object] handed to HTTP.fetch unchanged
# @param opts [Hash] keyword options, forwarded verbatim
# @return [Object] whatever HTTP.fetch returns
def self.fetch(url, **opts)
  HTTP.fetch(
    url,
    **opts
  )
end
.fetch_extract(url, schema, **opts) ⇒ Object
111 112 113 |
# File 'lib/scrapetor/http.rb', line 111
# Module-level shortcut that delegates to HTTP.fetch_extract.
#
# @param url [Object] handed to HTTP.fetch_extract unchanged
# @param schema [Object] handed to HTTP.fetch_extract unchanged
# @param opts [Hash] keyword options, forwarded verbatim
# @return [Object] whatever HTTP.fetch_extract returns
def self.fetch_extract(url, schema, **opts)
  HTTP.fetch_extract(
    url,
    schema,
    **opts
  )
end
.fetch_http2(url, **opts) ⇒ Object
Top-level shorthand for the libcurl path. Distinct from Scrapetor.fetch (Net::HTTP) so callers can opt-in to HTTP/2 + connection reuse where it’s actually available.
387 388 389 |
# File 'lib/scrapetor/fetcher.rb', line 387
# Top-level shorthand that delegates to Fetcher.fetch. Kept separate from
# Scrapetor.fetch so callers opt in to this path explicitly.
#
# @param url [Object] handed to Fetcher.fetch unchanged
# @param opts [Hash] keyword options, forwarded verbatim
# @return [Object] whatever Fetcher.fetch returns
def self.fetch_http2(url, **opts)
  Fetcher.fetch(
    url,
    **opts
  )
end
.HTML(html, base_url = nil) ⇒ Object
`Scrapetor::HTML(html)` — capital-H convenience method.
61 62 63 |
# File 'lib/scrapetor.rb', line 61
# Capital-H convenience entry point: `Scrapetor::HTML(html)`.
#
# Unlike Scrapetor.parse, the base URL is an optional positional argument
# here; it is forwarded to parse as the +base_url:+ keyword.
#
# @param html [Object] markup, passed to Scrapetor.parse
# @param base_url [Object, nil] forwarded as +base_url:+
# @return [Object] whatever Scrapetor.parse returns
def self.HTML(html, base_url = nil)
  parse_options = { base_url: base_url }
  parse(html, **parse_options)
end
.HTML5(*args, &block) ⇒ Object
`Scrapetor::HTML5(html)` — same parser, alternate name.
119 120 121 |
# File 'lib/scrapetor.rb', line 119
# `Scrapetor::HTML5(html)` — alternate name for Scrapetor.parse.
#
# Keyword arguments are captured and forwarded explicitly. With only
# `*args`, Ruby 3 collects keywords such as +base_url:+ into a trailing
# positional Hash, and re-splatting that into parse (which takes a single
# positional argument) raises ArgumentError.
#
# @param args [Array] positional arguments, forwarded to Scrapetor.parse
# @param opts [Hash] keyword arguments, forwarded to Scrapetor.parse
# @yield forwarded to Scrapetor.parse
# @return [Object] whatever Scrapetor.parse returns
def self.HTML5(*args, **opts, &block)
  parse(*args, **opts, &block)
end
.parallel_parse(htmls, threads: nil) ⇒ Object
Parse N documents in parallel via native pthread workers, releasing the GVL for the duration. Returns Array<Scrapetor::Document> in the same order as the input. Skips the in-memory parse cache (which is GVL-bound); use single-document Scrapetor.parse for cache-friendly workloads.
Use this for batch jobs over distinct documents where parsing dominates: pre-warming a fixture corpus, indexing a crawl, A/B comparing parsed shapes. Falls through to a serial parse when only one document is provided.
92 93 94 95 96 97 98 99 100 101 |
# File 'lib/scrapetor.rb', line 92
# Parses several documents in one batch via Native::Document.parallel_parse
# and wraps each native result in a Document, preserving input order.
#
# The input is normalised with Array(); an empty batch returns [] and a
# single-element batch goes through the ordinary Scrapetor.parse instead of
# the native batch call.
#
# @param htmls [Array, Object] documents to parse (coerced with Array())
# @param threads [Integer, nil] worker count; when nil (or false),
#   default_parallel_threads(htmls.size) is used
# @return [Array] one parsed document per input, in input order
def self.parallel_parse(htmls, threads: nil)
  htmls = Array(htmls)

  case htmls.size
  when 0 then return []
  when 1 then return [parse(htmls.first)]
  end

  worker_count = threads || default_parallel_threads(htmls.size)
  native_docs = Native::Document.parallel_parse(htmls, worker_count)

  # Pair each native result with its source markup by position.
  native_docs.map.with_index do |native_doc, index|
    Document.new(htmls[index], native: native_doc)
  end
end
.parse(html, base_url: nil, build_indexes: false) ⇒ Object
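Parsing entry point. Parses `html` into a Document; when PersistentCache is enabled and `html` is a non-empty String, a cached native parse is reused if present and a fresh parse is stored afterwards.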
44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 |
# File 'lib/scrapetor.rb', line 44
# Primary parsing entry point: builds a Document from +html+.
#
# When PersistentCache is enabled and +html+ is a non-empty String, the
# cache is consulted first and a hit is passed to Document.new as
# +native:+. On a miss the document is built normally and its native
# backing is then offered to the cache; any StandardError raised while
# storing is swallowed, so caching stays best-effort.
#
# @param html [Object] markup to parse
# @param base_url [Object, nil] forwarded to Document.new
# @param build_indexes [Boolean] forwarded to Document.new
# @return [Document] the parsed document
def self.parse(html, base_url: nil, build_indexes: false)
  # Evaluated on demand (before lookup and again before store), exactly as
  # the two inline checks it replaces.
  cacheable = -> { PersistentCache.enabled? && html.is_a?(String) && !html.empty? }

  if cacheable.call
    hit = PersistentCache.load(html)
    return Document.new(html, base_url: base_url, build_indexes: build_indexes, native: hit) if hit
  end

  fresh = Document.new(html, base_url: base_url, build_indexes: build_indexes)
  if cacheable.call
    begin
      PersistentCache.store(html, fresh.backing.native)
    rescue StandardError
      nil # a failed cache write must not fail the parse
    end
  end
  fresh
end
.parse_file(path, base_url: nil) ⇒ Object
78 79 80 |
# File 'lib/scrapetor.rb', line 78
# Reads the file at +path+ with File.read and parses its contents.
#
# @param path [String] filesystem path, passed to File.read
# @param base_url [Object, nil] forwarded to Scrapetor.parse
# @return [Object] whatever Scrapetor.parse returns
def self.parse_file(path, base_url: nil)
  contents = File.read(path)
  parse(contents, base_url: base_url)
end
.parse_fragment(html, base_url: nil) ⇒ Object
69 70 71 |
# File 'lib/scrapetor.rb', line 69
# Alternate name for Scrapetor.parse; as written it forwards to parse with
# no fragment-specific handling.
#
# @param html [Object] markup, passed to Scrapetor.parse
# @param base_url [Object, nil] forwarded to Scrapetor.parse
# @return [Object] whatever Scrapetor.parse returns
def self.parse_fragment(html, base_url: nil)
  parse(
    html,
    base_url: base_url
  )
end
.parse_html(html, base_url: nil) ⇒ Object
65 66 67 |
# File 'lib/scrapetor.rb', line 65
# Alternate name for Scrapetor.parse; forwards +html+ and +base_url+
# unchanged.
#
# @param html [Object] markup, passed to Scrapetor.parse
# @param base_url [Object, nil] forwarded to Scrapetor.parse
# @return [Object] whatever Scrapetor.parse returns
def self.parse_html(html, base_url: nil)
  parse(
    html,
    base_url: base_url
  )
end
.parse_io(io, base_url: nil) ⇒ Object
Parse from an arbitrary IO-like (responds to `read`) or a file path.
74 75 76 |
# File 'lib/scrapetor.rb', line 74
# Parses from an arbitrary IO-like object (responds to +read+) or a file
# path.
#
# The source is duck-typed: when it responds to +read+ its contents are
# taken from that method; otherwise it is treated as a filesystem path and
# read with File.read. Previously a String path raised NoMethodError,
# although the method is documented as accepting one.
#
# @param io [#read, String] readable object or filesystem path
# @param base_url [Object, nil] forwarded to Scrapetor.parse
# @return [Object] whatever Scrapetor.parse returns
def self.parse_io(io, base_url: nil)
  html = io.respond_to?(:read) ? io.read : File.read(io)
  parse(html, base_url: base_url)
end
.schema(&block) ⇒ Object
Extraction DSL: builds a Schema from the given block via Schema.build.
146 147 148 |
# File 'lib/scrapetor.rb', line 146
# Entry point of the extraction DSL: hands the block to Schema.build.
#
# @yield forwarded to Schema.build
# @return [Object] whatever Schema.build returns
def self.schema(&block)
  built = Schema.build(&block)
  built
end
.stream(source, outer:, fields: nil, chunk_size: Stream::DEFAULT_CHUNK, &block) ⇒ Object
107 108 109 110 |
# File 'lib/scrapetor/stream.rb', line 107
# Module-level shorthand that builds a Stream over +source+ and iterates
# it with the given block.
#
# A source that responds to +read+ is used as-is; anything else is wrapped
# in a StringIO.
#
# @param source [#read, String] readable object, or a value to wrap in StringIO
# @param outer [Object] forwarded to Stream.new as +outer:+
# @param fields [Object, nil] forwarded to Stream.new as +fields:+
# @param chunk_size [Object] forwarded to Stream.new as +chunk_size:+
# @yield forwarded to Stream#each
# @return [Object] whatever Stream#each returns
def self.stream(source, outer:, fields: nil, chunk_size: Stream::DEFAULT_CHUNK, &block)
  io =
    if source.respond_to?(:read)
      source
    else
      StringIO.new(source)
    end

  reader = Stream.new(io, outer: outer, fields: fields, chunk_size: chunk_size)
  reader.each(&block)
end