Module: Scrapetor::Extractor
- Defined in:
- lib/scrapetor/extractor.rb
Overview
Schema execution. The hot path operates on raw Nokolexbor nodes and inlines coercion — no Scrapetor::Node allocations per emitted field.
Class Method Summary
- .bool_coerce(v) ⇒ Object
- .coerce(raw, f, base_url, _node) ⇒ Object
- .date_coerce(v) ⇒ Object
- .extract_field(scope, f, base_url) ⇒ Object
- .extract_multi(scope, selectors, f, base_url) ⇒ Object
- .extract_raw(node, f) ⇒ Object
- .extract_single(scope, selectors, f, base_url) ⇒ Object
- .float_coerce(v) ⇒ Object
- .int_coerce(v) ⇒ Object
- .json_coerce(v) ⇒ Object
- .list_coerce(v, delimiter) ⇒ Object
- .run(doc, scope, schema) ⇒ Object
- .run_group(scope, group, base_url) ⇒ Object
Class Method Details
.bool_coerce(v) ⇒ Object
135 136 137 138 139 140 141 |
# File 'lib/scrapetor/extractor.rb', line 135
# Maps a scraped value onto true / false / nil.
#
# The value is stringified, stripped and downcased, then looked up in
# TRUTHY_STRINGS and FALSY_STRINGS (in that order). "yes" is additionally
# accepted as true.
#
# @param v [#to_s] raw value
# @return [Boolean, nil] nil when the token is not recognised
def self.bool_coerce(v)
  token = v.to_s.strip.downcase

  if TRUTHY_STRINGS.include?(token)
    true
  elsif FALSY_STRINGS.include?(token)
    false
  elsif token == "yes"
    true
  end
end
.coerce(raw, f, base_url, _node) ⇒ Object
89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 |
# File 'lib/scrapetor/extractor.rb', line 89
# Converts a raw extracted value into the type declared on the field.
#
# @param raw [Object, nil] value read from the node
# @param f [Object] field definition; #clean, #type, #normalize_url and
#   #delimiter are read here
# @param base_url [Object, nil] handed to URL.absolute for :url fields
# @param _node [Object] unused
# @return [Object, nil] nil when raw is nil, otherwise the coerced value
def self.coerce(raw, f, base_url, _node)
  return nil if raw.nil?

  value = f.clean ? Cleaner.clean(raw) : raw

  case f.type
  when :integer then int_coerce(value)
  when :float   then float_coerce(value)
  when :boolean then bool_coerce(value)
  when :date    then date_coerce(value)
  when :json    then json_coerce(value)
  when :money   then Money.parse(value)
  when :list    then list_coerce(value, f.delimiter)
  when :url
    f.normalize_url ? URL.absolute(value, base_url) : value
  else
    # :text, :html (the raw value is already inner_html) and any other
    # type are returned as-is.
    value
  end
end
.date_coerce(v) ⇒ Object
118 119 120 121 122 123 |
# File 'lib/scrapetor/extractor.rb', line 118
# Parses a scraped value into a Date via Date.parse.
#
# @param v [#to_s] raw value
# @return [Date, nil] nil when parsing raises ArgumentError or TypeError
def self.date_coerce(v)
  # Loaded on demand, inside the method.
  require "date"

  text = v.to_s
  ::Date.parse(text)
rescue ::ArgumentError, ::TypeError
  nil
end
.extract_field(scope, f, base_url) ⇒ Object
32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 |
# File 'lib/scrapetor/extractor.rb', line 32 def self.extract_field(scope, f, base_url) selectors = f.selector.is_a?(Array) ? f.selector : [f.selector] value = if f.multi extract_multi(scope, selectors, f, base_url) else extract_single(scope, selectors, f, base_url) end # default + required missing = value.nil? || (f.multi && value.empty?) value = f.default if missing && !f.default.nil? if f.required && (value.nil? || (f.multi && value.respond_to?(:empty?) && value.empty?)) raise ExtractionError, "required field `#{f.name}` not found" end # transform last (after coerce + default) value = f.transform.call(value) if f.transform && !value.nil? value end |
.extract_multi(scope, selectors, f, base_url) ⇒ Object
65 66 67 68 69 70 71 72 73 74 75 76 77 78 |
# File 'lib/scrapetor/extractor.rb', line 65
# Collects every non-nil coerced value for the first selector that yields any.
#
# Selectors are tried in order; a nil selector means "use the scope itself".
#
# @param scope [Object] node to search within (must respond to #css)
# @param selectors [Array] selectors to try, in priority order
# @param f [Object] field definition, forwarded to extract_raw / coerce
# @param base_url [Object, nil] forwarded to coerce
# @return [Array] coerced values; empty when no selector produced a value
def self.extract_multi(scope, selectors, f, base_url)
  selectors.each do |sel|
    matches = sel ? scope.css(sel) : [scope]
    collected = []

    matches.each do |node|
      raw = extract_raw(node, f)
      coerced = raw.nil? ? nil : coerce(raw, f, base_url, node)
      collected << coerced unless coerced.nil?
    end

    # The first selector that produced at least one value wins.
    return collected unless collected.empty?
  end

  []
end
.extract_raw(node, f) ⇒ Object
80 81 82 83 84 85 86 87 |
# File 'lib/scrapetor/extractor.rb', line 80
# Reads the raw, uncoerced value for a field from a node.
#
# @param node [Object] node responding to #[], #text and #inner_html
# @param f [Object] field definition; #attr_str and #type are read here
# @return [Object, nil] the attribute value when an attribute is configured,
#   the inner HTML for :html fields with no attribute set, otherwise the text
def self.extract_raw(node, f)
  attribute = f.attr_str
  return node[attribute] if attribute

  # inner_html applies only when the attribute is nil (not merely falsy).
  attribute.nil? && f.type == :html ? node.inner_html : node.text
end
.extract_single(scope, selectors, f, base_url) ⇒ Object
54 55 56 57 58 59 60 61 62 63 |
# File 'lib/scrapetor/extractor.rb', line 54
# Returns the coerced value for the first selector that yields a raw value.
#
# Selectors are tried in order; a nil selector means "use the scope itself".
# Once a selector produces a non-nil raw value its coerced result is returned,
# even if coercion yields nil.
#
# @param scope [Object] node to search within (must respond to #at_css)
# @param selectors [Array] selectors to try, in priority order
# @param f [Object] field definition, forwarded to extract_raw / coerce
# @param base_url [Object, nil] forwarded to coerce
# @return [Object, nil] coerced value, or nil when nothing matched
def self.extract_single(scope, selectors, f, base_url)
  selectors.each do |sel|
    target = sel ? scope.at_css(sel) : scope
    raw = target.nil? ? nil : extract_raw(target, f)
    return coerce(raw, f, base_url, target) unless raw.nil?
  end

  nil
end
.float_coerce(v) ⇒ Object
113 114 115 116 |
# File 'lib/scrapetor/extractor.rb', line 113
# Coerces a scraped value to a Float.
#
# Every character other than digits, "." and "-" is removed before parsing,
# so currency symbols, thousands separators and surrounding words are ignored.
#
# @param v [#to_s] raw value
# @return [Float, nil] the parsed number, or nil when no digit remains
def self.float_coerce(v)
  stripped = v.to_s.gsub(/[^\d.\-]/, "")

  # Require at least one digit. Punctuation-only leftovers such as ".."
  # (from "n.a.") or "--" would otherwise be turned into 0.0 by String#to_f
  # and be mistaken for a real value.
  stripped.match?(/\d/) ? stripped.to_f : nil
end
.int_coerce(v) ⇒ Object
108 109 110 111 |
# File 'lib/scrapetor/extractor.rb', line 108
# Coerces a scraped value to an Integer.
#
# Every character other than digits and "-" is removed before parsing, so
# thousands separators and surrounding words are ignored.
#
# @param v [#to_s] raw value
# @return [Integer, nil] the parsed number, or nil when no digit remains
def self.int_coerce(v)
  stripped = v.to_s.gsub(/[^\d\-]/, "")

  # Require at least one digit. Dash-only leftovers such as "--" would
  # otherwise be turned into 0 by String#to_i and be mistaken for a real value.
  stripped.match?(/\d/) ? stripped.to_i : nil
end
.json_coerce(v) ⇒ Object
125 126 127 128 129 |
# File 'lib/scrapetor/extractor.rb', line 125
# Parses a scraped value as JSON.
#
# @param v [#to_s] raw value
# @return [Object, nil] the parsed structure, or nil when JSON.parse raises
#   JSON::ParserError
def self.json_coerce(v)
  source = v.to_s
  JSON.parse(source)
rescue JSON::ParserError
  nil
end
.list_coerce(v, delimiter) ⇒ Object
143 144 145 |
# File 'lib/scrapetor/extractor.rb', line 143
# Splits a scraped value into a list of trimmed, non-empty strings.
#
# @param v [#to_s] raw value
# @param delimiter [String, Regexp, nil] passed straight to String#split
# @return [Array<String>] stripped items with empty entries removed
def self.list_coerce(v, delimiter)
  pieces = v.to_s.split(delimiter)

  pieces.each_with_object([]) do |piece, kept|
    trimmed = piece.strip
    kept << trimmed unless trimmed.empty?
  end
end
.run(doc, scope, schema) ⇒ Object
9 10 11 12 13 14 15 16 17 18 19 |
# File 'lib/scrapetor/extractor.rb', line 9
# Executes a schema against a scope and returns the extracted record.
#
# Top-level fields are extracted first, then groups; each result is stored
# under the field's or group's name.
#
# @param doc [Object] document; its #base_url is used when it responds to it
# @param scope [Object] node to search within
# @param schema [Object] schema exposing #fields and #groups
# @return [Hash] name => extracted value
def self.run(doc, scope, schema)
  base_url = doc.base_url if doc.respond_to?(:base_url)

  record = {}
  schema.fields.each do |field|
    record[field.name] = extract_field(scope, field, base_url)
  end
  schema.groups.each do |group|
    record[group.name] = run_group(scope, group, base_url)
  end
  record
end
.run_group(scope, group, base_url) ⇒ Object
21 22 23 24 25 26 27 28 29 30 |
# File 'lib/scrapetor/extractor.rb', line 21
# Extracts one record per node matched by the group's selector.
#
# For each matched node the group's fields are extracted first, then its
# nested groups (recursively), all scoped to that node.
#
# @param scope [Object] node to search within (must respond to #css)
# @param group [Object] group definition exposing #selector, #fields, #groups
# @param base_url [Object, nil] forwarded to extract_field
# @return [Array<Hash>] one name => value hash per matched node
def self.run_group(scope, group, base_url)
  rows = []

  scope.css(group.selector).each do |element|
    row = {}
    group.fields.each do |field|
      row[field.name] = extract_field(element, field, base_url)
    end
    group.groups.each do |nested|
      row[nested.name] = run_group(element, nested, base_url)
    end
    rows.push(row)
  end

  rows
end