Module: Scrapetor::Extractor

Defined in:
lib/scrapetor/extractor.rb

Overview

Schema execution. The hot path operates on raw Nokolexbor nodes and inlines coercion — no Scrapetor::Node allocations per emitted field.

Class Method Summary

Class Method Details

.bool_coerce(v) ⇒ Object



135
136
137
138
139
140
141
# File 'lib/scrapetor/extractor.rb', line 135

# Interprets a scraped value as a boolean.
#
# The value is stringified, stripped of surrounding whitespace and downcased
# before being matched.
#
# @param v [Object] raw value; converted with +to_s+
# @return [true, false, nil] +true+ for a member of TRUTHY_STRINGS or the
#   literal "yes", +false+ for a member of FALSY_STRINGS, +nil+ otherwise
def self.bool_coerce(v)
  token = v.to_s.strip.downcase
  if TRUTHY_STRINGS.include?(token)
    true
  elsif FALSY_STRINGS.include?(token)
    false
  elsif token == "yes"
    true
  end
end

.coerce(raw, f, base_url, _node) ⇒ Object



89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# File 'lib/scrapetor/extractor.rb', line 89

# Converts a raw extracted string into the type declared by the field.
#
# @param raw [String, nil] value pulled from the node
# @param f [#clean, #type, #normalize_url, #delimiter] field definition
# @param base_url [String, nil] base handed to URL.absolute for :url fields
# @param _node [Object] source node (unused)
# @return [Object, nil] coerced value; +nil+ when +raw+ is +nil+
def self.coerce(raw, f, base_url, _node)
  return nil if raw.nil?

  value = f.clean ? Cleaner.clean(raw) : raw

  case f.type
  when :integer then int_coerce(value)
  when :float   then float_coerce(value)
  when :money   then Money.parse(value)
  when :url
    f.normalize_url ? URL.absolute(value, base_url) : value
  when :date    then date_coerce(value)
  when :json    then json_coerce(value)
  when :boolean then bool_coerce(value)
  when :list    then list_coerce(value, f.delimiter)
  else
    # :text, :html (already inner_html) and any unrecognised type pass through
    value
  end
end

.date_coerce(v) ⇒ Object



118
119
120
121
122
123
# File 'lib/scrapetor/extractor.rb', line 118

# Parses a scraped value into a Date.
#
# The "date" library is required on first use rather than at file load.
#
# @param v [Object] raw value; converted with +to_s+
# @return [Date, nil] parsed date, or +nil+ when parsing raises
#   ArgumentError or TypeError
def self.date_coerce(v)
  begin
    require "date"
    text = v.to_s
    ::Date.parse(text)
  rescue ::ArgumentError, ::TypeError
    nil
  end
end

.extract_field(scope, f, base_url) ⇒ Object



32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
# File 'lib/scrapetor/extractor.rb', line 32

# Extracts one schema field from +scope+, then applies default, required
# check and transform, in that order.
#
# @param scope [Object] node the field's selectors are evaluated against
# @param f [Object] field definition (reads selector, multi, default,
#   required, name and transform)
# @param base_url [String, nil] forwarded to the single/multi extractors
# @return [Object, nil] extracted (and possibly defaulted/transformed) value
# @raise [ExtractionError] when the field is required and still missing
#   after the default has been considered
def self.extract_field(scope, f, base_url)
  selector_list = f.selector
  selector_list = [selector_list] unless selector_list.is_a?(Array)

  result =
    if f.multi
      extract_multi(scope, selector_list, f, base_url)
    else
      extract_single(scope, selector_list, f, base_url)
    end

  # A missing value (nil, or an empty list for multi fields) falls back to
  # the default when one is configured.
  if result.nil? || (f.multi && result.empty?)
    fallback = f.default
    result = fallback unless fallback.nil?
  end

  # The required check runs after defaulting, so a default satisfies it.
  if f.required
    still_missing = result.nil? || (f.multi && result.respond_to?(:empty?) && result.empty?)
    raise ExtractionError, "required field `#{f.name}` not found" if still_missing
  end

  # The transform runs last and never sees nil.
  transform = f.transform
  return result unless transform && !result.nil?

  transform.call(result)
end

.extract_multi(scope, selectors, f, base_url) ⇒ Object



65
66
67
68
69
70
71
72
73
74
75
76
77
78
# File 'lib/scrapetor/extractor.rb', line 65

# Collects every coerced value for a multi-valued field.
#
# Selectors are tried in order; the first one that yields at least one
# non-nil value wins and later selectors are not consulted. A nil selector
# means "use +scope+ itself".
#
# @param scope [#css] node to search within
# @param selectors [Array<String, nil>] candidate selectors, in priority order
# @param f [Object] field definition passed to extract_raw and coerce
# @param base_url [String, nil] forwarded to coerce
# @return [Array] coerced values (possibly empty); nil results are dropped
def self.extract_multi(scope, selectors, f, base_url)
  selectors.each do |selector|
    candidates = selector ? scope.css(selector) : [scope]
    collected = []
    candidates.each do |node|
      raw = extract_raw(node, f)
      next if raw.nil?

      coerced = coerce(raw, f, base_url, node)
      # Only nil is discarded; false is a legitimate coerced value.
      collected.push(coerced) unless coerced.nil?
    end
    return collected unless collected.empty?
  end
  []
end

.extract_raw(node, f) ⇒ Object



80
81
82
83
84
85
86
87
# File 'lib/scrapetor/extractor.rb', line 80

# Pulls the raw string for a field out of a node.
#
# @param node [Object] node responding to +[]+, +text+ and +inner_html+
# @param f [#type, #attr_str] field definition
# @return [String, nil] the named attribute when +attr_str+ is set, the
#   node's inner HTML for :html fields without an attribute, otherwise the
#   node's text
def self.extract_raw(node, f)
  attribute = f.attr_str
  return node[attribute] if attribute

  wants_markup = f.type == :html && attribute.nil?
  wants_markup ? node.inner_html : node.text
end

.extract_single(scope, selectors, f, base_url) ⇒ Object



54
55
56
57
58
59
60
61
62
63
# File 'lib/scrapetor/extractor.rb', line 54

# Extracts a single-valued field using the first selector that produces a
# raw value. A nil selector means "use +scope+ itself".
#
# @param scope [#at_css] node to search within
# @param selectors [Array<String, nil>] candidate selectors, in priority order
# @param f [Object] field definition passed to extract_raw and coerce
# @param base_url [String, nil] forwarded to coerce
# @return [Object, nil] coerced value of the first hit, or +nil+ when no
#   selector yields a raw value
def self.extract_single(scope, selectors, f, base_url)
  selectors.each do |selector|
    target = selector ? scope.at_css(selector) : scope
    raw = target.nil? ? nil : extract_raw(target, f)
    return coerce(raw, f, base_url, target) unless raw.nil?
  end
  nil
end

.float_coerce(v) ⇒ Object



113
114
115
116
# File 'lib/scrapetor/extractor.rb', line 113

# Coerces a scraped value to a Float, ignoring surrounding text and symbols.
#
# Every character other than ASCII digits, "." and "-" is stripped before
# conversion, so "$1,234.50" becomes 1234.5.
#
# @param v [Object] raw value; converted with +to_s+
# @return [Float, nil] parsed number, or +nil+ when the value contains no digit
def self.float_coerce(v)
  s = v.to_s.gsub(/[^\d.\-]/, "")
  # Without a digit there is nothing to parse. String#to_f would silently
  # return 0.0 for leftovers such as "...", "--" or "-.", which would turn
  # non-numeric text like "Loading..." into a bogus zero.
  s.match?(/\d/) ? s.to_f : nil
end

.int_coerce(v) ⇒ Object



108
109
110
111
# File 'lib/scrapetor/extractor.rb', line 108

# Coerces a scraped value to an Integer, ignoring surrounding text and symbols.
#
# Every character other than ASCII digits and "-" is stripped before
# conversion, so "1,234 items" becomes 1234. Note that "." is stripped too,
# so "12.5" becomes 125.
#
# @param v [Object] raw value; converted with +to_s+
# @return [Integer, nil] parsed number, or +nil+ when the value contains no digit
def self.int_coerce(v)
  s = v.to_s.gsub(/[^\d\-]/, "")
  # Without a digit there is nothing to parse. String#to_i would silently
  # return 0 for hyphen-only leftovers such as "--" (e.g. from
  # "out-of-stock"), reporting a bogus zero instead of "no value".
  s.match?(/\d/) ? s.to_i : nil
end

.json_coerce(v) ⇒ Object



125
126
127
128
129
# File 'lib/scrapetor/extractor.rb', line 125

# Parses a scraped value as JSON.
#
# @param v [Object] raw value; converted with +to_s+
# @return [Object, nil] the parsed JSON document, or +nil+ when the text
#   raises JSON::ParserError
def self.json_coerce(v)
  payload = v.to_s
  begin
    JSON.parse(payload)
  rescue JSON::ParserError
    nil
  end
end

.list_coerce(v, delimiter) ⇒ Object



143
144
145
# File 'lib/scrapetor/extractor.rb', line 143

# Splits a scraped value into a list of trimmed, non-empty strings.
#
# @param v [Object] raw value; converted with +to_s+
# @param delimiter [String, Regexp, nil] separator handed to String#split
# @return [Array<String>] stripped items with blank entries removed
def self.list_coerce(v, delimiter)
  v.to_s.split(delimiter).each_with_object([]) do |piece, items|
    trimmed = piece.strip
    items << trimmed unless trimmed.empty?
  end
end

.run(doc, scope, schema) ⇒ Object



9
10
11
12
13
14
15
16
17
18
19
# File 'lib/scrapetor/extractor.rb', line 9

# Executes a schema against +scope+ and returns the extracted record.
#
# Fields are evaluated first, then groups; a group sharing a name with a
# field overwrites that field's entry.
#
# @param doc [Object] document; its +base_url+ is used when it responds to it
# @param scope [Object] node the schema is evaluated against
# @param schema [#fields, #groups] schema definition
# @return [Hash] field/group name => extracted value
def self.run(doc, scope, schema)
  base_url = doc.base_url if doc.respond_to?(:base_url)

  record = schema.fields.each_with_object({}) do |field, acc|
    acc[field.name] = extract_field(scope, field, base_url)
  end

  schema.groups.each_with_object(record) do |group, acc|
    acc[group.name] = run_group(scope, group, base_url)
  end
end

.run_group(scope, group, base_url) ⇒ Object



21
22
23
24
25
26
27
28
29
30
# File 'lib/scrapetor/extractor.rb', line 21

# Extracts a repeated group: one hash per node matching the group's selector.
#
# Each matched node becomes the scope for the group's own fields and,
# recursively, for its nested groups.
#
# @param scope [#css] node to search within
# @param group [#selector, #fields, #groups] group definition
# @param base_url [String, nil] forwarded to extract_field
# @return [Array<Hash>] one record per matched node, in match order
def self.run_group(scope, group, base_url)
  rows = []
  scope.css(group.selector).each do |element|
    row = group.fields.each_with_object({}) do |field, acc|
      acc[field.name] = extract_field(element, field, base_url)
    end
    group.groups.each do |nested|
      row[nested.name] = run_group(element, nested, base_url)
    end
    rows.push(row)
  end
  rows
end