Module: Coelacanth::Extractor::Utilities

Defined in:
lib/coelacanth/extractor/utilities.rb

Overview

Shared helpers for the extractor pipeline.

Constant Summary collapse

# Characters counted as punctuation by punctuation_density.
# Frozen so the shared list cannot be mutated at runtime.
# NOTE(review): the irregular spacing in the original literal suggests some
# entries may have been lost when this page was rendered — confirm against
# lib/coelacanth/extractor/utilities.rb.
PUNCTUATION =
  %w[. , ! ? ; :].freeze

Class Method Summary collapse

Class Method Details

.absolute_url(base_url, path) ⇒ Object



135
136
137
138
139
140
141
142
# File 'lib/coelacanth/extractor/utilities.rb', line 135

# Resolves +path+ against +base_url+ and returns the result as a String.
#
# @param base_url [String, URI, nil] URL that relative paths are resolved against
# @param path [String, nil] absolute or relative URL
# @return [String, nil] nil when +path+ is nil or empty; +path+ unchanged when
#   it already starts with http:/https:, when no base is available, or when
#   URI cannot join the two; otherwise the joined URL
def absolute_url(base_url, path)
  return if path.nil? || path.empty?
  # \A anchors at the start of the string; ^ would also match after a newline.
  return path if path.match?(/\Ahttps?:/i)
  # URI.join raises ArgumentError (not URI::Error) for a nil base, so fall
  # back to the raw path explicitly, as the rescue below does for bad URIs.
  return path if base_url.to_s.empty?

  URI.join(base_url, path).to_s
rescue URI::Error
  path
end

.ancestors(node) ⇒ Object



43
44
45
46
47
48
49
50
51
# File 'lib/coelacanth/extractor/utilities.rb', line 43

# Returns the ancestors of +node+ as an Array.
#
# Uses the node's own +ancestors+ method when it has one (coerced with
# Array()); otherwise the chain is built by collect_ancestors.
#
# @param node [Object, nil]
# @return [Array] empty when +node+ is nil or false
def ancestors(node)
  return [] unless node
  return collect_ancestors(node) unless node.respond_to?(:ancestors)

  Array(node.ancestors)
end

.class_id_tokens(node) ⇒ Object



107
108
109
110
111
112
# File 'lib/coelacanth/extractor/utilities.rb', line 107

# Collects the tokens of the node's :class attribute followed by those of
# its :id attribute, each split with split_tokens.
#
# @param node [#[]] object exposing attributes through +[]+
# @return [Array] tokens; empty when neither attribute is set
def class_id_tokens(node)
  %i[class id].flat_map do |attribute|
    raw = node[attribute]
    raw ? split_tokens(raw) : []
  end
end

.collect_ancestors(node) ⇒ Object



53
54
55
56
57
58
59
60
61
62
# File 'lib/coelacanth/extractor/utilities.rb', line 53

# Walks +parent+ links upward from +node+ and returns every object visited,
# nearest parent first.
#
# The walk ends when the current object has no +parent+ method or when
# +parent+ returns nil/false.
#
# @param node [Object, nil]
# @return [Array]
def collect_ancestors(node)
  chain = []
  cursor = node

  while cursor.respond_to?(:parent)
    cursor = cursor.parent
    break unless cursor

    chain << cursor
  end

  chain
end

.depth(node) ⇒ Object



39
40
41
# File 'lib/coelacanth/extractor/utilities.rb', line 39

# Depth of +node+, measured as the number of entries returned by ancestors.
#
# @param node [Object, nil]
# @return [Integer]
def depth(node)
  ancestors(node).size
end

.element?(node) ⇒ Boolean

Returns:

  • (Boolean)


64
65
66
67
68
69
70
71
72
73
74
75
76
# File 'lib/coelacanth/extractor/utilities.rb', line 64

# Tells whether +node+ is an element node.
#
# Checks, in order: the node's own +element?+ method, membership in
# ::Oga::XML::Element (only when that constant is defined), and finally
# whether the node's +type+ equals :element.
#
# @param node [Object, nil]
# @return [Boolean] false when +node+ is nil/false or none of the checks apply
def element?(node)
  return false unless node
  return node.element? if node.respond_to?(:element?)
  return true if defined?(::Oga::XML::Element) && node.is_a?(::Oga::XML::Element)
  return node.type == :element if node.respond_to?(:type)

  false
end

.element_children(node) ⇒ Object



78
79
80
81
82
# File 'lib/coelacanth/extractor/utilities.rb', line 78

# Returns the children of +node+ for which element? is true.
#
# @param node [Object, nil]
# @return [Array] empty when +node+ has no +children+ method
def element_children(node)
  if node.respond_to?(:children)
    node.children.select { |candidate| element?(candidate) }
  else
    []
  end
end


32
33
34
35
36
37
# File 'lib/coelacanth/extractor/utilities.rb', line 32

# Ratio of link text length (link_text_length) to total text length
# (text_length) for +node+.
#
# @param node [Object, nil]
# @return [Float] 0.0 when the node's text length is zero
def link_density(node)
  total = text_length(node)
  total.zero? ? 0.0 : link_text_length(node).to_f / total
end


18
19
20
21
22
# File 'lib/coelacanth/extractor/utilities.rb', line 18

# Sums the stripped text length of every node matched by +node.css("a")+.
#
# @param node [#css, nil]
# @return [Integer] 0 when +node+ is nil/false or nothing matches
def link_text_length(node)
  return 0 unless node

  node.css("a").inject(0) { |total, anchor| total + anchor.text.strip.length }
end

.meta_content(doc, *selectors) ⇒ Object



118
119
120
121
122
123
124
125
# File 'lib/coelacanth/extractor/utilities.rb', line 118

def meta_content(doc, *selectors)
  selectors.each do |selector|
    if (node = doc.at_css(selector))
      return node["content"].to_s.strip unless node["content"].to_s.strip.empty?
    end
  end
  nil
end

.next_element(node) ⇒ Object



99
100
101
102
103
104
105
# File 'lib/coelacanth/extractor/utilities.rb', line 99

# Returns the entry that follows +node+ in sibling_elements(node).
#
# @param node [Object]
# @return [Object, nil] nil when +node+ is not in the list or is last
def next_element(node)
  peers = sibling_elements(node)
  position = peers.index(node)
  position && peers[position + 1]
end

.parse_time(value) ⇒ Object



127
128
129
130
131
132
133
# File 'lib/coelacanth/extractor/utilities.rb', line 127

# Parses +value+ with Time.parse.
#
# @param value [String, nil]
# @return [Time, nil] nil when +value+ is nil or empty, or when Time.parse
#   raises ArgumentError
def parse_time(value)
  blank = value.nil? || value.empty?
  blank ? nil : Time.parse(value)
rescue ArgumentError
  nil
end

.previous_element(node) ⇒ Object



91
92
93
94
95
96
97
# File 'lib/coelacanth/extractor/utilities.rb', line 91

# Returns the entry that precedes +node+ in sibling_elements(node).
#
# @param node [Object]
# @return [Object, nil] nil when +node+ is not in the list or is first
def previous_element(node)
  peers = sibling_elements(node)
  position = peers.index(node)
  # Index 0 has no predecessor; peers[-1] would wrap around to the last entry.
  return if position.nil? || position.zero?

  peers[position - 1]
end

.punctuation_density(node) ⇒ Object



24
25
26
27
28
29
30
# File 'lib/coelacanth/extractor/utilities.rb', line 24

# Ratio of characters in +node.text+ that appear in PUNCTUATION to the
# node's text length as computed by text_length.
#
# @param node [Object, nil]
# @return [Float] 0.0 when the node's text length is zero
def punctuation_density(node)
  total = text_length(node)
  return 0.0 if total.zero?

  marks = node.text.each_char.count { |character| PUNCTUATION.include?(character) }
  marks.fdiv(total)
end

.sibling_elements(node) ⇒ Object



84
85
86
87
88
89
# File 'lib/coelacanth/extractor/utilities.rb', line 84

# Returns the element children of the parent of +node+ (via
# element_children), which includes +node+ itself when it is an element.
#
# @param node [Object, nil]
# @return [Array] empty when +node+ has no +parent+ method or no parent
def sibling_elements(node)
  container = node.parent if node.respond_to?(:parent)
  container ? element_children(container) : []
end

.split_tokens(value) ⇒ Object



114
115
116
# File 'lib/coelacanth/extractor/utilities.rb', line 114

# Splits an attribute value into lower-cased tokens on runs of whitespace,
# underscores and hyphens.
#
# @param value [#to_s] raw attribute value; nil yields an empty list
# @return [Array<String>] tokens, never containing empty strings
def split_tokens(value)
  # A regex split keeps a leading "" when the value starts with a separator
  # (e.g. " foo" or "-foo"), so drop empty tokens explicitly.
  value.to_s.downcase.split(/[\s_-]+/).reject(&:empty?)
end

.text_length(node) ⇒ Object



14
15
16
# File 'lib/coelacanth/extractor/utilities.rb', line 14

# Length of the node's text after stripping surrounding whitespace.
#
# @param node [#text, nil]
# @return [Integer] 0 when +node+ or its text is nil
def text_length(node)
  content = node&.text
  content.nil? ? 0 : content.strip.length
end