Module: Coelacanth::Extractor::Utilities
- Defined in:
- lib/coelacanth/extractor/utilities.rb
Overview
Shared helpers for the extractor pipeline.
Constant Summary collapse
- PUNCTUATION =
%w[。 、 . ・ . , ! ? : ; ; :]
Class Method Summary collapse
- .absolute_url(base_url, path) ⇒ Object
- .ancestors(node) ⇒ Object
- .class_id_tokens(node) ⇒ Object
- .collect_ancestors(node) ⇒ Object
- .depth(node) ⇒ Object
- .element?(node) ⇒ Boolean
- .element_children(node) ⇒ Object
- .link_density(node) ⇒ Object
- .link_text_length(node) ⇒ Object
- .meta_content(doc, *selectors) ⇒ Object
- .next_element(node) ⇒ Object
- .parse_time(value) ⇒ Object
- .previous_element(node) ⇒ Object
- .punctuation_density(node) ⇒ Object
- .sibling_elements(node) ⇒ Object
- .split_tokens(value) ⇒ Object
- .text_length(node) ⇒ Object
Class Method Details
.absolute_url(base_url, path) ⇒ Object
135 136 137 138 139 140 141 142 |
# File 'lib/coelacanth/extractor/utilities.rb', line 135

# Resolves +path+ against +base_url+ and returns the result as a String.
#
# @param base_url [String, nil] URL that relative paths are joined onto
# @param path [String, nil] absolute or relative reference
# @return [String, nil] nil when +path+ is nil or empty; +path+ unchanged when
#   it already starts with an http/https scheme, when +base_url+ is nil, or
#   when URI cannot join the two; otherwise the joined URL
def absolute_url(base_url, path)
  return if path.nil? || path.empty?
  # \A anchors at the start of the string; ^ would also match after a newline.
  return path if path.match?(/\Ahttps?:/i)
  # URI.join raises ArgumentError (not URI::Error) for a nil base, so fall
  # back to the raw path here just like the rescue below does.
  return path if base_url.nil?

  URI.join(base_url, path).to_s
rescue URI::Error
  path
end
.ancestors(node) ⇒ Object
43 44 45 46 47 48 49 50 51 |
# File 'lib/coelacanth/extractor/utilities.rb', line 43

# Returns the ancestors of +node+ as an Array.
#
# Uses the node's own +ancestors+ method when it responds to one; otherwise
# falls back to collect_ancestors.
#
# @param node [Object, nil]
# @return [Array] empty when +node+ is nil or false
def ancestors(node)
  return [] unless node
  return Array(node.ancestors) if node.respond_to?(:ancestors)

  collect_ancestors(node)
end
.class_id_tokens(node) ⇒ Object
107 108 109 110 111 112 |
# File 'lib/coelacanth/extractor/utilities.rb', line 107

# Collects the tokens of the node's +class+ attribute followed by those of its
# +id+ attribute, each split with split_tokens. Attributes whose value is nil
# or false contribute nothing.
#
# @param node [#[]] object indexable by :class and :id
# @return [Array] class tokens first, then id tokens
def class_id_tokens(node)
  %i[class id].flat_map do |attribute|
    value = node[attribute]
    value ? split_tokens(value) : []
  end
end
.collect_ancestors(node) ⇒ Object
53 54 55 56 57 58 59 60 61 62 |
# File 'lib/coelacanth/extractor/utilities.rb', line 53

# Walks the +parent+ chain upward from +node+ and gathers every parent found,
# nearest first. The walk ends at the first object that does not respond to
# +parent+ or whose parent is nil/false.
#
# @param node [Object, nil]
# @return [Array] the parents in nearest-first order; +node+ itself is excluded
def collect_ancestors(node)
  chain = []
  cursor = node

  loop do
    break unless cursor.respond_to?(:parent)

    cursor = cursor.parent
    break unless cursor

    chain << cursor
  end

  chain
end
.depth(node) ⇒ Object
39 40 41 |
# File 'lib/coelacanth/extractor/utilities.rb', line 39

# Number of ancestors +node+ has, as reported by ancestors(node).
#
# @param node [Object, nil]
# @return [Integer]
def depth(node)
  ancestors(node).size
end
.element?(node) ⇒ Boolean
64 65 66 67 68 69 70 71 72 73 74 75 76 |
# File 'lib/coelacanth/extractor/utilities.rb', line 64

# Tells whether +node+ is an element node.
#
# Checks, in order: the node's own +element?+ method, membership in
# Oga::XML::Element (only when that constant is defined), and finally whether
# its +type+ equals :element.
#
# @param node [Object, nil]
# @return [Boolean] false for nil/false nodes and for objects that match none
#   of the checks; when the node defines +element?+ its result is returned as is
def element?(node)
  return false unless node
  return node.element? if node.respond_to?(:element?)
  return true if defined?(::Oga::XML::Element) && node.is_a?(::Oga::XML::Element)
  return node.type == :element if node.respond_to?(:type)

  false
end
.element_children(node) ⇒ Object
78 79 80 81 82 |
# File 'lib/coelacanth/extractor/utilities.rb', line 78

# Selects the children of +node+ for which element? is true.
#
# @param node [Object, nil]
# @return [Array] empty when +node+ does not respond to +children+
def element_children(node)
  if node.respond_to?(:children)
    node.children.select { |candidate| element?(candidate) }
  else
    []
  end
end
.link_density(node) ⇒ Object
32 33 34 35 36 37 |
# File 'lib/coelacanth/extractor/utilities.rb', line 32

# Ratio of link_text_length(node) to text_length(node).
#
# @param node [Object, nil]
# @return [Float] 0.0 when text_length(node) is zero
def link_density(node)
  total = text_length(node)
  total.zero? ? 0.0 : link_text_length(node).to_f / total
end
.link_text_length(node) ⇒ Object
18 19 20 21 22 |
# File 'lib/coelacanth/extractor/utilities.rb', line 18

# Sums the stripped text length of every node returned by node.css("a").
#
# @param node [#css, nil]
# @return [Integer] 0 when +node+ is nil or false
def link_text_length(node)
  return 0 unless node

  anchors = node.css("a")
  anchors.sum { |link| link.text.strip.length }
end
.meta_content(doc, *selectors) ⇒ Object
118 119 120 121 122 123 124 125 |
# File 'lib/coelacanth/extractor/utilities.rb', line 118

# Returns the first non-blank +content+ attribute found among the given
# selectors, trying them in order.
#
# @param doc [#at_css] document queried with each selector
# @param selectors [Array<String>] CSS selectors to try in order
# @return [String, nil] the stripped content value, or nil when no selector
#   matches a node with non-blank content
def meta_content(doc, *selectors)
  selectors.each do |selector|
    node = doc.at_css(selector)
    next unless node

    content = node["content"].to_s.strip
    return content unless content.empty?
  end

  nil
end
.next_element(node) ⇒ Object
99 100 101 102 103 104 105 |
# File 'lib/coelacanth/extractor/utilities.rb', line 99

# Returns the entry that follows +node+ in sibling_elements(node).
#
# @param node [Object]
# @return [Object, nil] nil when +node+ is not in that list or is its last entry
def next_element(node)
  peers = sibling_elements(node)
  position = peers.index(node)
  position && peers[position + 1]
end
.parse_time(value) ⇒ Object
127 128 129 130 131 132 133 |
# File 'lib/coelacanth/extractor/utilities.rb', line 127

# Parses +value+ with Time.parse.
#
# @param value [String, nil]
# @return [Time, nil] nil when +value+ is nil or empty, or when Time.parse
#   raises ArgumentError
def parse_time(value)
  blank = value.nil? || value.empty?
  blank ? nil : Time.parse(value)
rescue ArgumentError
  nil
end
.previous_element(node) ⇒ Object
91 92 93 94 95 96 97 |
# File 'lib/coelacanth/extractor/utilities.rb', line 91

# Returns the entry that precedes +node+ in sibling_elements(node).
#
# @param node [Object]
# @return [Object, nil] nil when +node+ is not in that list or is its first entry
def previous_element(node)
  peers = sibling_elements(node)
  position = peers.index(node)
  peers[position - 1] if position&.positive?
end
.punctuation_density(node) ⇒ Object
24 25 26 27 28 29 30 |
# File 'lib/coelacanth/extractor/utilities.rb', line 24

# Ratio of characters in node.text that appear in PUNCTUATION to
# text_length(node).
#
# @param node [#text, nil]
# @return [Float] 0.0 when text_length(node) is zero
def punctuation_density(node)
  total = text_length(node)
  return 0.0 if total.zero?

  hits = node.text.each_char.count { |symbol| PUNCTUATION.include?(symbol) }
  hits.to_f / total
end
.sibling_elements(node) ⇒ Object
84 85 86 87 88 89 |
# File 'lib/coelacanth/extractor/utilities.rb', line 84

# Returns element_children of the node's parent, so the list includes +node+
# itself when element? holds for it.
#
# @param node [Object, nil]
# @return [Array] empty when +node+ does not respond to +parent+ or has no parent
def sibling_elements(node)
  return [] unless node.respond_to?(:parent)

  container = node.parent
  container ? element_children(container) : []
end
.split_tokens(value) ⇒ Object
114 115 116 |
# File 'lib/coelacanth/extractor/utilities.rb', line 114

# Splits +value+ into lowercase tokens on runs of whitespace, underscores and
# hyphens.
#
# @param value [#to_s] typically a class or id attribute value
# @return [Array<String>] lowercase, non-empty tokens
def split_tokens(value)
  # A leading separator (" main", "_x") makes String#split emit an empty
  # first field; drop it so callers never see a "" token.
  value.to_s.split(/[\s_-]+/).reject(&:empty?).map(&:downcase)
end
.text_length(node) ⇒ Object
14 15 16 |
# File 'lib/coelacanth/extractor/utilities.rb', line 14

# Length of the node's text after stripping surrounding whitespace.
#
# @param node [#text, nil]
# @return [Integer] 0 when +node+ or its text is nil
def text_length(node)
  content = node&.text
  content ? content.strip.length : 0
end