Module: Crawlscope::DocumentText
- Defined in:
- lib/crawlscope/document_text.rb
Constant Summary collapse
- REMOVED_SELECTORS =
"script, style, noscript, template, svg"
- CONTENT_RATIO_REMOVED_SELECTORS =
"#{REMOVED_SELECTORS}, form"
- TOKEN_PATTERN =
/[[:alnum:]]+/
Class Method Summary collapse
- .body_text(doc) ⇒ Object
- .content_ratio_html_for(doc, selector: "main") ⇒ Object
- .html_for(doc, selector: "main") ⇒ Object
- .normalize(text) ⇒ Object
- .root_for(doc, selector:, removed_selectors: REMOVED_SELECTORS) ⇒ Object
- .text_for(doc, selector: "main") ⇒ Object
- .tokens(text) ⇒ Object
Class Method Details
.body_text(doc) ⇒ Object
11 12 13 |
# File 'lib/crawlscope/document_text.rb', line 11
# Returns the normalized text of the document without scoping it to a
# content selector.
#
# This is a thin wrapper: it calls +text_for+ with a nil selector, so no
# selector lookup is attempted and the text comes from the document's
# fallback root.
#
# @param doc [Object, nil] parsed document, passed through to +text_for+
# @return [String] the result of +text_for(doc, selector: nil)+
def body_text(doc)
  text_for(doc, selector: nil)
end
.content_ratio_html_for(doc, selector: "main") ⇒ Object
19 20 21 |
# File 'lib/crawlscope/document_text.rb', line 19
# Serializes the selected root of the document to HTML after removing the
# elements listed in CONTENT_RATIO_REMOVED_SELECTORS.
#
# @param doc [Object, nil] parsed document, passed through to +root_for+
# @param selector [String, nil] CSS selector for the preferred root
#   (defaults to "main")
# @return [String] HTML of the resolved root, or "" when +root_for+
#   returns nil (which it does for a nil/false +doc+)
def content_ratio_html_for(doc, selector: "main")
  root = root_for(
    doc,
    selector: selector,
    removed_selectors: CONTENT_RATIO_REMOVED_SELECTORS
  )

  # A missing root yields an empty string rather than nil.
  root.nil? ? "" : root.to_html.to_s
end
.html_for(doc, selector: "main") ⇒ Object
15 16 17 |
# File 'lib/crawlscope/document_text.rb', line 15
# Serializes the selected root of the document to HTML, using the default
# set of removed selectors of +root_for+.
#
# @param doc [Object, nil] parsed document, passed through to +root_for+
# @param selector [String, nil] CSS selector for the preferred root
#   (defaults to "main")
# @return [String] HTML of the resolved root, or "" when +root_for+
#   returns nil (which it does for a nil/false +doc+)
def html_for(doc, selector: "main")
  root = root_for(doc, selector: selector)

  # A missing root yields an empty string rather than nil.
  root.nil? ? "" : root.to_html.to_s
end
.normalize(text) ⇒ Object
31 32 33 |
# File 'lib/crawlscope/document_text.rb', line 31
# Collapses every run of whitespace into a single ASCII space and trims
# the ends.
#
# The POSIX bracket class +[[:space:]]+ is used instead of +\s+ because
# +\s+ only matches ASCII whitespace in Ruby. Text extracted from HTML
# commonly contains U+00A0 (the decoded form of +&nbsp;+), which +\s+ and
# +String#strip+ would both leave untouched.
#
# @param text [#to_s, nil] value to normalize; nil becomes ""
# @return [String] a new string with whitespace runs collapsed and no
#   leading or trailing whitespace
def normalize(text)
  # After the substitution all whitespace is a plain space, so +strip+
  # reliably trims both ends.
  text.to_s.gsub(/[[:space:]]+/, " ").strip
end
.root_for(doc, selector:, removed_selectors: REMOVED_SELECTORS) ⇒ Object
35 36 37 38 39 40 41 42 43 |
# File 'lib/crawlscope/document_text.rb', line 35
# Resolves the node that the text/HTML helpers should operate on.
#
# Works on a duplicate of +doc+ so that removing elements does not touch
# the caller's object. Elements matching +removed_selectors+ are removed
# from the duplicate first; the root is then looked up in this order:
# the first match for +selector+ (skipped when the selector is nil or
# empty), the first "body" match, and finally the duplicate itself.
#
# @param doc [Object, nil] parsed document responding to +dup+, +css+ and
#   +at_css+ (presumably a Nokogiri document — confirm against callers)
# @param selector [String, nil] CSS selector for the preferred root
# @param removed_selectors [String] CSS selector list of elements to
#   remove before resolving the root
# @return [Object, nil] the resolved root, or nil when +doc+ is nil/false
def root_for(doc, selector:, removed_selectors: REMOVED_SELECTORS)
  return unless doc

  working = doc.dup
  working.css(removed_selectors).remove

  # Only consult the selector when one was actually given.
  unless selector.to_s.empty?
    scoped = working.at_css(selector)
    return scoped if scoped
  end

  working.at_css("body") || working
end
.text_for(doc, selector: "main") ⇒ Object
23 24 25 |
# File 'lib/crawlscope/document_text.rb', line 23
# Returns the whitespace-normalized text of the selected root of the
# document.
#
# @param doc [Object, nil] parsed document, passed through to +root_for+
# @param selector [String, nil] CSS selector for the preferred root
#   (defaults to "main")
# @return [String] the result of +normalize+ applied to the root's text;
#   when +root_for+ returns nil, nil is passed to +normalize+
def text_for(doc, selector: "main")
  root = root_for(doc, selector: selector)
  normalize(root&.text)
end
.tokens(text) ⇒ Object
27 28 29 |
# File 'lib/crawlscope/document_text.rb', line 27
# Splits text into lowercase tokens.
#
# The text is normalized and downcased, scanned with TOKEN_PATTERN, and
# tokens shorter than two characters are dropped.
#
# @param text [#to_s, nil] text to tokenize, passed through +normalize+
# @return [Array<String>] matched tokens of length two or more, in order
#   of appearance
def tokens(text)
  lowered = normalize(text).downcase
  words = lowered.scan(TOKEN_PATTERN)

  # Keep only tokens with at least two characters.
  words.select { |word| word.length >= 2 }
end