Module: Crawlscope::DocumentText
- Defined in:
- lib/crawlscope/document_text.rb
Constant Summary collapse
- REMOVED_SELECTORS =
"script, style, noscript, template, svg"
- TOKEN_PATTERN =
/[[:alnum:]]+/
Class Method Summary collapse
- .body_text(doc) ⇒ Object
- .html_for(doc, selector: "main") ⇒ Object
- .normalize(text) ⇒ Object
- .root_for(doc, selector:) ⇒ Object
- .text_for(doc, selector: "main") ⇒ Object
- .tokens(text) ⇒ Object
Class Method Details
.body_text(doc) ⇒ Object
10 11 12 |
# File 'lib/crawlscope/document_text.rb', line 10
#
# Returns the normalized text of the document without preferring any
# particular container: it delegates to +text_for+ with a nil selector
# instead of that method's default of "main".
#
# @param doc the parsed document (presumably a Nokogiri document — confirm)
# @return whatever +text_for+ returns for a nil selector
def body_text(doc)
  text_for(doc, selector: nil)
end
.html_for(doc, selector: "main") ⇒ Object
14 15 16 |
# File 'lib/crawlscope/document_text.rb', line 14
#
# Serializes the content root chosen by +root_for+ to an HTML string.
#
# @param doc the parsed document (presumably a Nokogiri document — confirm)
# @param selector [String, nil] CSS selector for the preferred root
# @return [String] the root's +to_html+ output coerced with +to_s+; the
#   empty string when +root_for+ yields nil
def html_for(doc, selector: "main")
  root = root_for(doc, selector: selector)
  # Safe navigation only guards to_html; to_s then turns a nil into "".
  markup = root&.to_html
  markup.to_s
end
.normalize(text) ⇒ Object
26 27 28 |
# File 'lib/crawlscope/document_text.rb', line 26
#
# Collapses every run of whitespace into a single space and trims both ends.
#
# Uses the POSIX bracket class [[:space:]] rather than \s: in Ruby, \s only
# matches ASCII whitespace, so non-breaking spaces (U+00A0) and other Unicode
# whitespace would otherwise survive both the collapse and the strip.
#
# @param text [#to_s, nil] the raw text; nil is treated as an empty string
# @return [String] the text with whitespace runs collapsed and ends trimmed
def normalize(text)
  text.to_s.gsub(/[[:space:]]+/, " ").strip
end
.root_for(doc, selector:) ⇒ Object
30 31 32 33 34 35 36 37 38 |
# File 'lib/crawlscope/document_text.rb', line 30
#
# Picks the node that text/HTML extraction should start from.
#
# Works on +doc.dup+, so the REMOVED_SELECTORS nodes are removed from the copy
# rather than from the object passed in (assumes +dup+ copies the node tree,
# as Nokogiri's does — confirm).
#
# Lookup order: the first match for +selector+ (skipped when the selector is
# nil or empty), then the first "body" element, then the copy itself.
#
# @param doc the parsed document, or nil (presumably a Nokogiri document —
#   confirm; it must respond to +dup+, +css+ and +at_css+)
# @param selector [String, nil] CSS selector for the preferred root
# @return the chosen node, or nil when +doc+ is nil/false
def root_for(doc, selector:)
  return unless doc

  pruned = doc.dup
  pruned.css(REMOVED_SELECTORS).remove

  unless selector.to_s.empty?
    preferred = pruned.at_css(selector)
    return preferred if preferred
  end

  pruned.at_css("body") || pruned
end
.text_for(doc, selector: "main") ⇒ Object
18 19 20 |
# File 'lib/crawlscope/document_text.rb', line 18
#
# Extracts the text of the content root chosen by +root_for+ and passes it
# through +normalize+.
#
# @param doc the parsed document (presumably a Nokogiri document — confirm)
# @param selector [String, nil] CSS selector for the preferred root
# @return the result of +normalize+; when +root_for+ yields nil, +normalize+
#   receives nil
def text_for(doc, selector: "main")
  root = root_for(doc, selector: selector)
  raw_text = root&.text
  normalize(raw_text)
end
.tokens(text) ⇒ Object
22 23 24 |
# File 'lib/crawlscope/document_text.rb', line 22
#
# Splits text into lowercase tokens: the text is normalized, downcased and
# scanned with TOKEN_PATTERN, and matches shorter than two characters are
# dropped.
#
# @param text the raw text, passed straight to +normalize+
# @return [Array<String>] the tokens of length two or more, in order of
#   appearance
def tokens(text)
  words = normalize(text).downcase.scan(TOKEN_PATTERN)
  words.select { |word| word.length >= 2 }
end