Module: Crawlscope::DocumentText

Defined in:
lib/crawlscope/document_text.rb

Constant Summary collapse

# CSS selector list for the elements that root_for removes from its copy of
# the document before a root node is chosen.
REMOVED_SELECTORS =
"script, style, noscript, template, svg"
# Matches one run of alphanumeric characters; tokens scans normalized,
# downcased text with it to pull out candidate words.
TOKEN_PATTERN =
/[[:alnum:]]+/

Class Method Summary collapse

Class Method Details

.body_text(doc) ⇒ Object



10
11
12
# File 'lib/crawlscope/document_text.rb', line 10

# Returns the normalized text of the document without narrowing to a
# specific selector.
#
# @param doc [Object, nil] parsed document handed straight to text_for
# @return [Object] whatever text_for returns for an unscoped lookup
def body_text(doc)
  # A nil selector tells text_for not to look for a specific container.
  unscoped = nil
  text_for(doc, selector: unscoped)
end

.html_for(doc, selector: "main") ⇒ Object



14
15
16
# File 'lib/crawlscope/document_text.rb', line 14

# Serializes the node chosen by root_for to an HTML string.
#
# @param doc [Object, nil] parsed document passed on to root_for
# @param selector [String, nil] CSS selector passed on to root_for
# @return [String] the node's HTML, or an empty string when root_for
#   yields nil
def html_for(doc, selector: "main")
  node = root_for(doc, selector: selector)
  markup = node&.to_html
  # to_s turns a missing node (nil) into an empty string.
  markup.to_s
end

.normalize(text) ⇒ Object



26
27
28
# File 'lib/crawlscope/document_text.rb', line 26

# Collapses every run of whitespace in the given text into a single space
# and trims the ends.
#
# @param text [#to_s] value to clean up; nil becomes an empty string
# @return [String] the whitespace-normalized text
def normalize(text)
  # [[:space:]] is Unicode-aware, so non-breaking spaces (U+00A0, which is
  # what &nbsp; decodes to) are collapsed and trimmed as well. \s only
  # matches ASCII whitespace and would leave them embedded in the result.
  text.to_s.gsub(/[[:space:]]+/, " ").strip
end

.root_for(doc, selector:) ⇒ Object



30
31
32
33
34
35
36
37
38
# File 'lib/crawlscope/document_text.rb', line 30

# Picks the node that text and HTML extraction should start from.
#
# Works on a duplicate of the document so the caller's object is not the one
# that has elements removed from it.
#
# @param doc [Object, nil] parsed document responding to dup, css and at_css
# @param selector [String, nil] CSS selector for the preferred root; nil or
#   an empty string skips the lookup
# @return [Object, nil] the first match for the selector, else the "body"
#   match, else the pruned copy itself; nil when doc is nil or false
def root_for(doc, selector:)
  return nil unless doc

  pruned = doc.dup
  # Drop everything matching REMOVED_SELECTORS before choosing a root.
  pruned.css(REMOVED_SELECTORS).remove

  # Stays nil when the selector is blank, which triggers the fallbacks below.
  scoped = pruned.at_css(selector) unless selector.to_s.empty?
  scoped || pruned.at_css("body") || pruned
end

.text_for(doc, selector: "main") ⇒ Object



18
19
20
# File 'lib/crawlscope/document_text.rb', line 18

# Extracts the text of the node chosen by root_for and runs it through
# normalize.
#
# @param doc [Object, nil] parsed document passed on to root_for
# @param selector [String, nil] CSS selector passed on to root_for
# @return [Object] the result of normalize; normalize receives nil when
#   root_for finds no node
def text_for(doc, selector: "main")
  node = root_for(doc, selector: selector)
  raw_text = node&.text
  normalize(raw_text)
end

.tokens(text) ⇒ Object



22
23
24
# File 'lib/crawlscope/document_text.rb', line 22

# Splits text into lowercase alphanumeric tokens, discarding single-character
# ones.
#
# @param text [Object] value handed to normalize before tokenizing
# @return [Array<String>] tokens matched by TOKEN_PATTERN that are at least
#   two characters long, in order of appearance
def tokens(text)
  lowered = normalize(text).downcase
  words = lowered.scan(TOKEN_PATTERN)
  words.select { |word| word.length >= 2 }
end