Module: Crawlscope::DocumentText

Defined in:
lib/crawlscope/document_text.rb

Constant Summary

# CSS selector list of elements that root_for strips from the copied document
# before any text or HTML is extracted. Frozen so the shared default cannot be
# mutated in place by a caller.
REMOVED_SELECTORS = "script, style, noscript, template, svg".freeze

# REMOVED_SELECTORS plus `form`; used by content_ratio_html_for. Explicitly
# frozen because interpolated strings are not covered by the
# `frozen_string_literal` magic comment on Ruby 3.0+.
CONTENT_RATIO_REMOVED_SELECTORS = "#{REMOVED_SELECTORS}, form".freeze

# Matches one run of alphanumeric characters; tokens scans text with it.
TOKEN_PATTERN = /[[:alnum:]]+/

Class Method Summary

Class Method Details

.body_text(doc) ⇒ Object



11
12
13
# File 'lib/crawlscope/document_text.rb', line 11

# Normalized text of the document without narrowing to a selector.
#
# Delegates to text_for with a nil selector.
#
# @param doc [Object, nil] parsed document, forwarded unchanged to text_for
# @return [Object] whatever text_for returns for a nil selector
def body_text(doc)
  no_selector = nil
  text_for(doc, selector: no_selector)
end

.content_ratio_html_for(doc, selector: "main") ⇒ Object



19
20
21
# File 'lib/crawlscope/document_text.rb', line 19

# HTML of the selected root, resolved with CONTENT_RATIO_REMOVED_SELECTORS
# as the list of elements to strip.
#
# @param doc [Object, nil] parsed document handed to root_for
# @param selector [String, nil] CSS selector for the preferred root
# @return [String] `to_html` of the root as a String; "" when root_for
#   returns nil
def content_ratio_html_for(doc, selector: "main")
  root = root_for(doc, selector: selector, removed_selectors: CONTENT_RATIO_REMOVED_SELECTORS)
  return "" if root.nil?

  root.to_html.to_s
end

.html_for(doc, selector: "main") ⇒ Object



15
16
17
# File 'lib/crawlscope/document_text.rb', line 15

# HTML of the selected root, resolved through root_for with its default
# removal list.
#
# @param doc [Object, nil] parsed document handed to root_for
# @param selector [String, nil] CSS selector for the preferred root
# @return [String] `to_html` of the root as a String; "" when root_for
#   returns nil
def html_for(doc, selector: "main")
  root = root_for(doc, selector: selector)
  return "" if root.nil?

  root.to_html.to_s
end

.normalize(text) ⇒ Object



31
32
33
# File 'lib/crawlscope/document_text.rb', line 31

# Collapses every run of whitespace into a single space and trims both ends.
#
# @param text [#to_s, nil] value to clean up; nil becomes ""
# @return [String] the whitespace-normalized string
def normalize(text)
  collapsed = text.to_s.gsub(/\s+/, " ")
  collapsed.strip
end

.root_for(doc, selector:, removed_selectors: REMOVED_SELECTORS) ⇒ Object



35
36
37
38
39
40
41
42
43
# File 'lib/crawlscope/document_text.rb', line 35

# Picks the node that text/HTML extraction should start from.
#
# Works on `doc.dup`, removes everything matching +removed_selectors+ from
# that copy, then returns the first match for +selector+. When the selector
# is blank or matches nothing, falls back to the "body" element, and finally
# to the copy itself.
#
# @param doc [Object, nil] parsed document; presumably a Nokogiri
#   document/node (must respond to dup, css and at_css) - confirm with callers
# @param selector [#to_s, nil] CSS selector for the preferred root
# @param removed_selectors [String] CSS selector list of elements to strip
# @return [Object, nil] the chosen node, or nil when +doc+ is nil/false
def root_for(doc, selector:, removed_selectors: REMOVED_SELECTORS)
  return nil unless doc

  pruned = doc.dup
  pruned.css(removed_selectors).remove

  # Only query the selector when one was actually given.
  unless selector.to_s.empty?
    scoped = pruned.at_css(selector)
    return scoped if scoped
  end

  pruned.at_css("body") || pruned
end

.text_for(doc, selector: "main") ⇒ Object



23
24
25
# File 'lib/crawlscope/document_text.rb', line 23

# Whitespace-normalized text of the root chosen by root_for.
#
# @param doc [Object, nil] parsed document handed to root_for
# @param selector [String, nil] CSS selector for the preferred root
# @return [String] normalized text; "" when root_for returns nil
def text_for(doc, selector: "main")
  root = root_for(doc, selector: selector)
  raw_text = root&.text
  normalize(raw_text)
end

.tokens(text) ⇒ Object



27
28
29
# File 'lib/crawlscope/document_text.rb', line 27

# Splits text into lowercase alphanumeric tokens, dropping one-character ones.
#
# The text is normalized and downcased, then scanned with TOKEN_PATTERN.
#
# @param text [#to_s, nil] text to tokenize
# @return [Array<String>] tokens of length two or more, in order of appearance
def tokens(text)
  words = normalize(text).downcase.scan(TOKEN_PATTERN)
  words.select { |word| word.length >= 2 }
end