Module: Crawlscope::DocumentText

Defined in: lib/crawlscope/document_text.rb

Constant Summary

# CSS selector list of the elements that root_for strips from its working
# copy by default. Frozen so the shared value cannot be mutated in place.
REMOVED_SELECTORS = "script, style, noscript, template, svg".freeze

# Selector list passed by content_ratio_html_for: everything in
# REMOVED_SELECTORS plus <form> elements. An interpolated string is never
# frozen by the frozen_string_literal magic comment, so freeze it explicitly.
CONTENT_RATIO_REMOVED_SELECTORS = "#{REMOVED_SELECTORS}, form".freeze

# Matches one run of alphanumeric characters; tokens scans text with it.
TOKEN_PATTERN = /[[:alnum:]]+/

Class Method Summary

.body_text(doc) ⇒ Object
.content_ratio_html_for(doc, selector: "main") ⇒ Object
.html_for(doc, selector: "main") ⇒ Object
.normalize(text) ⇒ Object
.root_for(doc, selector:, removed_selectors: REMOVED_SELECTORS) ⇒ Object
.text_for(doc, selector: "main") ⇒ Object
.tokens(text) ⇒ Object

Class Method Details

.body_text(doc) ⇒ `Object`



11
12
13

# File 'lib/crawlscope/document_text.rb', line 11

# Returns the text of +doc+ without narrowing to a content region.
#
# Delegates to text_for with a nil selector instead of its "main" default.
#
# @param doc [Object, nil] parsed document, forwarded unchanged to text_for
# @return [Object] whatever text_for returns for +doc+
def body_text(doc)
  no_selector = nil
  text_for(doc, selector: no_selector)
end

.content_ratio_html_for(doc, selector: "main") ⇒ `Object`



19
20
21

# File 'lib/crawlscope/document_text.rb', line 19

# Serializes the content root of +doc+ to HTML after removing the elements
# listed in CONTENT_RATIO_REMOVED_SELECTORS.
#
# @param doc [Object, nil] parsed document, forwarded to root_for
# @param selector [String, nil] CSS selector for the preferred root element
# @return [String] the root's +to_html+ output as a string; "" when root_for
#   returns nil
def content_ratio_html_for(doc, selector: "main")
  root = root_for(doc, selector: selector, removed_selectors: CONTENT_RATIO_REMOVED_SELECTORS)
  # nil&.to_html is nil, and nil.to_s turns that into an empty string.
  root&.to_html.to_s
end

.html_for(doc, selector: "main") ⇒ `Object`



15
16
17

# File 'lib/crawlscope/document_text.rb', line 15

# Serializes the content root of +doc+ to HTML, using root_for's default
# list of removed elements.
#
# @param doc [Object, nil] parsed document, forwarded to root_for
# @param selector [String, nil] CSS selector for the preferred root element
# @return [String] the root's +to_html+ output as a string; "" when root_for
#   returns nil
def html_for(doc, selector: "main")
  root = root_for(doc, selector: selector)
  # nil&.to_html is nil, and nil.to_s turns that into an empty string.
  root&.to_html.to_s
end

.normalize(text) ⇒ `Object`



31
32
33

# File 'lib/crawlscope/document_text.rb', line 31

# Collapses every run of whitespace in +text+ to a single space and trims
# both ends.
#
# Uses the POSIX [[:space:]] class rather than \s: in Ruby \s matches only
# ASCII whitespace, so non-breaking spaces (U+00A0, what &nbsp; decodes to)
# and other Unicode spaces would otherwise survive inside and around the text.
#
# @param text [#to_s, nil] value to normalize; nil is treated as ""
# @return [String] a new, whitespace-normalized string
def normalize(text)
  text.to_s.gsub(/[[:space:]]+/, " ").strip
end

.root_for(doc, selector:, removed_selectors: REMOVED_SELECTORS) ⇒ `Object`

# File 'lib/crawlscope/document_text.rb', line 35

# Picks the node that text/HTML extraction should start from.
#
# Works on a duplicate of +doc+, so the removal below is applied to the
# duplicate rather than to +doc+ itself. Lookup order: the first node matching
# +selector+ (skipped when the selector is nil or empty), then the "body"
# node, then the pruned duplicate.
#
# @param doc [Object, nil] parsed document responding to dup/css/at_css
#   (presumably a Nokogiri document; confirm against callers)
# @param selector [String, nil] CSS selector for the preferred root
# @param removed_selectors [String] CSS selector list of nodes to remove first
# @return [Object, nil] the chosen node, or nil when +doc+ is nil/false
def root_for(doc, selector:, removed_selectors: REMOVED_SELECTORS)
  return nil unless doc

  pruned = doc.dup
  pruned.css(removed_selectors).remove

  # Only query for the preferred root when a selector was actually given.
  if selector.to_s.empty?
    preferred = nil
  else
    preferred = pruned.at_css(selector)
  end
  return preferred if preferred

  pruned.at_css("body") || pruned
end

.text_for(doc, selector: "main") ⇒ `Object`



23
24
25

# File 'lib/crawlscope/document_text.rb', line 23

# Extracts the text of the content root of +doc+ and passes it through
# normalize.
#
# @param doc [Object, nil] parsed document, forwarded to root_for
# @param selector [String, nil] CSS selector for the preferred root element
# @return [Object] the result of normalize; normalize receives nil when
#   root_for finds no root
def text_for(doc, selector: "main")
  root = root_for(doc, selector: selector)
  normalize(root&.text)
end

.tokens(text) ⇒ `Object`



27
28
29

# File 'lib/crawlscope/document_text.rb', line 27

# Splits +text+ into lowercase alphanumeric tokens, keeping only tokens that
# are at least two characters long.
#
# @param text [#to_s, nil] raw text; it is passed through normalize first
# @return [Array<String>] tokens in order of appearance
def tokens(text)
  words = normalize(text).downcase.scan(TOKEN_PATTERN)
  words.select { |word| word.length >= 2 }
end

Module: Crawlscope::DocumentText

Constant Summary collapse

Class Method Summary collapse

Class Method Details

.body_text(doc) ⇒ Object

.content_ratio_html_for(doc, selector: "main") ⇒ Object

.html_for(doc, selector: "main") ⇒ Object

.normalize(text) ⇒ Object

.root_for(doc, selector:, removed_selectors: REMOVED_SELECTORS) ⇒ Object

.text_for(doc, selector: "main") ⇒ Object

.tokens(text) ⇒ Object

.body_text(doc) ⇒ `Object`

.content_ratio_html_for(doc, selector: "main") ⇒ `Object`

.html_for(doc, selector: "main") ⇒ `Object`

.normalize(text) ⇒ `Object`

.root_for(doc, selector:, removed_selectors: REMOVED_SELECTORS) ⇒ `Object`

.text_for(doc, selector: "main") ⇒ `Object`

.tokens(text) ⇒ `Object`