Class: Crawlscope::Rules::Uniqueness

Inherits:
Object
  • Object
show all
Defined in:
lib/crawlscope/rules/uniqueness.rb

Constant Summary collapse

# Default tuning values; #initialize uses each of these as the default for
# the keyword argument of the matching (lower-cased) name.
MINIMUM_SHINGLES = 10
MAX_NEAR_DUPLICATE_PAGES = 250
NEAR_DUPLICATE_THRESHOLD = 0.9
SHINGLE_SIZE = 5

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(near_duplicate_threshold: NEAR_DUPLICATE_THRESHOLD, max_near_duplicate_pages: MAX_NEAR_DUPLICATE_PAGES, minimum_shingles: MINIMUM_SHINGLES, shingle_size: SHINGLE_SIZE) ⇒ Uniqueness

Returns a new instance of Uniqueness.



15
16
17
18
19
20
21
22
23
24
25
26
# File 'lib/crawlscope/rules/uniqueness.rb', line 15

# Builds the rule and records its tuning values on the instance.
#
# Every option is stored as-is in an instance variable of the same name; no
# validation or coercion happens here. The rule's +code+ is always
# +:uniqueness+.
#
# NOTE(review): the parameter descriptions below are inferred from the names;
# confirm against the methods that read these instance variables.
#
# @param near_duplicate_threshold [Object] presumably the similarity score at
#   or above which two pages count as near duplicates
# @param max_near_duplicate_pages [Object] presumably a cap on how many pages
#   take part in near-duplicate comparison
# @param minimum_shingles [Object] presumably the fewest shingles a page needs
#   to be compared
# @param shingle_size [Object] presumably the number of tokens per shingle
def initialize(
  near_duplicate_threshold: NEAR_DUPLICATE_THRESHOLD,
  max_near_duplicate_pages: MAX_NEAR_DUPLICATE_PAGES,
  minimum_shingles: MINIMUM_SHINGLES,
  shingle_size: SHINGLE_SIZE
)
  # Fixed identifier for this rule.
  @code = :uniqueness

  # Tuning values, kept in the same order as the signature.
  @near_duplicate_threshold = near_duplicate_threshold
  @max_near_duplicate_pages = max_near_duplicate_pages
  @minimum_shingles = minimum_shingles
  @shingle_size = shingle_size
end

Instance Attribute Details

#code ⇒ Object (readonly)

Returns the value of attribute code.



13
14
15
# File 'lib/crawlscope/rules/uniqueness.rb', line 13

# Reader for the rule's identifier.
#
# @return [Object] whatever is stored in +@code+; the constructor assigns
#   +:uniqueness+ to it
def code
  @code
end

Instance Method Details

#call(urls:, pages:, issues:, context:) ⇒ Object



28
29
30
31
32
33
34
35
36
37
# File 'lib/crawlscope/rules/uniqueness.rb', line 28

# Runs the uniqueness checks over the crawled pages.
#
# Each page that answers true to +html?+ is passed to +summary_for+; falsy
# summaries are dropped. The remaining summaries are handed first to
# +validate_duplicates+ and then to +validate_near_duplicates+, together with
# +issues+.
#
# @param urls [Object] accepted as part of the call signature; not read here
# @param pages [Enumerable] objects responding to +html?+
# @param issues [Object] passed through untouched to both validators
# @param context [Object] accepted as part of the call signature; not read here
# @return [Object] whatever +validate_near_duplicates+ returns
def call(urls:, pages:, issues:, context:)
  # One pass per page: html? is asked first and summary_for only runs for
  # HTML pages, so the helpers are invoked in the same order as the pages.
  candidates = pages.map { |page| summary_for(page) if page.html? }

  # select(&:itself) rather than compact: a false summary must be dropped
  # along with nil ones.
  html_summaries = candidates.select(&:itself)

  validate_duplicates(html_summaries, issues)
  validate_near_duplicates(html_summaries, issues)
end