Class: Guardrails::PartialSimilarity

Inherits:
Object
  • Object
show all
Defined in:
lib/guardrails/partial_similarity.rb

Defined Under Namespace

Classes: Finding

Constant Summary collapse

DEFAULT_THRESHOLD =
0.7
DEFAULT_NGRAM_SIZE =
3
MIN_TAGS =
5
PARTIAL_PATTERNS =

Scan ERB partials (underscore-prefixed in app/views and app/components) AND ViewComponent sidecar templates (*_component.html.erb in app/components).

[
  "app/views/**/_*.html.erb",
  "app/components/**/_*.html.erb",
  "app/components/**/*_component.html.erb"
].freeze
VOID_ELEMENT_NAMES =
%w[
  area base br col embed hr img input link meta param source track wbr
].to_set.freeze

Instance Method Summary collapse

Constructor Details

#initialize(root:, output: $stdout, threshold: DEFAULT_THRESHOLD, ngram_size: DEFAULT_NGRAM_SIZE) ⇒ PartialSimilarity

Returns a new instance of PartialSimilarity.



23
24
25
26
27
28
# File 'lib/guardrails/partial_similarity.rb', line 23

# Stores the scan configuration; does no work beyond coercing +root+.
#
# @param root [String, Pathname] base directory, coerced with Kernel#Pathname;
#   reported file paths are made relative to it
# @param output [IO] stored as @output (defaults to $stdout) — presumably the
#   report destination; confirm against print_report
# @param threshold [Numeric] pairs scoring below this value are not reported
# @param ngram_size [Integer] stored as @ngram_size — presumably the n-gram
#   window used by build_ngrams; confirm there
def initialize(root:, output: $stdout, threshold: DEFAULT_THRESHOLD, ngram_size: DEFAULT_NGRAM_SIZE)
  @root = Pathname(root)
  @ngram_size = ngram_size
  @threshold = threshold
  @output = output
end

Instance Method Details

#compute_findings ⇒ Object



36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
# File 'lib/guardrails/partial_similarity.rb', line 36

# Scores every pair of sufficiently large partials and keeps the similar ones.
#
# Each path returned by collect_partials is read as UTF-8 and tokenized; files
# that yield fewer than MIN_TAGS tokens are dropped before any pairing happens.
# Every remaining pair is scored with jaccard over its n-gram collections, and
# pairs scoring below @threshold are discarded.
#
# @return [Array<Finding>] one Finding per retained pair (paths relative to
#   @root), ordered by descending score
def compute_findings
  candidates = collect_partials.filter_map do |path|
    tags = tokenize(File.read(path, encoding: Encoding::UTF_8))
    { path: path, tokens: tags, ngrams: build_ngrams(tags) } if tags.length >= MIN_TAGS
  end

  similar = candidates.combination(2).filter_map do |left, right|
    similarity = jaccard(left[:ngrams], right[:ngrams])
    next if similarity < @threshold

    Finding.new(
      file_a: left[:path].relative_path_from(@root).to_s,
      file_b: right[:path].relative_path_from(@root).to_s,
      score: similarity,
      tag_count_a: left[:tokens].length,
      tag_count_b: right[:tokens].length
    )
  end

  similar.sort_by { |finding| -finding.score }
end

#element_tag_name(element) ⇒ Object



106
107
108
# File 'lib/guardrails/partial_similarity.rb', line 106

# Resolves the tag name of an element node through its open tag.
#
# @param element [Object] any node; only objects responding to +open_tag+
#   with a truthy value can yield a name
# @return [Object, nil] whatever open_tag_name returns for the open tag, or
#   nil when the element has no usable open tag
def element_tag_name(element)
  return unless element.respond_to?(:open_tag)

  opening = element.open_tag
  return unless opening

  open_tag_name(opening)
end

#group_findings(findings) ⇒ Object

Group findings by connected component over the similarity graph. When N partials are pairwise above-threshold (e.g. 8 templated public_activity partials all matching each other at 1.00), the naive pair list emits C(N,2) lines that read as noise; collapsing to one group of N is what the user actually cares about.

Returns an Array of Hashes keyed by:

:files       — sorted Array of file paths in the component
:score_min, :score_max — observed score range across the
                         component's pairs
:pair_count  — how many original pairs fed into the group
:sample_pair — a representative Finding (the only one for size-2
               components, used to preserve the original pair
               line's tag-count detail)


124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
# File 'lib/guardrails/partial_similarity.rb', line 124

# Collapses pairwise findings into connected components of the similarity
# graph, so N mutually similar files are reported as one group instead of
# C(N,2) separate pairs.
#
# @param findings [Array<#file_a, #file_b, #score>] pairwise findings; each
#   one is an undirected edge between its two files
# @return [Array<Hash>] one Hash per component with the keys
#   :files       — sorted Array of the component's file paths
#   :score_min   — lowest score among the component's pairs
#   :score_max   — highest score among the component's pairs
#   :pair_count  — number of distinct findings inside the component
#   :sample_pair — the component's first finding in input order
#   Groups are ordered largest first; groups of equal size are ordered by
#   their first file path, so the result does not depend on the (unstable)
#   tie handling of sort_by.
def group_findings(findings)
  adjacency = Hash.new { |hash, file| hash[file] = Set.new }
  findings.each do |finding|
    adjacency[finding.file_a] << finding.file_b
    adjacency[finding.file_b] << finding.file_a
  end

  # Label every file with the index of its component (iterative DFS, so deep
  # chains of similar files cannot overflow the call stack).
  component_of = {}
  component_count = 0
  adjacency.each_key do |start|
    next if component_of.key?(start)

    stack = [start]
    until stack.empty?
      file = stack.pop
      next if component_of.key?(file)

      component_of[file] = component_count
      adjacency[file].each { |neighbor| stack << neighbor unless component_of.key?(neighbor) }
    end
    component_count += 1
  end

  # Both endpoints of a finding always share a component, so the label of
  # file_a is enough to place it. Findings are de-duplicated by identity so
  # the same object passed twice is counted once.
  files_by_component = component_of.keys.group_by { |file| component_of[file] }
  pairs_by_component = findings.uniq(&:object_id).group_by { |finding| component_of[finding.file_a] }

  groups = files_by_component.map do |id, files|
    pairs = pairs_by_component.fetch(id)
    lowest, highest = pairs.map(&:score).minmax
    {
      files: files.sort,
      score_min: lowest,
      score_max: highest,
      pair_count: pairs.size,
      sample_pair: pairs.first
    }
  end

  # Components are disjoint, so the first file path is a unique tie-breaker.
  groups.sort_by { |group| [-group[:files].size, group[:files].first] }
end

#open_tag_name(node) ⇒ Object



101
102
103
104
# File 'lib/guardrails/partial_similarity.rb', line 101

# Extracts a lower-cased tag name from a node exposing a +tag_name+ token.
#
# @param node [Object] any node; it must respond to +tag_name+, and the
#   returned token must respond to +value+, for a name to be produced
# @return [String, nil] the token's value converted with to_s and downcased,
#   or nil when either accessor is unavailable or the token is nil/false
def open_tag_name(node)
  return nil unless node.respond_to?(:tag_name)

  token = node.tag_name
  return nil unless token && token.respond_to?(:value)

  token.value.to_s.downcase
end

#run ⇒ Object



30
31
32
33
34
# File 'lib/guardrails/partial_similarity.rb', line 30

# Computes the findings, hands them to print_report, and returns them.
#
# @return [Object] the value of compute_findings, regardless of what
#   print_report returns
def run
  compute_findings.tap { |results| print_report(results) }
end

#tokenize(content) ⇒ Object

Tokenize a partial into a flat sequence of HTML tag names by walking the parsed AST. Traversal is open-tag → recurse-into-body → close-tag so the resulting sequence preserves source order:

<div><span></span></div>  →  ["div", "span", "span", "div"]

ERB nodes don’t contribute tokens. Void elements (img, input) produce one token; their close-tag pass is skipped.



68
69
70
71
72
73
# File 'lib/guardrails/partial_similarity.rb', line 68

# Parses +content+ with ErbParser and collects its tag-name tokens.
#
# The parsed result's +document+ is handed to walk_for_tokens together with
# a fresh Array, which that method fills in place.
#
# @param content [String] template source to parse
# @return [Array] the tokens appended by walk_for_tokens, in append order
def tokenize(content)
  document = ErbParser.parse(content).document
  [].tap { |collected| walk_for_tokens(document, collected) }
end

#void_element_name?(name) ⇒ Boolean

Returns:

  • (Boolean)


97
98
99
# File 'lib/guardrails/partial_similarity.rb', line 97

# Tells whether +name+ is listed in VOID_ELEMENT_NAMES.
#
# The comparison is exact: no case folding or stripping is applied here.
#
# @param name [String, nil] tag name to look up
# @return [Boolean] true when the name is in the void-element set
def void_element_name?(name)
  VOID_ELEMENT_NAMES.member?(name)
end

#walk_for_tokens(node, tokens) ⇒ Object



75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
# File 'lib/guardrails/partial_similarity.rb', line 75

# Appends the tag-name tokens of +node+ and its descendants to +tokens+.
#
# * An HTMLElementNode with a resolvable name contributes its name, then the
#   tokens of every child in its body, then its name again unless the name
#   is a void element.
# * An HTMLElementNode whose name cannot be resolved contributes nothing and
#   its body is not visited.
# * A bare HTMLOpenTagNode contributes its name only when it is a void
#   element.
# * Any other node is transparent: the walk continues into the children
#   returned by ErbParser.compact_children.
#
# @param node [Object] AST node to visit
# @param tokens [Array<String>] accumulator, mutated in place
def walk_for_tokens(node, tokens)
  case node
  when ::Herb::AST::HTMLElementNode
    tag = element_tag_name(node)
    return unless tag

    tokens << tag
    Array(node.body).each { |inner| walk_for_tokens(inner, tokens) }
    tokens << tag unless void_element_name?(tag)
  when ::Herb::AST::HTMLOpenTagNode
    # An open tag that is not wrapped in an element node only counts when it
    # names a void element.
    tag = open_tag_name(node)
    tokens << tag if tag && void_element_name?(tag)
  else
    ErbParser.compact_children(node).each { |inner| walk_for_tokens(inner, tokens) }
  end
end