Class: Guardrails::PartialSimilarity
- Inherits:
-
Object
- Object
- Guardrails::PartialSimilarity
- Defined in:
- lib/guardrails/partial_similarity.rb
Defined Under Namespace
Classes: Finding
Constant Summary collapse
- DEFAULT_THRESHOLD =
0.7
- DEFAULT_NGRAM_SIZE =
3
- MIN_TAGS =
5
- PARTIAL_PATTERNS =
Scan ERB partials (underscore-prefixed in app/views and app/components) AND ViewComponent sidecar templates (*_component.html.erb in app/components).
[ "app/views/**/_*.html.erb", "app/components/**/_*.html.erb", "app/components/**/*_component.html.erb" ].freeze
- VOID_ELEMENT_NAMES =
%w[ area base br col embed hr img input link meta param source track wbr ].to_set.freeze
Instance Method Summary collapse
- #compute_findings ⇒ Object
- #element_tag_name(element) ⇒ Object
-
#group_findings(findings) ⇒ Object
Group findings by connected component over the similarity graph.
-
#initialize(root:, output: $stdout, threshold: DEFAULT_THRESHOLD, ngram_size: DEFAULT_NGRAM_SIZE) ⇒ PartialSimilarity
constructor
A new instance of PartialSimilarity.
- #open_tag_name(node) ⇒ Object
- #run ⇒ Object
-
#tokenize(content) ⇒ Object
Tokenize a partial into a flat sequence of HTML tag names by walking the parsed AST.
- #void_element_name?(name) ⇒ Boolean
- #walk_for_tokens(node, tokens) ⇒ Object
Constructor Details
#initialize(root:, output: $stdout, threshold: DEFAULT_THRESHOLD, ngram_size: DEFAULT_NGRAM_SIZE) ⇒ PartialSimilarity
Returns a new instance of PartialSimilarity.
23 24 25 26 27 28 |
# File 'lib/guardrails/partial_similarity.rb', line 23

# Builds a scanner bound to a project root.
#
# @param root [String, Pathname] project root; coerced with Pathname() and
#   used by compute_findings to report paths relative to it
# @param output [IO] stored as @output (defaults to $stdout)
# @param threshold [Numeric] stored as @threshold; compute_findings drops
#   pairs whose score is below this value
# @param ngram_size [Integer] stored as @ngram_size
def initialize(root:, output: $stdout, threshold: DEFAULT_THRESHOLD, ngram_size: DEFAULT_NGRAM_SIZE)
  # Coerce first so an invalid root fails before any other state is set.
  @root = Pathname(root)
  @output, @threshold, @ngram_size = output, threshold, ngram_size
end
Instance Method Details
#compute_findings ⇒ Object
36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 |
# File 'lib/guardrails/partial_similarity.rb', line 36

# Scores every pair of collected partials and keeps the pairs at or above
# the configured threshold.
#
# Partials that tokenize to fewer than MIN_TAGS tags are ignored entirely.
#
# @return [Array<Finding>] findings ordered by descending score, with file
#   paths expressed relative to @root
def compute_findings
  candidates = collect_partials.filter_map do |path|
    tags = tokenize(File.read(path, encoding: Encoding::UTF_8))
    next if tags.length < MIN_TAGS

    { path: path, tokens: tags, ngrams: build_ngrams(tags) }
  end

  matches = candidates.combination(2).filter_map do |left, right|
    similarity = jaccard(left[:ngrams], right[:ngrams])
    # Pairs exactly at the threshold are kept; only strictly lower scores drop out.
    next if similarity < @threshold

    Finding.new(
      file_a: left[:path].relative_path_from(@root).to_s,
      file_b: right[:path].relative_path_from(@root).to_s,
      score: similarity,
      tag_count_a: left[:tokens].length,
      tag_count_b: right[:tokens].length
    )
  end

  matches.sort_by { |finding| -finding.score }
end
#element_tag_name(element) ⇒ Object
106 107 108 |
# File 'lib/guardrails/partial_similarity.rb', line 106

# Resolves the tag name of an element node through its open tag.
#
# @param element [Object] any node; only nodes responding to #open_tag
#   with a truthy value are resolved
# @return [String, nil] whatever open_tag_name returns for the open tag,
#   or nil when the node has no open tag
def element_tag_name(element)
  return unless element.respond_to?(:open_tag)

  open_tag = element.open_tag
  return unless open_tag

  open_tag_name(open_tag)
end
#group_findings(findings) ⇒ Object
Group findings by connected component over the similarity graph. When N partials are pairwise above-threshold (e.g. 8 templated public_activity partials all matching each other at 1.00), the naive pair list emits C(N,2) lines that read as noise; collapsing to one group of N is what the user actually cares about.
Returns an Array of Hashes keyed by:
:files — sorted Array of file paths in the component
:score_min, :score_max — observed score range across the
component's pairs
:pair_count — how many original pairs fed into the group
:sample_pair — a representative Finding (the only one for size-2
components, used to preserve the original pair
line's tag-count detail)
124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 |
# File 'lib/guardrails/partial_similarity.rb', line 124

# Collapses pairwise findings into connected components of the similarity
# graph, so N mutually similar partials are reported as one group instead
# of C(N,2) separate pairs.
#
# @param findings [Array<#file_a, #file_b, #score>] pairwise findings
# @return [Array<Hash>] one Hash per component with keys
#   :files (sorted Array of paths), :score_min, :score_max,
#   :pair_count and :sample_pair (the first finding met while walking the
#   component). Groups are ordered largest first; groups of equal size are
#   ordered by their sorted file list so the result does not depend on
#   sort stability or on the order findings were supplied in.
def group_findings(findings)
  neighbors, pairs_by_file = similarity_graph_for(findings)
  visited = Set.new

  groups = neighbors.each_key.filter_map do |file|
    next if visited.include?(file)

    component = component_reachable_from(file, neighbors)
    visited.merge(component)
    group_summary_for(component, pairs_by_file)
  end

  groups.sort_by { |group| [-group[:files].size, group[:files]] }
end

# Builds the undirected adjacency map and a per-file index of findings.
private def similarity_graph_for(findings)
  neighbors = Hash.new { |hash, key| hash[key] = Set.new }
  pairs_by_file = Hash.new { |hash, key| hash[key] = [] }

  findings.each do |finding|
    neighbors[finding.file_a] << finding.file_b
    neighbors[finding.file_b] << finding.file_a
    pairs_by_file[finding.file_a] << finding
    pairs_by_file[finding.file_b] << finding
  end

  [neighbors, pairs_by_file]
end

# Depth-first walk returning every file reachable from +start+, in visit order.
private def component_reachable_from(start, neighbors)
  component = Set.new
  stack = [start]

  until stack.empty?
    current = stack.pop
    next unless component.add?(current)

    # fetch (not []) so the default block can never add keys mid-iteration.
    neighbors.fetch(current).each do |neighbor|
      stack << neighbor unless component.include?(neighbor)
    end
  end

  component
end

# Summarises one component: its files, score range, pair count and a sample pair.
private def group_summary_for(component, pairs_by_file)
  # Every finding touching a file in the component has both endpoints in
  # the component (they are adjacent), so no membership filter is needed.
  # Each finding is indexed under both of its files; dedupe by identity.
  pairs = component.flat_map { |file| pairs_by_file.fetch(file) }.uniq(&:object_id)
  lowest, highest = pairs.map(&:score).minmax

  {
    files: component.sort,
    score_min: lowest,
    score_max: highest,
    pair_count: pairs.size,
    sample_pair: pairs.first
  }
end
#open_tag_name(node) ⇒ Object
101 102 103 104 |
# File 'lib/guardrails/partial_similarity.rb', line 101

# Extracts a lower-cased tag name from an open-tag node.
#
# @param node [Object] any node; it is only inspected when it responds to
#   #tag_name and that token in turn responds to #value
# @return [String, nil] the token value converted with to_s and downcased,
#   or nil when either accessor is unavailable or the token is falsy
def open_tag_name(node)
  return unless node.respond_to?(:tag_name)

  token = node.tag_name
  return unless token && token.respond_to?(:value)

  token.value.to_s.downcase
end
#run ⇒ Object
30 31 32 33 34 |
# File 'lib/guardrails/partial_similarity.rb', line 30

# Computes the findings, hands them to print_report, and returns them.
#
# @return [Object] the value produced by compute_findings
def run
  compute_findings.tap { |findings| print_report(findings) }
end
#tokenize(content) ⇒ Object
Tokenize a partial into a flat sequence of HTML tag names by walking the parsed AST. Traversal is open-tag → recurse-into-body → close-tag so the resulting sequence preserves source order:
<div><span></span></div> → ["div", "span", "span", "div"]
ERB nodes don’t contribute tokens. Void elements (img, input) produce one token; their close-tag pass is skipped.
68 69 70 71 72 73 |
# File 'lib/guardrails/partial_similarity.rb', line 68

# Parses template source with ErbParser and collects tag-name tokens by
# walking the resulting document with walk_for_tokens.
#
# @param content [String] template source
# @return [Array] the tokens appended by walk_for_tokens, in walk order
def tokenize(content)
  document = ErbParser.parse(content).document
  [].tap { |collected| walk_for_tokens(document, collected) }
end
#void_element_name?(name) ⇒ Boolean
97 98 99 |
# File 'lib/guardrails/partial_similarity.rb', line 97

# Tells whether +name+ is listed in VOID_ELEMENT_NAMES.
#
# @param name [String] tag name to look up; compared as given
# @return [Boolean]
def void_element_name?(name)
  VOID_ELEMENT_NAMES.member?(name)
end
#walk_for_tokens(node, tokens) ⇒ Object
75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 |
# File 'lib/guardrails/partial_similarity.rb', line 75

# Recursively appends tag names to +tokens+ in source order.
#
# * Element nodes contribute their name, then their body's tokens, then
#   the name again unless the element is void.
# * Bare open-tag nodes contribute their name only when it is a void element.
# * Any other node contributes nothing itself; its children (as returned by
#   ErbParser.compact_children) are walked.
#
# @param node [Object] AST node to visit
# @param tokens [Array<String>] accumulator, mutated in place
def walk_for_tokens(node, tokens)
  case node
  when ::Herb::AST::HTMLElementNode
    tag = element_tag_name(node)
    # An element whose name cannot be resolved is skipped along with its body.
    return unless tag

    tokens << tag
    Array(node.body).each do |child|
      walk_for_tokens(child, tokens)
    end
    tokens << tag unless void_element_name?(tag)
  when ::Herb::AST::HTMLOpenTagNode
    # Top-level void element not wrapped in HTMLElementNode.
    tag = open_tag_name(node)
    tokens << tag if tag && void_element_name?(tag)
  else
    ErbParser.compact_children(node).each do |child|
      walk_for_tokens(child, tokens)
    end
  end
end