Class: Guardrails::PartialSimilarity
- Inherits:
-
Object
- Object
- Guardrails::PartialSimilarity
- Defined in:
- lib/guardrails/partial_similarity.rb
Defined Under Namespace
Classes: Finding
Constant Summary collapse
- DEFAULT_THRESHOLD =
0.7
- DEFAULT_NGRAM_SIZE =
3
- MIN_TAGS =
5
- PARTIAL_PATTERNS =
Scan ERB partials (underscore-prefixed in app/views and app/components) AND ViewComponent sidecar templates (*_component.html.erb in app/components).
[ "app/views/**/_*.html.erb", "app/components/**/_*.html.erb", "app/components/**/*_component.html.erb" ].freeze
- VOID_ELEMENT_NAMES =
%w[ area base br col embed hr img input link meta param source track wbr ].to_set.freeze
Instance Method Summary collapse
- #compute_findings ⇒ Object
- #element_tag_name(element) ⇒ Object
-
#group_findings(findings) ⇒ Object
Group findings by connected component over the similarity graph.
-
#initialize(root:, output: $stdout, threshold: DEFAULT_THRESHOLD, ngram_size: DEFAULT_NGRAM_SIZE, style: nil) ⇒ PartialSimilarity
constructor
A new instance of PartialSimilarity.
- #open_tag_name(node) ⇒ Object
- #run ⇒ Object
-
#tokenize(content) ⇒ Object
Tokenize a partial into a flat sequence of HTML tag names by walking the parsed AST.
- #void_element_name?(name) ⇒ Boolean
- #walk_for_tokens(node, tokens) ⇒ Object
Constructor Details
#initialize(root:, output: $stdout, threshold: DEFAULT_THRESHOLD, ngram_size: DEFAULT_NGRAM_SIZE, style: nil) ⇒ PartialSimilarity
Returns a new instance of PartialSimilarity.
24 25 26 27 28 29 30 31 |
# File 'lib/guardrails/partial_similarity.rb', line 24
#
# Stores the scan configuration on the instance.
#
# @param root the directory findings are reported relative to; coerced with Pathname()
# @param output the IO handed to the default report style
# @param threshold minimum similarity score a pair must reach to be reported
# @param ngram_size n-gram length stored for the similarity computation
# @param style report style object; when nil/false a Report::Style bound to
#   +output+ is created
def initialize(root:, output: $stdout, threshold: DEFAULT_THRESHOLD, ngram_size: DEFAULT_NGRAM_SIZE, style: nil)
  @root = Pathname(root)
  @output = output
  @threshold, @ngram_size = threshold, ngram_size

  # Only build the default style when the caller did not supply one.
  @style = style
  @style ||= Report::Style.new(io: @output)
end
Instance Method Details
#compute_findings ⇒ Object
39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 |
# File 'lib/guardrails/partial_similarity.rb', line 39
#
# Compares every pair of collected partials and returns the pairs whose
# n-gram Jaccard score is at or above the configured threshold.
#
# Partials with fewer than MIN_TAGS tokens are skipped before comparison.
#
# @return [Array<Finding>] findings ordered by descending score; equal scores
#   are ordered by file_a, then file_b, so the result is deterministic
def compute_findings
  partials = collect_partials.filter_map do |path|
    tokens = tokenize(File.read(path, encoding: Encoding::UTF_8))
    next if tokens.length < MIN_TAGS

    {
      # Resolved once per partial instead of once per reported pair.
      file: path.relative_path_from(@root).to_s,
      tag_count: tokens.length,
      ngrams: build_ngrams(tokens)
    }
  end

  findings = partials.combination(2).filter_map do |a, b|
    score = jaccard(a[:ngrams], b[:ngrams])
    next if score < @threshold

    Finding.new(
      file_a: a[:file],
      file_b: b[:file],
      score: score,
      tag_count_a: a[:tag_count],
      tag_count_b: b[:tag_count]
    )
  end

  # sort_by is not stable, so tie-break on the file names to keep runs
  # with equal scores in a reproducible order.
  findings.sort_by { |finding| [-finding.score, finding.file_a, finding.file_b] }
end
#element_tag_name(element) ⇒ Object
109 110 111 |
# File 'lib/guardrails/partial_similarity.rb', line 109
#
# Returns the tag name of an element by delegating to #open_tag_name on its
# open tag. Returns nil when the element does not respond to +open_tag+ or
# has no open tag.
def element_tag_name(element)
  return nil unless element.respond_to?(:open_tag)
  return nil unless element.open_tag

  open_tag_name(element.open_tag)
end
#group_findings(findings) ⇒ Object
Group findings by connected component over the similarity graph. When N partials are pairwise above-threshold (e.g. 8 templated public_activity partials all matching each other at 1.00), the naive pair list emits C(N,2) lines that read as noise; collapsing to one group of N is what the user actually cares about.
Returns an Array of Hashes keyed by:
:files — sorted Array of file paths in the component
:score_min, :score_max — observed score range across the
component's pairs
:pair_count — how many original pairs fed into the group
:sample_pair — a representative Finding (the only one for size-2
components, used to preserve the original pair
line's tag-count detail)
127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 |
# File 'lib/guardrails/partial_similarity.rb', line 127
#
# Groups findings by connected component over the similarity graph, so that
# N mutually similar partials are reported as one group instead of up to
# C(N,2) separate pairs.
#
# @param findings [Array] objects responding to +file_a+, +file_b+ and +score+
# @return [Array<Hash>] one Hash per component, largest component first
#   (ties broken by the component's first file name), with keys:
#   :files       - sorted Array of file paths in the component
#   :score_min   - lowest score among the component's pairs
#   :score_max   - highest score among the component's pairs
#   :pair_count  - number of distinct finding objects in the component
#   :sample_pair - the earliest finding (in input order) in the component
def group_findings(findings)
  neighbors = Hash.new { |hash, file| hash[file] = Set.new }
  findings.each do |finding|
    neighbors[finding.file_a] << finding.file_b
    neighbors[finding.file_b] << finding.file_a
  end

  # Label every file with the first-seen file of its component. Files are
  # marked when pushed, so each one enters the stack at most once.
  component_of = {}
  neighbors.each_key do |start|
    next if component_of.key?(start)

    component_of[start] = start
    stack = [start]
    until stack.empty?
      neighbors[stack.pop].each do |neighbor|
        next if component_of.key?(neighbor)

        component_of[neighbor] = start
        stack << neighbor
      end
    end
  end

  files_by_component = component_of.keys.group_by { |file| component_of[file] }

  # Both endpoints of a finding always share a component, so one pass keyed
  # on file_a is enough. The same finding object listed twice counts once.
  pairs_by_component = findings.uniq(&:object_id).group_by { |finding| component_of[finding.file_a] }

  groups = files_by_component.map do |label, files|
    pairs = pairs_by_component.fetch(label)
    scores = pairs.map(&:score)
    {
      files: files.sort,
      score_min: scores.min,
      score_max: scores.max,
      pair_count: pairs.size,
      sample_pair: pairs.first
    }
  end

  # sort_by is not stable; components are disjoint, so the first file name
  # is a unique tie-breaker and the output order is reproducible.
  groups.sort_by { |group| [-group[:files].size, group[:files].first] }
end
#open_tag_name(node) ⇒ Object
104 105 106 107 |
# File 'lib/guardrails/partial_similarity.rb', line 104
#
# Extracts the lower-cased tag name from an open-tag node.
#
# @param node any object; only used if it responds to +tag_name+
# @return [String, nil] the downcased name, or nil when the node has no
#   +tag_name+, the tag-name token has no +value+, or the value is blank
def open_tag_name(node)
  return nil unless node.respond_to?(:tag_name)

  token = node.tag_name
  return nil unless token.respond_to?(:value)

  name = token.value.to_s.downcase
  # An empty string is truthy in Ruby; return nil so callers' `if name`
  # guards treat a blank name as "no name".
  name.empty? ? nil : name
end
#run ⇒ Object
33 34 35 36 37 |
# File 'lib/guardrails/partial_similarity.rb', line 33
#
# Computes the findings, passes them to print_report, and returns them.
def run
  compute_findings.tap { |findings| print_report(findings) }
end
#tokenize(content) ⇒ Object
Tokenize a partial into a flat sequence of HTML tag names by walking the parsed AST. Traversal is open-tag → recurse-into-body → close-tag so the resulting sequence preserves source order:
<div><span></span></div> → ["div", "span", "span", "div"]
ERB nodes don’t contribute tokens. Void elements (img, input) produce one token; their close-tag pass is skipped.
71 72 73 74 75 76 |
# File 'lib/guardrails/partial_similarity.rb', line 71
#
# Parses +content+ with ErbParser and returns the flat list of tag-name
# tokens that #walk_for_tokens collects from the parsed document.
#
# @param content the template source passed to ErbParser.parse
# @return [Array] the collected tokens, in traversal order
def tokenize(content)
  tag_names = []
  document = ErbParser.parse(content).document
  walk_for_tokens(document, tag_names)
  tag_names
end
#void_element_name?(name) ⇒ Boolean
100 101 102 |
# File 'lib/guardrails/partial_similarity.rb', line 100
#
# Whether +name+ is listed in VOID_ELEMENT_NAMES.
#
# @return [Boolean]
def void_element_name?(name)
  VOID_ELEMENT_NAMES.member?(name)
end
#walk_for_tokens(node, tokens) ⇒ Object
78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 |
# File 'lib/guardrails/partial_similarity.rb', line 78
#
# Recursively appends tag-name tokens for +node+ and its descendants to
# +tokens+ (mutated in place).
#
# - HTML element: name, then the tokens of its body, then the name again
#   unless it is a void element. An element whose name cannot be resolved
#   contributes nothing.
# - Bare open tag: contributes its name only when it is a void element.
# - Anything else: recurse into ErbParser.compact_children(node).
def walk_for_tokens(node, tokens)
  if node.is_a?(::Herb::AST::HTMLElementNode)
    tag = element_tag_name(node)
    return unless tag

    tokens << tag
    Array(node.body).each { |child| walk_for_tokens(child, tokens) }
    tokens << tag unless void_element_name?(tag)
  elsif node.is_a?(::Herb::AST::HTMLOpenTagNode)
    # Top-level void element not wrapped in HTMLElementNode.
    tag = open_tag_name(node)
    tokens << tag if tag && void_element_name?(tag)
  else
    ErbParser.compact_children(node).each { |child| walk_for_tokens(child, tokens) }
  end
end