Class: Canon::Comparison::HtmlComparator

Inherits:
MarkupComparator show all
Defined in:
lib/canon/comparison/html_comparator.rb

Overview

HTML comparison class. Handles comparison of HTML nodes with various options.

Inherits shared comparison functionality from MarkupComparator.

Constant Summary collapse

DEFAULT_OPTS =

Default comparison options for HTML

{
  # Structural filtering options.
  # equivalent? merges caller options over these defaults and forwards the
  # merged hash to XmlNodeComparison.filter_children / compare_nodes.
  ignore_children: false,
  ignore_text_nodes: false,
  ignore_attr_content: [],
  ignore_attrs: [],
  ignore_attrs_by_name: [],
  ignore_nodes: [],

  # Output options.
  # verbose: when truthy, equivalent? returns a ComparisonResult instead of
  #   a plain true/false.
  # diff_children: read by equivalent? (nil is coerced to false) and passed
  #   on to XmlNodeComparison.compare_nodes.
  verbose: false,
  diff_children: false,

  # Match system options.
  # All five are handed to MatchOptions::Xml.resolve (with format: :html)
  # to produce the resolved match options used for the comparison.
  match_profile: nil,
  match: nil,
  preprocessing: nil,
  global_profile: nil,
  global_options: nil,

  # Diff display options.
  # NOTE(review): not read directly by equivalent?; presumably consumed by
  # the diff rendering layer - confirm against callers.
  diff: nil,
}.freeze

Class Method Summary collapse

Methods inherited from MarkupComparator

add_difference, build_attribute_difference_reason, build_difference_reason, build_path_for_node, build_text_difference_reason, comment_node?, determine_node_dimension, enrich_diff_metadata, extract_attributes, extract_text_content_from_node, filter_children, node_excluded?, node_text, same_node_type?, serialize_element_node, serialize_node, text_node?, truncate_text, whitespace_only_difference?

Class Method Details

.equivalent?(html1, html2, opts = {}, child_opts = {}) ⇒ Boolean, Array

Compare two HTML nodes for equivalence

Parameters:

  • html1 (String, Nokogiri::HTML::Document)

    First HTML

  • html2 (String, Nokogiri::HTML::Document)

    Second HTML

  • opts (Hash) (defaults to: {})

    Comparison options

  • child_opts (Hash) (defaults to: {})

    Options for child comparison

Returns:

  • (Boolean, ComparisonResult)

    true if equivalent and false otherwise; in verbose mode, a ComparisonResult carrying the collected differences



60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
# File 'lib/canon/comparison/html_comparator.rb', line 60

# Compare two HTML inputs for equivalence.
#
# @param html1 [String, Nokogiri::HTML::Document] first HTML
# @param html2 [String, Nokogiri::HTML::Document] second HTML
# @param opts [Hash] comparison options, layered on top of DEFAULT_OPTS
# @param child_opts [Hash] overrides merged over the resolved options and
#   passed along for child comparison
# @return [Boolean, ComparisonResult] a ComparisonResult when
#   opts[:verbose] is truthy, otherwise true/false
def equivalent?(html1, html2, opts = {}, child_opts = {})
  options = DEFAULT_OPTS.merge(opts)

  # Format-specific resolution of the match settings.
  resolved = MatchOptions::Xml.resolve(
    format: :html,
    match_profile: options[:match_profile],
    match: options[:match],
    preprocessing: options[:preprocessing],
    global_profile: options[:global_profile],
    global_options: options[:global_options],
  )

  # Both inputs are parsed before the profile is built: the profile needs
  # the HTML version, which is detected from the first parsed node.
  node1 = parse_node(html1, resolved[:preprocessing], resolved)
  node2 = parse_node(html2, resolved[:preprocessing], resolved)

  profile = HtmlCompareProfile.new(
    resolved,
    html_version: detect_html_version_from_node(node1),
  )

  # DiffClassifier works with the wrapped form of the options.
  match_opts = Canon::Comparison::ResolvedMatchOptions.new(
    resolved,
    format: :html,
    compare_profile: profile,
  )

  # The raw resolved hash travels with the options into the comparison.
  options[:match_opts] = resolved

  # Semantic tree diff takes over completely when enabled; it receives the
  # original inputs, not the parsed nodes.
  if match_opts.semantic_diff?
    return perform_semantic_tree_diff(html1, html2, options, resolved)
  end

  nested_opts = options.merge(child_opts)

  # Serialized once from the already-preprocessed nodes so the diff display
  # does not preprocess again.
  display1 = serialize_for_display(node1)
  display2 = serialize_for_display(node2)

  differences = []
  diff_children = options[:diff_children] || false

  fragment = lambda do |node|
    node.is_a?(Nokogiri::HTML4::DocumentFragment) ||
      node.is_a?(Nokogiri::XML::DocumentFragment)
  end

  result =
    if fragment.call(node1) && fragment.call(node2)
      # Safety net for legacy callers that still hand over Nokogiri
      # fragments (parse_node is expected to return
      # Canon::Xml::Nodes::RootNode): compare the fragments' filtered
      # children pairwise rather than the fragment nodes themselves.
      raw_kids1 = node1.children.to_a
      raw_kids2 = node2.children.to_a
      kids1 = XmlNodeComparison.filter_children(raw_kids1, options)
      kids2 = XmlNodeComparison.filter_children(raw_kids2, options)

      if kids1.length != kids2.length
        # NOTE(review): nothing is appended to `differences` on this path,
        # so a verbose result carries no entry for the count mismatch.
        Comparison::UNEQUAL_ELEMENTS
      else
        # Stays EQUIVALENT when there are no children; otherwise the first
        # non-equivalent pair decides and stops the walk.
        outcome = Comparison::EQUIVALENT
        kids1.zip(kids2).each do |left, right|
          verdict = XmlNodeComparison.compare_nodes(
            left, right, options, nested_opts, diff_children, differences
          )
          if verdict != Comparison::EQUIVALENT
            outcome = verdict
            break
          end
        end
        outcome
      end
    else
      XmlNodeComparison.compare_nodes(
        node1, node2, options, nested_opts, diff_children, differences
      )
    end

  # Marks the collected DiffNode entries as normative/informative.
  classify = lambda do
    classifier = Canon::Diff::DiffClassifier.new(match_opts)
    classifier.classify_all(
      differences.select { |entry| entry.is_a?(Canon::Diff::DiffNode) },
    )
  end

  if options[:verbose]
    classify.call unless differences.empty?

    return ComparisonResult.new(
      differences: differences,
      preprocessed_strings: [display1, display2],
      format: :html,
      # NOTE(review): detected a second time from the same node; could
      # presumably reuse the value computed for the profile - confirm the
      # detector is side-effect free before changing.
      html_version: detect_html_version_from_node(node1),
      match_options: resolved,
      algorithm: :dom,
    )
  end

  # Non-verbose: with an equivalent result or nothing tracked, the raw
  # comparison result is the answer.
  equivalent = result == Comparison::EQUIVALENT
  return equivalent if equivalent || differences.empty?

  # Differences were found: still equivalent when none of them is
  # normative (matches the semantic algorithm).
  classify.call
  differences.none?(&:normative?)
end