Class: Canon::Comparison::HtmlComparator

Inherits:
MarkupComparator show all
Defined in:
lib/canon/comparison/html_comparator.rb

Overview

HTML comparison class. Handles comparison of HTML nodes with various options.

Inherits shared comparison functionality from MarkupComparator.

Constant Summary collapse

DEFAULT_OPTS =

Default comparison options for HTML

{
  # Structural filtering options
  # NOTE(review): equivalent? does not read these keys itself; they stay in
  # the opts hash that is handed on to the node-comparison helpers. Confirm
  # their exact semantics there.
  ignore_children: false,
  ignore_text_nodes: false,
  ignore_attr_content: [],
  ignore_attrs: [],
  ignore_attrs_by_name: [],
  ignore_nodes: [],

  # Output options
  # verbose: when truthy, equivalent? returns a ComparisonResult instead of
  # a plain boolean.
  verbose: false,
  # diff_children: read by equivalent? (nil is coerced to false) and passed
  # through to the node comparison call.
  diff_children: false,

  # Match system options
  # All five are forwarded by equivalent? to MatchOptions::Xml.resolve with
  # format: :html. Presumably nil lets the resolver pick its format-specific
  # defaults — verify against the resolver.
  match_profile: nil,
  match: nil,
  preprocessing: nil,
  global_profile: nil,
  global_options: nil,

  # Diff display options
  # NOTE(review): not read by equivalent?; consumed elsewhere, if at all.
  diff: nil,
# Frozen so the shared defaults cannot be mutated; equivalent? merges caller
# options into a new hash rather than modifying this one.
}.freeze

Class Method Summary collapse

Methods inherited from MarkupComparator

add_difference, build_attribute_difference_reason, build_difference_reason, build_path_for_node, build_text_difference_reason, comment_node?, determine_node_dimension, enrich_diff_metadata, extract_attributes, extract_text_content_from_node, filter_children, node_excluded?, node_text, same_node_type?, serialize_element_node, serialize_node, text_node?, truncate_text, whitespace_only_difference?

Class Method Details

.equivalent?(html1, html2, opts = {}, child_opts = {}) ⇒ Boolean, ComparisonResult

Compare two HTML nodes for equivalence

Parameters:

  • html1 (String, Nokogiri::HTML::Document)

    First HTML

  • html2 (String, Nokogiri::HTML::Document)

    Second HTML

  • opts (Hash) (defaults to: {})

    Comparison options

  • child_opts (Hash) (defaults to: {})

    Options for child comparison

Returns:

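  • (Boolean, ComparisonResult)

    true if equivalent (no normative differences), false otherwise; when the verbose option is set, a ComparisonResult carrying the differences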



60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
# File 'lib/canon/comparison/html_comparator.rb', line 60

# Compare two HTML inputs for equivalence.
#
# Caller options are layered over DEFAULT_OPTS, match options are resolved
# for the :html format, both inputs are parsed, and the parsed nodes are
# compared. When the resolved match options request a semantic diff, the
# work is delegated to perform_semantic_tree_diff and its result is
# returned unchanged.
#
# @param html1 [String, Nokogiri::HTML::Document] first HTML
# @param html2 [String, Nokogiri::HTML::Document] second HTML
# @param opts [Hash] comparison options (see DEFAULT_OPTS); the private
#   keys :_original_str1 / :_original_str2 may carry the untouched source
#   strings for display
# @param child_opts [Hash] options for child comparison, merged over opts
# @return [Boolean, ComparisonResult] a ComparisonResult when
#   opts[:verbose] is truthy; otherwise true when the comparison reports
#   equivalence or no recorded difference is normative
def equivalent?(html1, html2, opts = {}, child_opts = {})
  opts = DEFAULT_OPTS.merge(opts)

  # Source strings for display: take the copies handed over by the caller
  # when present (and strip them from opts), otherwise derive them from
  # the inputs.
  source1 = opts.delete(:_original_str1)
  source1 ||= extract_original_string(html1)
  source2 = opts.delete(:_original_str2)
  source2 ||= extract_original_string(html2)

  # Resolve match options with format-specific defaults.
  resolved = MatchOptions::Xml.resolve(
    format: :html,
    match_profile: opts[:match_profile],
    match: opts[:match],
    preprocessing: opts[:preprocessing],
    global_profile: opts[:global_profile],
    global_options: opts[:global_options],
  )

  # Parsing happens up front because the compare profile needs the HTML
  # version, which is detected from the first parsed node.
  left, right = [html1, html2].map do |html|
    parse_node(html, resolved[:preprocessing], resolved)
  end

  profile = HtmlCompareProfile.new(
    resolved,
    html_version: detect_html_version_from_node(left),
  )

  # ResolvedMatchOptions is the form DiffClassifier consumes.
  match_opts = Canon::Comparison::ResolvedMatchOptions.new(
    resolved, format: :html, compare_profile: profile
  )

  # Expose the resolved hash to the comparison logic.
  opts[:match_opts] = resolved

  # The semantic tree diff takes over completely when enabled.
  if match_opts.semantic_diff?
    return perform_semantic_tree_diff(html1, html2, opts, resolved)
  end

  child_opts = opts.merge(child_opts)

  # Serialize the already-preprocessed nodes for display so preprocessing
  # is not run a second time.
  display1, display2 = [left, right].map do |node|
    serialize_for_display(node)
  end

  differences = []
  diff_children = opts[:diff_children] || false
  comparison_args = [left, right, opts, child_opts, diff_children,
                     differences]

  # Fragment nodes are compared through their children rather than as
  # nodes themselves; every other node pair goes through the regular node
  # comparison.
  result = if fragment_nodes?(left, right)
             compare_fragment_children(*comparison_args)
           else
             XmlNodeComparison.compare_nodes(*comparison_args)
           end

  if opts[:verbose]
    # Classify DiffNodes as normative/informative before reporting.
    unless differences.empty?
      Canon::Diff::DiffClassifier.new(match_opts)
        .classify_all(differences.grep(Canon::Diff::DiffNode))
    end

    return ComparisonResult.new(
      differences: differences,
      preprocessed_strings: [display1, display2],
      original_strings: [source1, source2],
      format: :html,
      html_version: detect_html_version_from_node(left),
      match_options: resolved,
      algorithm: :dom,
    )
  end

  # Non-verbose mode: with an EQUIVALENT result or nothing recorded, the
  # raw comparison result decides.
  if result == Comparison::EQUIVALENT || differences.empty?
    return result == Comparison::EQUIVALENT
  end

  # Differences were recorded: the inputs still count as equivalent when
  # none of them is normative (matches the semantic algorithm).
  Canon::Diff::DiffClassifier.new(match_opts)
    .classify_all(differences.grep(Canon::Diff::DiffNode))
  differences.none?(&:normative?)
end