Module: Scrapetor::XPath::CssTranslator

Defined in:
lib/scrapetor/xpath.rb

Overview

CSS Translator: converts simple XPath AST shapes to CSS selectors so that the heavily optimised native CSS matcher can answer them in one C call. Returns nil if the AST contains anything that doesn’t round-trip cleanly to CSS (boolean `or`, last(), position() compared against anything other than a numeric constant, text() steps, ancestor and other unsupported axes, etc.); the caller then falls back to the full evaluator.

Class Method Summary collapse

Class Method Details

.collect_pred_strs(preds) ⇒ Object



760
761
762
763
764
765
766
767
768
# File 'lib/scrapetor/xpath.rb', line 760

# Translates every predicate of one step into its CSS fragment.
#
# @param preds [Array] predicate ASTs; each is handed to translate_predicate
# @return [Array, nil] the fragments in predicate order, or nil as soon as
#   one predicate has no CSS translation
def self.collect_pred_strs(preds)
  preds.each_with_object([]) do |pred, fragments|
    fragment = translate_predicate(pred)
    # Bail out of the whole method: one untranslatable predicate makes the
    # entire step untranslatable.
    return nil if fragment.nil?

    fragments << fragment
  end
end

.const_int(ast) ⇒ Object



774
775
776
777
778
# File 'lib/scrapetor/xpath.rb', line 774

# Extracts an integer constant from a numeric literal node.
#
# A non-integral float (e.g. 2.5) is rejected rather than truncated:
# `position() < 2.5` and `position() < 2` select different nodes, so
# rounding would silently change the result. NaN and infinities are rejected
# for the same reason (and because Float#to_i raises on them).
#
# @param ast [Object] candidate AST node
# @return [Integer, nil] the integer value of a `:num` node, or nil when the
#   node is not a `:num` node or its value is not a whole number
def self.const_int(ast)
  return nil unless ast.is_a?(Hash) && ast[:t] == :num

  n = ast[:v]
  return nil if n.is_a?(Float) && !(n.finite? && n == n.floor)

  n.respond_to?(:to_i) ? n.to_i : nil
end

.extract_attr_name(ast) ⇒ Object



734
735
736
737
738
739
740
741
# File 'lib/scrapetor/xpath.rb', line 734

# Returns the attribute name when +ast+ is a bare `@name` reference.
#
# The path must consist of exactly one attribute-axis step with a named node
# test and no predicates. A filtered attribute such as `@a[. = 'x']` is not a
# plain reference and must not be translated as if the filter were absent.
#
# @param ast [Object] candidate AST node
# @return [String, nil] the attribute name, or nil when +ast+ is not a bare
#   attribute reference
def self.extract_attr_name(ast)
  return nil unless ast.is_a?(Hash) && ast[:t] == :path

  steps = ast[:steps]
  return nil unless steps.is_a?(Array) && steps.length == 1

  st = steps.first
  return nil unless st[:axis] == :attribute && st[:nt].is_a?(Hash) && st[:nt][:name]
  # Array() tolerates a step that carries no :preds key at all.
  return nil unless Array(st[:preds]).empty?

  st[:nt][:name]
end

.extract_string_literal(ast) ⇒ Object



743
744
745
746
# File 'lib/scrapetor/xpath.rb', line 743

# Returns the value of a string-literal node.
#
# @param ast [Object] candidate AST node
# @return [Object, nil] the node's :v entry when it is a `:str` node,
#   otherwise nil
def self.extract_string_literal(ast)
  return ast[:v] if ast.is_a?(Hash) && ast[:t] == :str

  nil
end

.flip_cmp(op) ⇒ Object



780
781
782
783
784
785
786
787
788
789
# File 'lib/scrapetor/xpath.rb', line 780

# Mirrors a comparison operator for swapped operands (`a < b` ≡ `b > a`).
#
# @param op [Symbol] comparison operator
# @return [Symbol] the mirrored operator; operators without a mirror image
#   (anything other than :lt, :le, :gt, :ge) are returned unchanged
def self.flip_cmp(op)
  mirrored = { lt: :gt, le: :ge, gt: :lt, ge: :le }
  mirrored.fetch(op, op)
end

.is_position_func?(ast) ⇒ Boolean

Returns:

  • (Boolean)


770
771
772
# File 'lib/scrapetor/xpath.rb', line 770

# Tells whether +ast+ is a zero-argument `position()` call.
#
# @param ast [Object] candidate AST node
# @return [Boolean]
def self.is_position_func?(ast)
  return false unless ast.is_a?(Hash)
  return false unless ast[:t] == :func
  return false unless ast[:name] == "position"

  ast[:args].empty?
end

.nth_for(op, n) ⇒ Object



791
792
793
794
795
796
797
798
799
800
801
802
803
804
# File 'lib/scrapetor/xpath.rb', line 791

# Builds the `:nth-of-type(...)` pseudo-class for `position() <op> n`.
#
# Bounds that cannot be written as valid An+B syntax, or that can never
# match, return nil so the caller falls back to the full evaluator:
#   * `= n` with n < 1 selects nothing (the plain `[N]` predicate falls
#     back in the same situation);
#   * `> n` / `>= n` whose offset is negative would be emitted as `n+-3`,
#     which is not valid An+B;
#   * `< n` / `<= n` with an upper bound below 1 selects nothing.
#
# @param op [Symbol] one of :eq, :gt, :ge, :lt, :le
# @param n [Integer] the constant position() is compared against
# @return [String, nil] the pseudo-class fragment, or nil for an unsupported
#   operator or a degenerate bound
def self.nth_for(op, n)
  case op
  when :eq
    return nil if n < 1

    ":nth-of-type(#{n})"
  when :gt, :ge
    # CSS `n+B` matches positions B, B+1, ...
    offset = op == :gt ? n + 1 : n
    return nil if offset < 0

    ":nth-of-type(n+#{offset})"
  when :lt, :le
    # CSS `-n+B` matches positions 1..B.
    upper = op == :lt ? n - 1 : n
    return nil if upper < 1

    ":nth-of-type(-n+#{upper})"
  end
end

.quote_css(s) ⇒ Object



748
749
750
# File 'lib/scrapetor/xpath.rb', line 748

# Wraps an XPath string value in CSS string quotes.
#
# Values without whitespace, brackets, quotes, `=` or backslashes are emitted
# single-quoted as-is. Everything else is double-quoted with the characters
# that are special inside a CSS string escaped: a backslash and a double
# quote get a leading backslash, and line breaks (which may not appear raw
# inside a CSS string) become hex escapes terminated by a space.
#
# @param s [String] raw attribute value taken from the XPath literal
# @return [String] a quoted CSS string token
def self.quote_css(s)
  return "'#{s}'" unless s.match?(/[\s\[\]'"=\\]/)

  # Block form: the replacement is used verbatim, with no backslash
  # interpretation by gsub itself.
  escaped = s.gsub(/[\\"\n\r\f]/) do |ch|
    case ch
    when "\\", '"' then "\\#{ch}"
    else "\\#{ch.ord.to_s(16)} "
    end
  end
  "\"#{escaped}\""
end

.sibling_axis_tag(nt) ⇒ Object



752
753
754
755
756
757
758
# File 'lib/scrapetor/xpath.rb', line 752

# Maps the node test of a sibling-axis step to a CSS type selector.
#
# @param nt [Object] node test: :any_element, or a Hash carrying :name
# @return [Object, nil] "*" for :any_element, the :name entry for a Hash
#   node test (nil when absent), and nil for any other node test
def self.sibling_axis_tag(nt)
  return "*" if nt == :any_element

  nt[:name] if nt.is_a?(Hash)
end

.translate(ast) ⇒ Object

Returns { sel: “…”, kind: :nodes|:attr } or nil. (text() is never routed through CSS, so no :text kind is produced.)



562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
# File 'lib/scrapetor/xpath.rb', line 562

# Translates a parsed XPath location path into a CSS selector.
#
# Supported shape: `//` combiner steps (`descendant-or-self::node()` without
# predicates), child steps whose node test is a tag name or `*`,
# following-sibling steps, and an optional trailing `@attr` step. Every
# other shape returns nil so the caller can use the full evaluator.
#
# Positional fragments (`:nth-of-type(...)`, produced for `[N]` and
# position() comparisons) are only accepted where XPath and CSS agree:
#   * never on the following-sibling axis — `following-sibling::b[1]` is the
#     first b *after the context node*, while `~ b:nth-of-type(1)` is the
#     first b *of the parent*;
#   * never with `*` — `*[2]` is the second element child, while
#     `*:nth-of-type(2)` is any element that is second of its own type;
#   * only as the first predicate — in `a[@x][2]` the position counts only
#     the a elements that already passed `[@x]`.
#
# NOTE(review): a leading single `/` is not anchored to the document root
# here — `/a` and `//a` both produce `a`. Confirm the caller accounts for it.
#
# @param ast [Hash] path node (`t: :path`) carrying :steps and :absolute
# @return [Hash, nil] `{ sel:, kind: :nodes }`, `{ sel:, kind: :attr }` for a
#   trailing attribute step, or nil when the path cannot be translated
def self.translate(ast)
  return nil unless ast.is_a?(Hash) && ast[:t] == :path
  steps = ast[:steps]
  return nil if steps.empty?

  css_parts = []
  # True while the most recent step was a `//` combiner that no real step
  # has consumed yet.
  prev_was_descendant = ast[:absolute]
  # Detects fragments produced for positional predicates. An attribute value
  # that happens to contain this text is a false positive, which only costs
  # a fallback to the full evaluator.
  positional = ->(fragment) { fragment.include?(":nth-of-type(") }

  steps.each_with_index do |st, idx|
    axis = st[:axis]
    nt = st[:nt]
    preds = st[:preds]

    # `descendant-or-self node()` (from //) — combiner only.
    if axis == :descendant_or_self && nt == :node && preds.empty?
      prev_was_descendant = true
      next
    end

    # Tail extraction: @attr must be the final step.
    # XPath text() returns one TextNode per literal text segment while CSS
    # `::text` concatenates textContent, so text() is never routed through
    # CSS and falls out below as an unsupported axis / node test.
    last = idx == steps.length - 1
    if axis == :attribute && nt.is_a?(Hash) && nt[:name] && last
      base = css_parts.join
      return nil if base.empty?
      # `a//@href` also collects attributes of a's descendants, and a
      # predicate on the attribute step filters the result; `::attr()` can
      # express neither.
      return nil if prev_was_descendant || !preds.empty?

      return { sel: "#{base}::attr(#{nt[:name]})", kind: :attr }
    end

    # Following-sibling axis: XPath following-sibling::name and CSS `~ name`
    # both select the later siblings of the context node that match name.
    if axis == :following_sibling
      tag = sibling_axis_tag(nt)
      return nil unless tag
      pred_strs = collect_pred_strs(preds)
      return nil if pred_strs.nil?
      return nil if css_parts.empty?
      # `a//following-sibling::b` starts from a's descendants as well.
      return nil if prev_was_descendant
      return nil if pred_strs.any?(&positional)

      css_parts << " ~ " << tag
      pred_strs.each { |ps| css_parts << ps }
      prev_was_descendant = false
      next
    end

    # Only the child axis is handled for the remaining steps.
    return nil unless axis == :child

    # Node test must be a tag name or `*`.
    tag =
      case nt
      when :any_element then "*"
      when Hash
        return nil unless nt[:name]
        nt[:name]
      else
        return nil
      end

    # Predicates: translate each to a CSS bracket / pseudo-class if possible.
    pred_strs = collect_pred_strs(preds)
    return nil if pred_strs.nil?
    return nil if tag == "*" && pred_strs.any?(&positional)
    return nil if pred_strs.drop(1).any?(&positional)

    if css_parts.empty?
      css_parts << tag
    elsif prev_was_descendant
      css_parts << " " << tag
    else
      css_parts << " > " << tag
    end
    pred_strs.each { |ps| css_parts << ps }

    prev_was_descendant = false
  end

  sel = css_parts.join
  # A combiner left dangling at the end (`a/descendant-or-self::node()`)
  # selects a's whole subtree, which the selector built so far does not.
  return nil if sel.empty? || prev_was_descendant

  { sel: sel, kind: :nodes }
end

.translate_predicate(ast) ⇒ Object

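Convert a predicate AST to a CSS bracket / pseudo-class selector fragment. Handles attribute tests, numeric [N], position() comparisons against a numeric constant, `and`, contains() and starts-with(). Returns nil if the predicate uses anything CSS can’t express (`or`, last(), text(), etc.).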



664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
# File 'lib/scrapetor/xpath.rb', line 664

# Converts one predicate AST into a CSS bracket / pseudo-class fragment.
#
# Handled shapes:
#   [@a]                        → [a]
#   [N]                         → :nth-of-type(N)
#   [position() <op> N]         → :nth-of-type(...) via nth_for
#   [@a = 'v'] / ['v' = @a]     → [a='v']
#   [p and q]                   → concatenation of both fragments
#   [contains(@a, 'v')]         → [a*='v']
#   [starts-with(@a, 'v')]      → [a^='v']
#
# @param ast [Object] predicate AST node
# @return [String, nil] the CSS fragment, or nil when the predicate has no
#   exact CSS equivalent (the caller then falls back to the full evaluator)
def self.translate_predicate(ast)
  return nil unless ast.is_a?(Hash)

  case ast[:t]
  when :path
    # @attr alone: a path with one step axis=:attribute, nt={name:...}.
    steps = ast[:steps]
    return nil unless steps.length == 1
    st = steps[0]
    if st[:axis] == :attribute && st[:nt].is_a?(Hash) && st[:nt][:name] && st[:preds].empty?
      return "[#{st[:nt][:name]}]"
    end
    nil
  when :num
    # Positional predicate [N]: XPath `child::tag[N]` is the Nth tag child,
    # which is what CSS `:nth-of-type(N)` selects.
    v = ast[:v]
    # [2.5] means position() = 2.5 and matches nothing; truncating it to
    # [2] would select a node instead. Float#to_i also raises on NaN and
    # infinities.
    return nil if v.is_a?(Float) && !(v.finite? && v == v.floor)
    n = v.to_i
    return nil unless n >= 1
    ":nth-of-type(#{n})"
  when :cmp
    # position() comparisons against a constant map onto :nth-of-type
    # formulas:
    #   position() = N    → :nth-of-type(N)
    #   position() > N    → :nth-of-type(n+N+1)
    #   position() >= N   → :nth-of-type(n+N)
    #   position() < N    → :nth-of-type(-n+N-1)
    #   position() <= N   → :nth-of-type(-n+N)
    if is_position_func?(ast[:l]) && (n = const_int(ast[:r]))
      return nth_for(ast[:op], n)
    elsif is_position_func?(ast[:r]) && (n = const_int(ast[:l]))
      # Constant on the left: mirror the operator.
      return nth_for(flip_cmp(ast[:op]), n)
    end
    # Otherwise only attribute equality against a string literal.
    return nil unless ast[:op] == :eq
    attr_name = extract_attr_name(ast[:l])
    val_lit   = extract_string_literal(ast[:r])
    # Try the other operand order.
    if attr_name.nil?
      attr_name = extract_attr_name(ast[:r])
      val_lit   = extract_string_literal(ast[:l])
    end
    return nil if attr_name.nil? || val_lit.nil?
    "[#{attr_name}=#{quote_css(val_lit)}]"
  when :and
    # A bare number under `and` is converted to a boolean in XPath; it is
    # not a position test, so it must not become :nth-of-type.
    operands = [ast[:l], ast[:r]]
    return nil if operands.any? { |side| side.is_a?(Hash) && side[:t] == :num }
    # CSS treats `[a][b]` as logical AND, so the fragments concatenate.
    l = translate_predicate(ast[:l])
    r = translate_predicate(ast[:r])
    return nil if l.nil? || r.nil?
    "#{l}#{r}"
  when :func
    case ast[:name]
    when "contains", "starts-with"
      a = extract_attr_name(ast[:args][0])
      v = extract_string_literal(ast[:args][1])
      return nil unless a && v
      # XPath contains(x, '') and starts-with(x, '') are always true, while
      # CSS [a*=""] and [a^=""] never match anything.
      return nil if v.empty?
      css_op = ast[:name] == "contains" ? "*=" : "^="
      "[#{a}#{css_op}#{quote_css(v)}]"
    end
  end
end