Module: Scrapetor::XPath::CssTranslator
- Defined in:
- lib/scrapetor/xpath.rb
Overview
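CSS Translator: convert simple XPath AST shapes to CSS selectors so the heavily-optimised native CSS matcher answers them in one C call. Supported shapes are child steps (optionally joined by `//`), a following-sibling step, a trailing @attr step, and predicates built from attribute tests, `and`, contains()/starts-with() on an attribute, numeric positions and position() comparisons. Returns nil if the AST contains anything else, i.e. anything that doesn’t round-trip cleanly to CSS (`or`/`not` predicates, last(), text() steps, ancestor/preceding axes, etc.) — the caller then falls back to the full evaluator.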
Class Method Summary collapse
- .collect_pred_strs(preds) ⇒ Object
- .const_int(ast) ⇒ Object
- .extract_attr_name(ast) ⇒ Object
- .extract_string_literal(ast) ⇒ Object
- .flip_cmp(op) ⇒ Object
- .is_position_func?(ast) ⇒ Boolean
- .nth_for(op, n) ⇒ Object
- .quote_css(s) ⇒ Object
- .sibling_axis_tag(nt) ⇒ Object
-
.translate(ast) ⇒ Object
Returns { sel: “…”, kind: :nodes|:attr } or nil. text() steps are never translated, so no :text kind is produced.
-
.translate_predicate(ast) ⇒ Object
Convert a predicate AST to a CSS bracket / pseudo selector fragment.
Class Method Details
.collect_pred_strs(preds) ⇒ Object
760 761 762 763 764 765 766 767 768 |
# File 'lib/scrapetor/xpath.rb', line 760
# Translate every predicate of one step into its CSS fragment.
#
# Returns the fragments in predicate order, or nil as soon as
# translate_predicate returns nil for one of them (meaning the whole step
# cannot be expressed in CSS).
def self.collect_pred_strs(preds)
  preds.each_with_object([]) do |pred, fragments|
    fragment = translate_predicate(pred)
    return nil if fragment.nil?

    fragments << fragment
  end
end
.const_int(ast) ⇒ Object
774 775 776 777 778 |
# File 'lib/scrapetor/xpath.rb', line 774
# Extract an exact integer from a numeric-literal AST node.
#
# Returns the Integer value of a `{ t: :num, v: ... }` node, or nil when
# the node is not a numeric literal or its value is not a whole number.
#
# Non-integral values must NOT be truncated: `position() < 2.5` selects
# positions 1 and 2, but truncating 2.5 to 2 would translate it as
# `position() < 2`. Returning nil makes the caller fall back instead.
# NaN / Infinity (whose to_i raises) and nil (whose to_i is 0) are
# rejected for the same reason.
def self.const_int(ast)
  return nil unless ast.is_a?(Hash) && ast[:t] == :num

  value = ast[:v]
  # Accept a plain digit string in case the number is stored as text.
  value = value.to_i if value.is_a?(String) && value.match?(/\A\d+\z/)
  return value if value.is_a?(Integer)
  return nil unless value.is_a?(Float) && value.finite?

  whole = value.to_i
  whole == value ? whole : nil
end
.extract_attr_name(ast) ⇒ Object
734 735 736 737 738 739 740 741 |
# File 'lib/scrapetor/xpath.rb', line 734
# Return the attribute name when +ast+ is a bare `@name` path, else nil.
#
# A bare attribute path is a :path node with exactly one step on the
# :attribute axis, a named node test and no predicates. An attribute step
# that carries predicates (e.g. `@a[. != '']`) is rejected: the filter
# cannot be expressed inside a CSS `[a=...]` test, and dropping it
# silently would change the result. Returning nil lets the caller fall
# back.
def self.extract_attr_name(ast)
  return nil unless ast.is_a?(Hash) && ast[:t] == :path

  steps = ast[:steps]
  return nil unless steps.is_a?(Array) && steps.length == 1

  step = steps.first
  name_test = step[:nt]
  return nil unless step[:axis] == :attribute && name_test.is_a?(Hash) && name_test[:name]
  # Array() tolerates a step hash that has no :preds key at all.
  return nil unless Array(step[:preds]).empty?

  name_test[:name]
end
.extract_string_literal(ast) ⇒ Object
743 744 745 746 |
# File 'lib/scrapetor/xpath.rb', line 743
# Return the value of a string-literal node (`{ t: :str, v: ... }`),
# or nil when +ast+ is anything else.
def self.extract_string_literal(ast)
  return ast[:v] if ast.is_a?(Hash) && ast[:t] == :str

  nil
end
.flip_cmp(op) ⇒ Object
780 781 782 783 784 785 786 787 788 789 |
# File 'lib/scrapetor/xpath.rb', line 780
# Mirror a comparison operator for swapped operands, so that `N op x`
# can be rewritten as `x flipped_op N`. Operators without a mirrored
# entry (such as :eq) are returned unchanged.
def self.flip_cmp(op)
  mirrored = { lt: :gt, le: :ge, gt: :lt, ge: :le }
  mirrored.fetch(op, op)
end
.is_position_func?(ast) ⇒ Boolean
770 771 772 |
# File 'lib/scrapetor/xpath.rb', line 770
# True when +ast+ is a function-call node for `position` with an empty
# argument list; false for every other node or non-Hash value.
def self.is_position_func?(ast)
  return false unless ast.is_a?(Hash)
  return false unless ast[:t] == :func
  return false unless ast[:name] == "position"

  ast[:args].empty?
end
.nth_for(op, n) ⇒ Object
791 792 793 794 795 796 797 798 799 800 801 802 803 804 |
# File 'lib/scrapetor/xpath.rb', line 791
# Build the `:nth-of-type(...)` pseudo-class equivalent to the XPath test
# `position() <op> n`, for an Integer +n+.
#
#   position() =  N  ->  :nth-of-type(N)
#   position() >  N  ->  :nth-of-type(n+N+1)
#   position() >= N  ->  :nth-of-type(n+N)
#   position() <  N  ->  :nth-of-type(-n+N-1)
#   position() <= N  ->  :nth-of-type(-n+N)
#
# Returns nil (caller falls back) for operators not listed above and for
# tests that can never match (`= N` with N < 1, `< N` / `<= N` with an
# upper bound below 1).
def self.nth_for(op, n)
  case op
  when :eq
    # Positions start at 1, so this can never match; fall back rather
    # than emit `:nth-of-type(0)` or a negative index.
    return nil if n < 1

    ":nth-of-type(#{n})"
  when :gt, :ge
    first = op == :gt ? n + 1 : n
    # A lower bound below 1 admits every position. Clamp it so a negative
    # bound yields valid An+B syntax (`n+1`) rather than `n+-2`.
    ":nth-of-type(n+#{[first, 1].max})"
  when :lt, :le
    # CSS `-n+K` matches positions 1..K.
    last = op == :lt ? n - 1 : n
    return nil if last < 1

    ":nth-of-type(-n+#{last})"
  end
end
.quote_css(s) ⇒ Object
748 749 750 |
# File 'lib/scrapetor/xpath.rb', line 748
# Quote +s+ for use as a CSS attribute-selector value.
#
# Values free of whitespace, brackets, quotes, `=` and backslashes are
# wrapped in single quotes as-is. Anything else is wrapped in double
# quotes with `\` and `"` backslash-escaped. The backslash has to be
# escaped as well: it is the CSS escape character, so a raw `a\b` or a
# value ending in `\` would otherwise change meaning or swallow the
# closing quote.
def self.quote_css(s)
  return "'#{s}'" unless s.match?(/[\s\[\]'"=\\]/)

  escaped = s.gsub(/[\\"]/) { |ch| "\\#{ch}" }
  "\"#{escaped}\""
end
.sibling_axis_tag(nt) ⇒ Object
752 753 754 755 756 757 758 |
# File 'lib/scrapetor/xpath.rb', line 752
# Map a node test to the CSS type selector used after a sibling
# combinator: "*" for :any_element, the :name entry for a Hash node
# test, and nil for anything else.
def self.sibling_axis_tag(nt)
  return "*" if :any_element == nt

  nt[:name] if nt.is_a?(Hash)
end
.translate(ast) ⇒ Object
Returns { sel: “…”, kind: :nodes|:attr } or nil. text() steps are never translated, so no :text kind is produced.
562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 |
# File 'lib/scrapetor/xpath.rb', line 562
# Translate a path AST into a CSS selector.
#
# Returns `{ sel: "...", kind: :nodes }` for element paths,
# `{ sel: "...::attr(name)", kind: :attr }` when the final step is
# `@name`, or nil when any part of the path has no exact CSS equivalent
# (the caller is then expected to use the full evaluator).
#
# Supported shapes: child steps with a tag name or `*`, joined by `/`
# (CSS ` > `) or `//` (CSS descendant combinator), following-sibling
# steps (CSS ` ~ `) and a trailing attribute step. The leading
# `descendant-or-self::node()` step that `//` injects is folded into the
# combinator of the next step.
#
# text() is deliberately never translated: XPath yields one node per text
# segment while CSS `::text` concatenates, so results would diverge for
# mixed content.
#
# Positional predicates (`[N]`, `position() op N`) are only accepted
# where `:nth-of-type` means the same thing:
#   * not on `*`            - `*[1]` is the first element child, while
#                             `*:nth-of-type(1)` is the first of EACH tag;
#   * only as first predicate - `a[@x][1]` counts among the `a[@x]`
#                             matches, `a[x]:nth-of-type(1)` among all `a`;
#   * not on following-sibling - `following-sibling::b[1]` counts from the
#                             context node, `:nth-of-type` from the parent.
def self.translate(ast)
  return nil unless ast.is_a?(Hash) && ast[:t] == :path

  steps = ast[:steps]
  return nil if steps.empty?

  css_parts = []
  # NOTE(review): an absolute path starts in "descendant" mode, yet the
  # first emitted tag never gets a combinator either way — confirm how
  # the caller anchors single-slash absolute and relative paths.
  prev_was_descendant = ast[:absolute]
  last_idx = steps.length - 1

  steps.each_with_index do |st, idx|
    axis = st[:axis]
    nt = st[:nt]
    preds = st[:preds]

    # `descendant-or-self::node()` (from //) only selects the combinator
    # for the step that follows it.
    if axis == :descendant_or_self && nt == :node && preds.empty?
      prev_was_descendant = true
      next
    end

    # Tail extraction: @attr must be the final step.
    if axis == :attribute && nt.is_a?(Hash) && nt[:name] && idx == last_idx
      base = css_parts.join
      return nil if base.empty?
      # `a//@x` also reaches attributes of a's descendants, and a
      # predicate on the attribute filters it; `::attr()` can express
      # neither, so do not drop them silently.
      return nil if prev_was_descendant || !preds.empty?

      return { sel: "#{base}::attr(#{nt[:name]})", kind: :attr }
    end

    # following-sibling::name is the same as CSS `~ name`: later siblings
    # of the context node that match the name, regardless of what sits in
    # between.
    if axis == :following_sibling
      tag = sibling_axis_tag(nt)
      return nil unless tag
      # A `//` directly before the axis would widen the context set.
      return nil if prev_was_descendant || css_parts.empty?
      return nil if preds.any? { |pred| positional_predicate?(pred) }

      pred_strs = collect_pred_strs(preds)
      return nil if pred_strs.nil?

      css_parts << " ~ " << tag
      css_parts.concat(pred_strs)
      prev_was_descendant = false
      next
    end

    # Every other step must be on the child axis.
    return nil unless axis == :child

    # Node test must be a tag name or `*`.
    tag =
      if nt == :any_element
        "*"
      elsif nt.is_a?(Hash)
        nt[:name]
      end
    return nil unless tag

    positional = preds.each_index.select { |i| positional_predicate?(preds[i]) }
    return nil unless positional.empty? || (positional == [0] && tag != "*")

    pred_strs = collect_pred_strs(preds)
    return nil if pred_strs.nil?

    if css_parts.empty?
      css_parts << tag
    elsif prev_was_descendant
      css_parts << " " << tag
    else
      css_parts << " > " << tag
    end
    css_parts.concat(pred_strs)
    prev_was_descendant = false
  end

  sel = css_parts.join
  # A descendant-or-self step left pending at the end (nothing after it)
  # would select nodes the accumulated selector does not cover.
  return nil if sel.empty? || prev_was_descendant

  { sel: sel, kind: :nodes }
end

# True when a predicate AST depends on the context position: a bare
# number, a comparison with position()/last() on either side, or an
# `and` containing one of those.
def self.positional_predicate?(ast)
  return false unless ast.is_a?(Hash)

  case ast[:t]
  when :num
    true
  when :cmp
    [ast[:l], ast[:r]].any? do |side|
      side.is_a?(Hash) && side[:t] == :func && %w[position last].include?(side[:name])
    end
  when :and
    positional_predicate?(ast[:l]) || positional_predicate?(ast[:r])
  else
    false
  end
end
.translate_predicate(ast) ⇒ Object
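Convert a predicate AST to a CSS bracket / pseudo selector fragment. Handles attribute existence (@a), attribute equality against a string literal, `and`, contains()/starts-with() on an attribute, numeric positions and position() comparisons against a numeric constant. Returns nil if the predicate uses anything CSS can’t express (`or`/`not`, last(), text(), other functions or comparisons, etc.).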
664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 |
# File 'lib/scrapetor/xpath.rb', line 664
# Convert one predicate AST into a CSS bracket / pseudo-class fragment.
#
# Returns a String such as "[href]", "[id='x']", "[class*='btn']" or
# ":nth-of-type(2)", or nil when the predicate has no exact CSS
# equivalent (the caller then falls back to the full evaluator).
#
# Handled node types:
#   :path  bare `@attr`                         -> "[attr]"
#   :num   whole number N >= 1                  -> ":nth-of-type(N)"
#   :cmp   position() op N / N op position()    -> via nth_for
#          @attr = 'lit' (either operand order) -> "[attr=...]"
#   :and   both sides translatable              -> fragments concatenated
#   :func  contains / starts-with(@attr, 'lit') -> "[attr*=...]" / "[attr^=...]"
def self.translate_predicate(ast)
  return nil unless ast.is_a?(Hash)

  case ast[:t]
  when :path
    # @attr alone: one step on the attribute axis, named, no predicates.
    steps = ast[:steps]
    return nil unless steps.length == 1

    st = steps[0]
    bare_attr = st[:axis] == :attribute && st[:nt].is_a?(Hash) && st[:nt][:name] && st[:preds].empty?
    bare_attr ? "[#{st[:nt][:name]}]" : nil
  when :num
    # Positional predicate [N]: the Nth member of the same-tag children
    # of the parent, like CSS `:nth-of-type(N)`. A fractional value such
    # as [1.5] means position() = 1.5, which never matches, so it must
    # not be truncated to 1.
    value = ast[:v]
    # Accept a plain digit string in case the number is stored as text.
    value = value.to_i if value.is_a?(String) && value.match?(/\A\d+\z/)
    integral = value.is_a?(Integer) || (value.is_a?(Float) && value.finite? && value == value.floor)
    return nil unless integral && value >= 1

    ":nth-of-type(#{value.to_i})"
  when :cmp
    # position() compared with a constant, in either operand order. The
    # operator is flipped when position() is on the right.
    if is_position_func?(ast[:l]) && (n = const_int(ast[:r]))
      return nth_for(ast[:op], n)
    elsif is_position_func?(ast[:r]) && (n = const_int(ast[:l]))
      return nth_for(flip_cmp(ast[:op]), n)
    end
    return nil unless ast[:op] == :eq

    attr_name = extract_attr_name(ast[:l])
    val_lit = extract_string_literal(ast[:r])
    # Try the other operand order: 'lit' = @attr.
    if attr_name.nil?
      attr_name = extract_attr_name(ast[:r])
      val_lit = extract_string_literal(ast[:l])
    end
    return nil if attr_name.nil? || val_lit.nil?

    "[#{attr_name}=#{quote_css(val_lit)}]"
  when :and
    # CSS treats `[a][b]` as logical AND, so the fragments concatenate.
    l = translate_predicate(ast[:l])
    r = translate_predicate(ast[:r])
    return nil if l.nil? || r.nil?

    "#{l}#{r}"
  when :func
    css_op = { "contains" => "*=", "starts-with" => "^=" }[ast[:name]]
    return nil unless css_op

    a = extract_attr_name(ast[:args][0])
    v = extract_string_literal(ast[:args][1])
    return nil unless a && v
    # In XPath, contains(x, '') and starts-with(x, '') are always true,
    # while CSS `[a*=""]` / `[a^=""]` match nothing. Not translatable.
    return nil if v.empty?

    "[#{a}#{css_op}#{quote_css(v)}]"
  end
end