Class: Scrapetor::XPath::Evaluator

Inherits:
Object
  • Object
show all
Defined in:
lib/scrapetor/xpath.rb

Overview

Evaluator

Instance Method Summary collapse

Constructor Details

#initialize(context) ⇒ Evaluator

Returns a new instance of Evaluator.



812
813
814
815
816
# File 'lib/scrapetor/xpath.rb', line 812

# Builds an evaluator for +context+.
#
# Remembers the raw input, resolves the owning document (the context
# itself when it is a Scrapetor::Document, otherwise +context.document+),
# and records the pair returned by +native_handles_for+.
#
# @param context [Scrapetor::Document, #document] evaluation starting point
def initialize(context)
  @context_input = context
  @document =
    if context.is_a?(Scrapetor::Document)
      context
    else
      context.document
    end
  # Both values are nil when the context is not natively backed.
  @native_doc, @native_root_id = native_handles_for(context)
end

Instance Method Details

#ancestors_of(n) ⇒ Object



1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
# File 'lib/scrapetor/xpath.rb', line 1135

# Lists the ancestors of +n+.
#
# Arena-backed nodes delegate to the arena's +node_ancestor_ids+ and wrap
# each id as an element (type 1). A plain Scrapetor::Node climbs its
# +#parent+ chain and the result is ordered outermost-first. Any other
# value has no ancestors.
#
# @param n [Object] context node
# @return [Array] ancestor nodes, possibly empty
def ancestors_of(n)
  arena, node_id, owner = arena_handle_for(n)
  if arena
    return arena.node_ancestor_ids(node_id).map { |ancestor_id|
      wrap_native_typed_with(arena, ancestor_id, 1, wrapper: owner)
    }
  end
  return [] unless n.is_a?(Scrapetor::Node)

  chain = []
  climber = n.parent
  while climber
    # Prepending keeps the outermost ancestor at the front of the list.
    chain.unshift(climber)
    climber = climber.parent
  end
  chain
end

#apply_predicates(base, preds) ⇒ Object



1000
1001
1002
1003
# File 'lib/scrapetor/xpath.rb', line 1000

# Applies filter-expression predicates to +base+.
#
# A non-Array +base+ is treated as a one-element set before delegating to
# +apply_step_predicates+.
#
# @param base [Array, Object] evaluated primary expression
# @param preds [Array] predicate ASTs
# @return [Array] whatever +apply_step_predicates+ returns
def apply_predicates(base, preds)
  candidates =
    if base.is_a?(Array)
      base
    else
      [base]
    end
  apply_step_predicates(candidates, preds)
end

#apply_step_predicates(nodes, preds) ⇒ Object



979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
# File 'lib/scrapetor/xpath.rb', line 979

# Filters +nodes+ through each predicate in turn.
#
# Every predicate is evaluated once per surviving node with that node as
# the sole context and with 1-based :position / :last describing its place
# in the current survivor list. A numeric result is a positional test: the
# node is kept only when the number equals its position. Any other result
# is converted with +xpath_boolean+.
#
# @param nodes [Array] candidate nodes, in the order positions are counted
# @param preds [Array] predicate ASTs, applied left to right
# @return [Array] the nodes that satisfied every predicate
def apply_step_predicates(nodes, preds)
  preds.reduce(nodes) do |survivors, pred_ast|
    total = survivors.length
    kept = []
    survivors.each_with_index do |node, idx|
      position = idx + 1
      result = eval_expr(pred_ast, [node], { position: position, last: total })
      matched =
        if result.is_a?(Numeric)
          # Compare the number itself: truncating with to_i would make
          # [1.5] match position 1 and would raise on NaN / Infinity.
          result == position
        else
          xpath_boolean(result)
        end
      kept << node if matched
    end
    kept
  end
end

#arena_handle_for(n) ⇒ Object

Returns [native_doc, node_id, wrapper] when the node lives in the arena, nil otherwise. Handles all three native carriers: Scrapetor::Node wrapping a Native::Element, the Native::DocumentWrapper itself (root context), and a raw Native::Element.



1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
# File 'lib/scrapetor/xpath.rb', line 1051

# Resolves the arena coordinates of +n+.
#
# A Scrapetor::Node is inspected through its +backing_node+, a
# Native::DocumentWrapper maps to id 0 of its +native+ arena, and any other
# object is inspected directly. The inspected object must respond to +id+
# and +doc+, and its +doc+ must respond to +node_following_sibling_ids+.
#
# @param n [Object] candidate node
# @return [Array(Object, Object, Object), nil] [arena, id, wrapper-or-nil],
#   or nil when +n+ does not live in an arena
def arena_handle_for(n)
  carrier =
    if n.is_a?(Scrapetor::Node)
      n.backing_node
    elsif n.is_a?(Scrapetor::Native::DocumentWrapper)
      return [n.native, 0, n]
    else
      n
    end
  return nil unless carrier.respond_to?(:id) && carrier.respond_to?(:doc)
  return nil unless carrier.doc.respond_to?(:node_following_sibling_ids)

  owner = carrier.respond_to?(:wrapper) ? carrier.wrapper : nil
  [carrier.doc, carrier.id, owner]
end

#arg_first_node(args, context_set, position_info) ⇒ Object



1597
1598
1599
1600
# File 'lib/scrapetor/xpath.rb', line 1597

# Resolves the node a node-argument function should operate on.
#
# With no arguments the context set is used; otherwise the first argument
# is evaluated. An Array result is reduced to its first element.
#
# @param args [Array] unevaluated argument ASTs
# @param context_set [Array] current context nodes
# @param position_info [Hash, nil] position/last of the context node
# @return [Object, nil] the selected node (nil for an empty set)
def arg_first_node(args, context_set, position_info)
  value =
    if args.empty?
      context_set
    else
      eval_expr(args[0], context_set, position_info)
    end
  return value.first if value.is_a?(Array)

  value
end

#call_function(name, args, context_set, position_info) ⇒ Object

—- Functions —————————————————-



1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
# File 'lib/scrapetor/xpath.rb', line 1473

# Evaluates a call to one of the supported XPath 1.0 core functions.
#
# Arguments arrive as unevaluated ASTs and are evaluated on demand against
# the current context, left to right.
#
# @param name [String] function name as written in the expression
# @param args [Array] argument ASTs
# @param context_set [Array] current context node(s)
# @param position_info [Hash, nil] { position:, last: } of the context node
# @return [Array, String, Numeric, Boolean] the function result
# @raise [UnsupportedError] when +name+ is not an implemented function
def call_function(name, args, context_set, position_info)
  # Evaluates the idx-th argument AST in the caller's context.
  arg = ->(idx) { eval_expr(args[idx], context_set, position_info) }

  case name
  # node-set
  when "last"
    (position_info && position_info[:last]) || context_set.length
  when "position"
    (position_info && position_info[:position]) || 1
  when "count"
    v = arg.call(0)
    v.is_a?(Array) ? v.length : 0
  when "id"
    # id('foo bar') — look each whitespace-separated token up in the document.
    v = arg.call(0)
    tokens = (v.is_a?(Array) ? v : [v]).flat_map { |x| xpath_string(x).split(/\s+/) }
    # Leading whitespace yields an empty token; "#" is not a usable selector.
    tokens.reject(&:empty?).each_with_object([]) do |id_str, found|
      hit =
        begin
          @document.at_css("##{id_str}")
        rescue StandardError
          # Best effort: an id that does not form a valid selector matches nothing.
          nil
        end
      found << hit if hit
    end
  when "local-name"
    n = arg_first_node(args, context_set, position_info)
    n && n.respond_to?(:name) ? n.name.split(":").last.to_s : ""
  when "name"
    n = arg_first_node(args, context_set, position_info)
    n && n.respond_to?(:name) ? n.name.to_s : ""
  when "namespace-uri"
    ""  # we don't model namespaces in HTML
  # string
  when "string"
    xpath_string(args.empty? ? context_set.first : arg.call(0))
  when "concat"
    args.map { |a| xpath_string(eval_expr(a, context_set, position_info)) }.join
  when "starts-with"
    haystack = xpath_string(arg.call(0))
    haystack.start_with?(xpath_string(arg.call(1)))
  when "contains"
    haystack = xpath_string(arg.call(0))
    haystack.include?(xpath_string(arg.call(1)))
  when "substring-before"
    a = xpath_string(arg.call(0))
    b = xpath_string(arg.call(1))
    idx = a.index(b)
    idx ? a[0...idx] : ""
  when "substring-after"
    a = xpath_string(arg.call(0))
    b = xpath_string(arg.call(1))
    idx = a.index(b)
    idx ? a[(idx + b.length)..] || "" : ""
  when "substring"
    s = xpath_string(arg.call(0))
    start = xpath_number(arg.call(1))
    length = args.size > 2 ? xpath_number(arg.call(2)) : nil
    xpath_substring(s, start, length)
  when "string-length"
    xpath_string(args.empty? ? context_set.first : arg.call(0)).length
  when "normalize-space"
    xpath_string(args.empty? ? context_set.first : arg.call(0)).strip.gsub(/\s+/, " ")
  when "translate"
    s    = xpath_string(arg.call(0))
    from = xpath_string(arg.call(1))
    to   = xpath_string(arg.call(2))
    # Characters in `from` are replaced by the same-index char in `to`;
    # characters past `to`'s length are deleted (mapped to nil).
    map = {}
    from.each_char.with_index do |c, i|
      # When a character is repeated in `from`, its first occurrence decides.
      next if map.key?(c)
      map[c] = i < to.length ? to[i] : nil
    end
    s.each_char.map { |c| map.key?(c) ? map[c] : c }.compact.join
  # boolean
  when "boolean" then xpath_boolean(arg.call(0))
  when "not"     then !xpath_boolean(arg.call(0))
  when "true"    then true
  when "false"   then false
  when "lang"
    # lang('en') — true if the nearest xml:lang / lang attribute on the
    # context node or an ancestor equals 'en' or starts with 'en-'
    # (case-insensitive).
    target = xpath_string(arg.call(0)).downcase
    n = context_set.first
    n = n.first if n.is_a?(Array)
    while n
      lang =
        if n.respond_to?(:[])
          begin
            n["xml:lang"] || n["lang"]
          rescue StandardError
            # Not every object with #[] accepts a String key.
            nil
          end
        end
      return true if lang && (lang.downcase == target || lang.downcase.start_with?("#{target}-"))
      n = parent_of(n)
    end
    false
  # number
  when "number"
    xpath_number(args.empty? ? context_set.first : arg.call(0))
  when "sum"
    v = arg.call(0)
    v = [v] unless v.is_a?(Array)
    v.inject(0.0) { |acc, x| acc + xpath_number(x).to_f }
  when "floor"
    n = xpath_number(arg.call(0))
    # NaN and the infinities have no integer form; Float#floor would raise.
    n.is_a?(Float) && !n.finite? ? n : n.floor
  when "ceiling"
    n = xpath_number(arg.call(0))
    n.is_a?(Float) && !n.finite? ? n : n.ceil
  when "round"
    xpath_round_half_up(xpath_number(arg.call(0)))
  else
    raise UnsupportedError, "unknown XPath function `#{name}()`"
  end
end

# Rounds to the nearest integer, with ties going toward positive infinity.
# Integers, NaN and the infinities are returned unchanged.
def xpath_round_half_up(n)
  return n if n.is_a?(Integer)
  return n if n.is_a?(Float) && (n.nan? || n.infinite?)

  (n + 0.5).floor
end

# Implements substring(): keeps the characters whose 1-based position p
# satisfies round(start) <= p < round(start) + round(length). With +length+
# omitted (nil) everything from round(start) onward is kept. NaN bounds
# select nothing.
def xpath_substring(str, start, length = nil)
  first = xpath_round_half_up(start)
  return "" if first.is_a?(Float) && first.nan?

  if length.nil?
    return "" if first == Float::INFINITY
    from = [first, 1].max
    return str[(from - 1)..] || ""
  end

  # Exclusive upper bound; NaN when undefined (NaN length, or -Inf + Inf).
  limit = first + xpath_round_half_up(length)
  return "" if limit.is_a?(Float) && limit.nan?

  from = [first, 1].max
  upto = [limit, str.length + 1].min
  # Also rejects a negative upper bound, which Ruby would otherwise treat
  # as an index counted from the end of the string.
  return "" unless from < upto

  str[(from - 1)...(upto - 1)] || ""
end

#collect_attributes(n, nt, out) ⇒ Object



1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
# File 'lib/scrapetor/xpath.rb', line 1198

# Appends attribute values of +n+ that satisfy node test +nt+ to +out+.
#
# A Hash test keeps an attribute when its :name is nil, "*", or equal to the
# attribute name ignoring ASCII case. For native nodes the only non-Hash
# tests that keep anything are :any_element, :any_node and :node; for
# Scrapetor::Node values every non-Hash test keeps all attributes.
# Other objects contribute nothing.
#
# @param n [Object] context node
# @param nt [Symbol, Hash] node test
# @param out [Array] accumulator, mutated in place
def collect_attributes(n, nt, out)
  name_ok = lambda do |attr_name|
    nt[:name].nil? || nt[:name] == "*" || attr_name.casecmp(nt[:name]).zero?
  end

  if native_node?(n)
    n.doc.node_attributes(n.id).each do |attr_name, attr_value|
      wanted =
        if nt.is_a?(Hash)
          name_ok.call(attr_name)
        else
          nt == :any_element || nt == :any_node || nt == :node
        end
      out << attr_value if wanted
    end
  elsif n.is_a?(Scrapetor::Node)
    n.attributes.each do |attr_name, attr_value|
      next if nt.is_a?(Hash) && !name_ok.call(attr_name)
      out << attr_value
    end
  end
end

#collect_children(n, nt, out) ⇒ Object



1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
# File 'lib/scrapetor/xpath.rb', line 1029

# Appends the children of +n+ that satisfy node test +nt+ to +out+.
#
# Arena-backed nodes enumerate +node_children+ and wrap each child by its
# +node_type+; children that wrap to nil are skipped. A Scrapetor::Document
# walks +backing.children+ (when the backing has children), a
# Scrapetor::Node walks +backing_node.children+, each passed through
# +wrap_dom+. Other values contribute nothing.
#
# @param n [Object] context node
# @param nt [Symbol, Hash] node test
# @param out [Array] accumulator, mutated in place
def collect_children(n, nt, out)
  arena, node_id, owner = arena_handle_for(n)
  if arena
    arena.node_children(node_id).each do |child_id|
      child = wrap_native_typed_with(arena, child_id, arena.node_type(child_id), wrapper: owner)
      push_if_matches(child, nt, out) if child
    end
    return
  end

  raw_children =
    if n.is_a?(Scrapetor::Document)
      return unless n.backing.respond_to?(:children)
      n.backing.children
    elsif n.is_a?(Scrapetor::Node)
      n.backing_node.children
    else
      return
    end
  raw_children.each { |child| push_if_matches(wrap_dom(child), nt, out) }
end

#collect_descendant_ids(nd, rid, nt, out, wrapper) ⇒ Object



1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
# File 'lib/scrapetor/xpath.rb', line 1091

# Appends the wrapped descendants of arena node +rid+ that satisfy +nt+.
#
# Ids are walked upward from rid + 1. For the document root (rid 0) every
# remaining id is a descendant. For any other node the walk stops at the
# first id whose parent lies before +rid+ (or has no parent), so nodes that
# merely follow the subtree are not collected.
#
# Assumes ids are assigned in pre-order and that +nd.node_parent(k)+
# returns the parent's id or nil — TODO confirm against the native arena API.
#
# Supported tests: :any_element (type 1), :node (types 1, 3, 8), :text
# (type 3), :comment (type 8), and a Hash with a :name, matched against
# element names ignoring ASCII case. Anything else collects nothing.
#
# @param nd [Object] native arena
# @param rid [Integer] id of the context node
# @param nt [Symbol, Hash] node test
# @param out [Array] accumulator, mutated in place
# @param wrapper [Object, nil] passed through to +wrap_native_typed_with+
def collect_descendant_ids(nd, rid, nt, out, wrapper)
  wanted_name = nil
  case nt
  when :any_element, :node, :text, :comment
    # handled in the walk below
  when Hash
    wanted_name = nt[:name]
    return unless wanted_name
  else
    return
  end

  ((rid + 1)...nd.size).each do |k|
    unless rid.zero?
      parent_id = nd.node_parent(k)
      # Pre-order ids make the subtree contiguous: the first node whose
      # parent precedes rid marks the end of it.
      break if parent_id.nil? || parent_id < rid
    end

    # node_type is checked first so skipped nodes are never wrapped.
    type = nd.node_type(k)
    keep =
      case nt
      when :any_element then type == 1
      when :node        then type == 1 || type == 3 || type == 8
      when :text        then type == 3
      when :comment     then type == 8
      else type == 1 && nd.node_name(k).casecmp(wanted_name).zero?
      end
    out << wrap_native_typed_with(nd, k, type, wrapper: wrapper) if keep
  end
end

#collect_descendants(n, nt, out) ⇒ Object



1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
# File 'lib/scrapetor/xpath.rb', line 1066

# Appends the descendants of +n+ that satisfy node test +nt+ to +out+.
#
# Arena-backed nodes use +node_descendant_comment_ids+ for the :comment
# test and +collect_descendant_ids+ for everything else. Other objects
# that respond to +children+ are walked depth-first, parents before
# children, using an explicit stack; children that are not already
# Scrapetor::Node instances go through +wrap_dom+ before being tested.
#
# @param n [Object] context node
# @param nt [Symbol, Hash] node test
# @param out [Array] accumulator, mutated in place
def collect_descendants(n, nt, out)
  arena, node_id, owner = arena_handle_for(n)
  if arena
    if nt == :comment
      arena.node_descendant_comment_ids(node_id).each do |comment_id|
        out << wrap_native_typed_with(arena, comment_id, 8, wrapper: owner)
      end
    else
      collect_descendant_ids(arena, node_id, nt, out, owner)
    end
    return
  end
  return unless n.respond_to?(:children)

  # Children are pushed reversed so that pop yields them in original order.
  pending = n.children.to_a.reverse
  loop do
    node = pending.pop
    break unless node

    candidate = node.is_a?(Scrapetor::Node) ? node : wrap_dom(node)
    push_if_matches(candidate, nt, out)
    pending.concat(node.children.to_a.reverse) if node.respond_to?(:children)
  end
end

#collect_self(n, nt, out) ⇒ Object



1025
1026
1027
# File 'lib/scrapetor/xpath.rb', line 1025

# Appends +n+ itself to +out+ when it satisfies node test +nt+ (self axis).
#
# @param n [Object] context node
# @param nt [Symbol, Hash] node test
# @param out [Array] accumulator, mutated in place
# @return [Array, nil] +out+ when the node was appended, nil otherwise
def collect_self(n, nt, out)
  out << n if matches_node_test?(n, nt)
end

#compare_node_sets(op, a, b) ⇒ Object



1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
# File 'lib/scrapetor/xpath.rb', line 1435

# Compares two operand lists under the XPath 1.0 node-set comparison rules.
#
# The comparison holds when at least one pair (x from +a+, y from +b+)
# satisfies it. How a pair is compared depends on what the lists contain:
#
# * a list that is a lone true/false is a boolean operand: the other side
#   is reduced with +xpath_boolean+ and the two booleans are compared;
# * for = and !=, if either list contains a Numeric the values are compared
#   as numbers, otherwise as strings;
# * for <, <=, >, >= values are always compared as numbers and NaN values
#   never satisfy the comparison.
#
# @param op [Symbol] :eq, :neq, :lt, :le, :gt or :ge
# @param a [Array] left operand (a node-set, or a scalar wrapped in an Array)
# @param b [Array] right operand, same convention
# @return [Boolean]
def compare_node_sets(op, a, b)
  lone_boolean = ->(side) { side.length == 1 && (side[0].equal?(true) || side[0].equal?(false)) }
  if lone_boolean.call(a) || lone_boolean.call(b)
    lhs = lone_boolean.call(a) ? a[0] : xpath_boolean(a)
    rhs = lone_boolean.call(b) ? b[0] : xpath_boolean(b)
    return lhs == rhs if op == :eq
    return lhs != rhs if op == :neq

    # Relational operators compare the two booleans as numbers below.
    a = [lhs]
    b = [rhs]
  end

  if op == :eq || op == :neq
    # No pair exists, so neither = nor != can hold.
    return false if a.empty? || b.empty?

    numeric = (a + b).any? { |v| v.is_a?(Numeric) }
    coerce = numeric ? method(:xpath_number) : method(:xpath_string)
    # Convert each side once instead of once per pair.
    left = a.map(&coerce)
    right = b.map(&coerce)
    want_equal = (op == :eq)
    return left.any? { |x| right.any? { |y| (x == y) == want_equal } }
  end

  comparator = { lt: :<, le: :<=, gt: :>, ge: :>= }[op]
  return false unless comparator

  not_nan = ->(v) { !(v.is_a?(Float) && v.nan?) }
  left = a.map { |x| xpath_number(x) }.select(&not_nan)
  right = b.map { |y| xpath_number(y) }.select(&not_nan)
  left.any? { |x| right.any? { |y| x.public_send(comparator, y) } }
end

#dedupe_node_set(nodes) ⇒ Object



966
967
968
969
970
971
972
973
974
975
976
977
# File 'lib/scrapetor/xpath.rb', line 966

# Removes duplicate nodes, keeping the first occurrence of each.
#
# Two entries are duplicates when +node_identity+ returns equal keys for
# them. Inputs with fewer than two entries are returned as-is.
#
# @param nodes [Array] candidate node list
# @return [Array] +nodes+ itself, or a new duplicate-free Array
def dedupe_node_set(nodes)
  return nodes if nodes.length < 2

  nodes.uniq { |node| node_identity(node) }
end

#do_compare(op, l, r) ⇒ Object

—- Comparison rules (XPath 1.0 §3.4) ————————–



1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
# File 'lib/scrapetor/xpath.rb', line 1400

# Evaluates a comparison between two already-evaluated operands.
#
# When either operand is an Array (node-set) both are handed to
# +compare_node_sets+, wrapping the scalar side in an Array. Otherwise:
#
# * :eq / :neq coerce both sides to booleans if either is a boolean, else
#   to numbers if either is Numeric, else to strings;
# * the relational operators coerce both sides to numbers and are false
#   whenever either side is NaN.
#
# @param op [Symbol] :eq, :neq, :lt, :le, :gt or :ge
# @param l [Object] left operand
# @param r [Object] right operand
# @return [Boolean, nil] comparison result (nil for an unrecognised +op+)
def do_compare(op, l, r)
  if l.is_a?(Array) || r.is_a?(Array)
    left_set = l.is_a?(Array) ? l : [l]
    right_set = r.is_a?(Array) ? r : [r]
    return compare_node_sets(op, left_set, right_set)
  end

  if op == :eq || op == :neq
    any_boolean = [l, r].any? { |v| v.is_a?(TrueClass) || v.is_a?(FalseClass) }
    same =
      if any_boolean
        xpath_boolean(l) == xpath_boolean(r)
      elsif l.is_a?(Numeric) || r.is_a?(Numeric)
        xpath_number(l) == xpath_number(r)
      else
        xpath_string(l) == xpath_string(r)
      end
    return op == :eq ? same : !same
  end

  left_num = xpath_number(l)
  right_num = xpath_number(r)
  return false if [left_num, right_num].any? { |v| v.is_a?(Float) && v.nan? }

  comparator = { lt: :<, le: :<=, gt: :>, ge: :>= }[op]
  comparator && left_num.public_send(comparator, right_num)
end

#eval_expr(ast, context_set, position_info) ⇒ Object

eval_expr returns one of:

Array<Node|String>   (node-set or string-set for /@x and /text())
String / Numeric / TrueClass / FalseClass / NilClass (scalar)


839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
# File 'lib/scrapetor/xpath.rb', line 839

# Evaluates one AST node against the current context.
#
# @param ast [Hash] AST node; +ast[:t]+ selects the node kind
# @param context_set [Array] current context node(s)
# @param position_info [Hash, nil] { position:, last: } of the context node
# @return [Array, String, Numeric, Boolean, nil] a node-set (Array) or scalar
# @raise [UnsupportedError] for an AST node kind that is not handled
def eval_expr(ast, context_set, position_info)
  # Evaluates a sub-expression in the same context.
  evaluate = ->(node) { eval_expr(node, context_set, position_info) }
  # Evaluates a sub-expression and coerces the result to a number.
  number_of = ->(node) { xpath_number(evaluate.call(node)) }

  case ast[:t]
  when :path
    eval_path(ast, context_set, position_info)
  when :filter
    apply_predicates(evaluate.call(ast[:primary]), ast[:preds])
  when :filter_path
    eval_steps_against(evaluate.call(ast[:primary]), ast[:steps])
  when :union
    # Operands are concatenated in order; repeats (by node identity) are dropped.
    seen = {}
    ast[:ops].each_with_object([]) do |operand, merged|
      value = evaluate.call(operand)
      (value.is_a?(Array) ? value : [value]).each do |n|
        key = node_identity(n)
        next if seen[key]
        seen[key] = true
        merged << n
      end
    end
  when :or
    xpath_boolean(evaluate.call(ast[:l])) || xpath_boolean(evaluate.call(ast[:r]))
  when :and
    xpath_boolean(evaluate.call(ast[:l])) && xpath_boolean(evaluate.call(ast[:r]))
  when :cmp
    do_compare(ast[:op], evaluate.call(ast[:l]), evaluate.call(ast[:r]))
  when :add
    l = number_of.call(ast[:l])
    r = number_of.call(ast[:r])
    # No explicit NaN guard: Float arithmetic already propagates NaN, and
    # Integer operands do not respond to #nan?.
    ast[:op] == :plus ? (l + r) : (l - r)
  when :mul
    l = number_of.call(ast[:l])
    r = number_of.call(ast[:r])
    case ast[:op]
    when :mul then l * r
    when :div
      # Float division yields Infinity / -Infinity / NaN for a zero divisor
      # and keeps a NaN dividend as NaN.
      l.to_f / r.to_f
    when :mod
      xpath_mod(l, r)
    end
  when :neg
    -number_of.call(ast[:e])
  when :num, :str
    ast[:v]
  when :func
    call_function(ast[:name], ast[:args], context_set, position_info)
  else
    raise UnsupportedError, "unknown AST node: #{ast[:t]}"
  end
end

# Remainder of a truncating division: the result takes the sign of the
# dividend. NaN when either operand is NaN, the dividend is infinite, or the
# divisor is zero; the dividend itself when only the divisor is infinite.
def xpath_mod(l, r)
  dividend = l.to_f
  divisor = r.to_f
  return Float::NAN if dividend.nan? || divisor.nan? || dividend.infinite? || divisor.zero?
  return l if divisor.infinite?

  l.remainder(r)
end

#eval_path(ast, context_set, position_info) ⇒ Object



901
902
903
904
905
906
907
908
909
# File 'lib/scrapetor/xpath.rb', line 901

# Evaluates a location path.
#
# Absolute paths start from +root_for_context+; relative paths start from
# the current context set. +position_info+ is accepted for signature
# symmetry and is not used here.
#
# @param ast [Hash] path AST with :absolute and :steps
# @param context_set [Array] current context nodes
# @param position_info [Hash, nil] unused
# @return [Array] result of walking the steps
def eval_path(ast, context_set, position_info)
  start = ast[:absolute] ? [root_for_context] : context_set
  eval_steps_against(start, ast[:steps])
end

#eval_program(ast) ⇒ Object



818
819
820
821
822
# File 'lib/scrapetor/xpath.rb', line 818

# Evaluates a whole expression from the initial context node.
#
# @param ast [Hash] root AST node
# @return [Array] the result as-is when it is an Array, otherwise the
#   scalar result wrapped in a one-element Array
def eval_program(ast)
  value = eval_expr(ast, [initial_context_node], nil)
  return value if value.is_a?(Array)

  # Scalars are wrapped so callers always receive an Array.
  [value]
end

#eval_steps_against(nodes, steps) ⇒ Object



911
912
913
914
915
916
917
918
# File 'lib/scrapetor/xpath.rb', line 911

# Walks a sequence of location steps starting from +nodes+.
#
# Each step first expands the current set along its axis (+step_walk+) and
# then, when the step carries predicates, filters the expansion with
# +apply_step_predicates+.
#
# @param nodes [Array] starting node set
# @param steps [Array<Hash>] step ASTs, each with a :preds list
# @return [Array] the node set after the final step
def eval_steps_against(nodes, steps)
  steps.reduce(nodes) do |current, st|
    walked = step_walk(current, st)
    st[:preds].empty? ? walked : apply_step_predicates(walked, st[:preds])
  end
end

#following_of(n) ⇒ Object



1186
1187
1188
1189
1190
# File 'lib/scrapetor/xpath.rb', line 1186

# Nodes on the following axis of +n+.
#
# Only arena-backed nodes are supported: the ids returned by the arena's
# +node_following_ids+ are each wrapped as an element (type 1). Any other
# value yields an empty list.
#
# @param n [Object] context node
# @return [Array] wrapped following nodes
def following_of(n)
  arena, node_id, owner = arena_handle_for(n)
  if arena
    arena.node_following_ids(node_id).map do |following_id|
      wrap_native_typed_with(arena, following_id, 1, wrapper: owner)
    end
  else
    []
  end
end

#following_siblings_of(n) ⇒ Object



1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
# File 'lib/scrapetor/xpath.rb', line 1152

# Element siblings that come after +n+.
#
# Arena-backed nodes delegate to +node_following_sibling_ids+ and wrap each
# id as an element (type 1). A plain Scrapetor::Node follows its
# +next_sibling+ chain and keeps the entries that report +element?+.
# Anything else has no siblings.
#
# @param n [Object] context node
# @return [Array] following element siblings, nearest first
def following_siblings_of(n)
  arena, node_id, owner = arena_handle_for(n)
  if arena
    return arena.node_following_sibling_ids(node_id).map { |sibling_id|
      wrap_native_typed_with(arena, sibling_id, 1, wrapper: owner)
    }
  end
  return [] unless n.is_a?(Scrapetor::Node)

  trail = []
  walker = n.next_sibling
  while walker
    trail << walker
    # A sibling that cannot report its own successor ends the walk.
    walker = walker.respond_to?(:next_sibling) ? walker.next_sibling : nil
  end
  trail.select { |sibling| sibling.respond_to?(:element?) && sibling.element? }
end

#initial_context_node ⇒ Object



824
825
826
827
828
829
830
831
832
833
834
# File 'lib/scrapetor/xpath.rb', line 824

# Chooses the node evaluation starts from and stores it in @initial.
#
# A Scrapetor::Document input starts from its +backing+ object; any other
# input is itself the starting node.
#
# @return [Object] the initial context node
def initial_context_node
  input = @context_input
  @initial = input.is_a?(Scrapetor::Document) ? input.backing : input
end

#matches_node_test?(n, nt) ⇒ Boolean

Returns:

  • (Boolean)


1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
# File 'lib/scrapetor/xpath.rb', line 1231

# Decides whether node +n+ satisfies node test +nt+.
#
# * :node        — always true
# * :any_element — +n.element?+ when available, otherwise true for a name
#                  that does not start with "#"
# * :text        — +n.text?+
# * :comment     — +n.comment?+
# * Hash         — requires +n+ to have a name; a :name of "*" matches,
#                  otherwise the names must be equal ignoring ASCII case and
#                  the node's name must not start with "#"
# * anything else — false
#
# @param n [Object] candidate node
# @param nt [Symbol, Hash] node test
# @return [Boolean]
def matches_node_test?(n, nt)
  if nt.is_a?(Hash)
    return false unless n.respond_to?(:name)

    wanted = nt[:name]
    return false if wanted.nil?
    return true if wanted == "*"

    actual = n.name
    return false if actual.nil? || actual.start_with?("#")
    return actual.casecmp(wanted).zero?
  end

  if nt == :node
    true
  elsif nt == :any_element
    n.respond_to?(:element?) ? n.element? : (n.respond_to?(:name) && !n.name.start_with?("#"))
  elsif nt == :text
    n.respond_to?(:text?) && n.text?
  elsif nt == :comment
    n.respond_to?(:comment?) && n.comment?
  else
    false
  end
end

#native_handles_for(context) ⇒ Object

—- Native wrapping helpers ————————————–



1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
# File 'lib/scrapetor/xpath.rb', line 1256

# Finds the native arena behind an evaluation context, if there is one.
#
# A Scrapetor::Document qualifies when its +backing+ is a
# Native::DocumentWrapper (and that constant is defined) whose +native+
# object responds to +node_following_sibling_ids+; its root id is 0.
# A Scrapetor::Node qualifies when its +backing_node+ responds to +id+ and
# +doc+ and that +doc+ responds to +node_following_sibling_ids+.
#
# @param context [Object] document or node the evaluator was created with
# @return [Array(Object, Object)] [arena, root id], or [nil, nil]
def native_handles_for(context)
  none = [nil, nil]

  if context.is_a?(Scrapetor::Document)
    backing = context.backing
    return none unless defined?(Scrapetor::Native::DocumentWrapper)
    return none unless backing.is_a?(Scrapetor::Native::DocumentWrapper)
    return none unless backing.native.respond_to?(:node_following_sibling_ids)

    [backing.native, 0]
  elsif context.is_a?(Scrapetor::Node)
    backing = context.backing_node
    return none unless backing.respond_to?(:id) && backing.respond_to?(:doc)
    return none unless backing.doc.respond_to?(:node_following_sibling_ids)

    [backing.doc, backing.id]
  else
    none
  end
end

#native_node?(n) ⇒ Boolean

Returns:

  • (Boolean)


1272
1273
1274
1275
# File 'lib/scrapetor/xpath.rb', line 1272

# Reports whether +n+ is a raw arena-backed node.
#
# That is the case when +n+ is truthy, responds to both +id+ and +doc+, and
# its +doc+ responds to +node_following_sibling_ids+.
#
# @param n [Object, nil] candidate
# @return [Boolean]
def native_node?(n)
  if n && n.respond_to?(:id) && n.respond_to?(:doc)
    n.doc.respond_to?(:node_following_sibling_ids)
  else
    false
  end
end

#native_wrapper_for(n) ⇒ Object



1277
1278
1279
1280
1281
# File 'lib/scrapetor/xpath.rb', line 1277

# Finds the document wrapper associated with +n+.
#
# @param n [Object] a Native::DocumentWrapper, or an object exposing +wrapper+
# @return [Object, nil] +n+ itself when it is a wrapper, +n.wrapper+ when
#   available, nil otherwise
def native_wrapper_for(n)
  if n.is_a?(Scrapetor::Native::DocumentWrapper)
    n
  elsif n.respond_to?(:wrapper)
    n.wrapper
  end
end

#node_identity(n) ⇒ Object



1322
1323
1324
1325
1326
1327
1328
1329
# File 'lib/scrapetor/xpath.rb', line 1322

# Produces a key that is equal for two values denoting the same node.
#
# For a Scrapetor::Node whose backing node has an +id+, the key is
# [:nat, object_id of the backing +doc+ (nil if it has none), id], so
# separate wrappers around one arena slot compare equal. For any other
# Scrapetor::Node the key is the backing node's object_id, and for
# everything else the value's own object_id.
#
# @param n [Object] node or value
# @return [Array, Integer] identity key
def node_identity(n)
  return n.object_id unless n.is_a?(Scrapetor::Node)

  backing = n.backing_node
  return backing.object_id unless backing.respond_to?(:id)

  arena_key = backing.respond_to?(:doc) ? backing.doc.object_id : nil
  [:nat, arena_key, backing.id]
end

#parent_of(n) ⇒ Object



1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
# File 'lib/scrapetor/xpath.rb', line 1011

# Finds the parent of +n+.
#
# * nil has no parent.
# * A raw native node asks its arena for +node_parent+; the id is wrapped
#   with +wrap_native+, and a missing id means the parent is the document.
# * A Native::DocumentWrapper or Scrapetor::Document has no parent.
# * Anything else that is a Scrapetor::Node or responds to +parent+
#   answers with +n.parent+.
#
# @param n [Object, nil] context node
# @return [Object, nil] the parent, or nil when there is none
def parent_of(n)
  return nil if n.nil?

  if native_node?(n)
    parent_id = n.doc.node_parent(n.id)
    return parent_id ? wrap_native(parent_id) : @document
  end
  return nil if n.is_a?(Scrapetor::Native::DocumentWrapper) || n.is_a?(Scrapetor::Document)

  n.parent if n.is_a?(Scrapetor::Node) || n.respond_to?(:parent)
end

#preceding_of(n) ⇒ Object



1192
1193
1194
1195
1196
# File 'lib/scrapetor/xpath.rb', line 1192

# Nodes on the preceding axis of +n+.
#
# Only arena-backed nodes are supported: the ids returned by the arena's
# +node_preceding_ids+ are each wrapped as an element (type 1). Any other
# value yields an empty list.
#
# @param n [Object] context node
# @return [Array] wrapped preceding nodes
def preceding_of(n)
  arena, node_id, owner = arena_handle_for(n)
  if arena
    arena.node_preceding_ids(node_id).map do |preceding_id|
      wrap_native_typed_with(arena, preceding_id, 1, wrapper: owner)
    end
  else
    []
  end
end

#preceding_siblings_of(n) ⇒ Object



1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
# File 'lib/scrapetor/xpath.rb', line 1169

# Element siblings that come before +n+.
#
# Arena-backed nodes delegate to +node_preceding_sibling_ids+ and wrap each
# id as an element (type 1). A plain Scrapetor::Node follows its
# +previous_sibling+ chain, keeps the entries that report +element?+, and
# returns them farthest-first. Anything else has no siblings.
#
# @param n [Object] context node
# @return [Array] preceding element siblings
def preceding_siblings_of(n)
  arena, node_id, owner = arena_handle_for(n)
  if arena
    return arena.node_preceding_sibling_ids(node_id).map { |sibling_id|
      wrap_native_typed_with(arena, sibling_id, 1, wrapper: owner)
    }
  end
  return [] unless n.is_a?(Scrapetor::Node)

  trail = []
  walker = n.previous_sibling
  while walker
    trail << walker
    # A sibling that cannot report its own predecessor ends the walk.
    walker = walker.respond_to?(:previous_sibling) ? walker.previous_sibling : nil
  end
  # The walk ran nearest-first; flip it so the farthest sibling leads.
  trail.reverse.select { |sibling| sibling.respond_to?(:element?) && sibling.element? }
end

#push_if_matches(n, nt, out) ⇒ Object



1226
1227
1228
1229
# File 'lib/scrapetor/xpath.rb', line 1226

# Appends +n+ to +out+ when it satisfies node test +nt+.
#
# @param n [Object] candidate node
# @param nt [Symbol, Hash] node test
# @param out [Array] accumulator, mutated in place
# @return [Array, nil] +out+ when the node was appended, nil otherwise
def push_if_matches(n, nt, out)
  out << n if matches_node_test?(n, nt)
end

#root_for_context ⇒ Object

—- Node identity / wrapping ————————————



1007
1008
1009
# File 'lib/scrapetor/xpath.rb', line 1007

# The node absolute paths start from: the +backing+ object of the
# evaluator's document.
#
# @return [Object] the document's backing object
def root_for_context
  owning_document = @document
  owning_document.backing
end

#step_walk(current, st) ⇒ Object



920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
# File 'lib/scrapetor/xpath.rb', line 920

# Expands a context set along one location step's axis.
#
# Every context node contributes the nodes its axis reaches that satisfy
# the step's node test. The namespace axis and unrecognised axes
# contribute nothing. The combined list is passed through
# +dedupe_node_set+ because axis walks from different context nodes can
# reach the same node more than once.
#
# @param current [Array] context nodes
# @param st [Hash] step AST with :axis and :nt
# @return [Array] duplicate-free result of the step
def step_walk(current, st)
  axis = st[:axis]
  nt = st[:nt]
  out = []
  # Runs the node test over a list of candidates produced by an axis helper.
  keep_matching = ->(candidates) { candidates.each { |c| push_if_matches(c, nt, out) } }

  current.each do |n|
    case axis
    when :child
      collect_children(n, nt, out)
    when :descendant
      collect_descendants(n, nt, out)
    when :descendant_or_self
      collect_self(n, nt, out)
      collect_descendants(n, nt, out)
    when :self
      collect_self(n, nt, out)
    when :parent
      up = parent_of(n)
      push_if_matches(up, nt, out) if up
    when :ancestor, :ancestor_or_self
      keep_matching.call(ancestors_of(n))
      collect_self(n, nt, out) if axis == :ancestor_or_self
    when :following_sibling
      keep_matching.call(following_siblings_of(n))
    when :preceding_sibling
      keep_matching.call(preceding_siblings_of(n))
    when :following
      keep_matching.call(following_of(n))
    when :preceding
      keep_matching.call(preceding_of(n))
    when :attribute
      collect_attributes(n, nt, out)
    end
  end

  dedupe_node_set(out)
end

#wrap_dom(node) ⇒ Object



1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
# File 'lib/scrapetor/xpath.rb', line 1309

# Converts a raw DOM node into the value the evaluator works with.
#
# * Scrapetor::Node / Scrapetor::CommentNode instances pass through.
# * Comments become a Scrapetor::CommentNode holding their text.
# * Text nodes become their text.
# * Elements become a Scrapetor::Node around the raw node.
# * Everything else is returned unchanged.
#
# The text of a comment or text node is its +content+ when it has one,
# otherwise its +to_s+.
#
# @param node [Object] raw DOM node or already-wrapped value
# @return [Object] wrapped value
def wrap_dom(node)
  return node if node.is_a?(Scrapetor::Node) || node.is_a?(Scrapetor::CommentNode)

  payload = -> { node.respond_to?(:content) ? node.content : node.to_s }
  return Scrapetor::CommentNode.new(@document, payload.call) if node.respond_to?(:comment?) && node.comment?
  return payload.call if node.respond_to?(:text?) && node.text?
  return Scrapetor::Node.new(@document, node) if node.respond_to?(:element?) && node.element?

  node
end

#wrap_native(id) ⇒ Object



1283
1284
1285
1286
# File 'lib/scrapetor/xpath.rb', line 1283

# Wraps the arena node +id+ of the evaluator's own native document,
# looking its type up via +node_type+.
#
# @param id [Object, nil] arena node id
# @return [Object, nil] wrapped node; nil when there is no native document
#   or +id+ is nil
def wrap_native(id)
  if @native_doc.nil? || id.nil?
    nil
  else
    wrap_native_typed(id, @native_doc.node_type(id))
  end
end

#wrap_native_typed(id, type) ⇒ Object



1288
1289
1290
# File 'lib/scrapetor/xpath.rb', line 1288

# Wraps arena node +id+ of the evaluator's own native document as +type+.
#
# The wrapper handed on is +@initial.wrapper+ when the initial context
# node exposes one, otherwise nil.
#
# @param id [Object] arena node id
# @param type [Integer] node type code
# @return [Object, nil] whatever +wrap_native_typed_with+ returns
def wrap_native_typed(id, type)
  owner = @initial.respond_to?(:wrapper) ? @initial.wrapper : nil
  wrap_native_typed_with(@native_doc, id, type, wrapper: owner)
end

#wrap_native_typed_with(nd, id, type, wrapper: nil) ⇒ Object



1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
# File 'lib/scrapetor/xpath.rb', line 1292

# Builds the Ruby-side value for arena node +id+ of arena +nd+.
#
# * type 1 → Scrapetor::Node around a Native::Element
# * type 8 → Scrapetor::CommentNode built from +node_comment_text+
# * type 3 → Scrapetor::TextNode built from +node_text+
# * other  → nil
#
# @param nd [Object] native arena
# @param id [Object] arena node id
# @param type [Integer] node type code
# @param wrapper [Object, nil] document wrapper handed to the Native::Element
# @return [Object, nil] wrapped node, or nil for unsupported types
def wrap_native_typed_with(nd, id, type, wrapper: nil)
  if type == 1
    element = Scrapetor::Native::Element.new(nd, id, wrapper)
    Scrapetor::Node.new(@document, element)
  elsif type == 8
    Scrapetor::CommentNode.new(@document, nd.node_comment_text(id))
  elsif type == 3
    Scrapetor::TextNode.new(nd.node_text(id))
  end
end

#xpath_boolean(v) ⇒ Object

—- XPath type coercions —————————————-



1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
# File 'lib/scrapetor/xpath.rb', line 1333

# Converts a value to an XPath boolean.
#
# nil is false; booleans are themselves; numbers are false when zero or
# NaN; strings and Arrays (node-sets) are false when empty; every other
# object is true.
#
# @param v [Object] value to convert
# @return [Boolean]
def xpath_boolean(v)
  return false if v.nil?
  return v if v.equal?(true) || v.equal?(false)
  return !v.zero? && !(v.respond_to?(:nan?) && v.nan?) if v.is_a?(Numeric)
  return !v.empty? if v.is_a?(String) || v.is_a?(Array)

  true
end

#xpath_number(v) ⇒ Object



1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
# File 'lib/scrapetor/xpath.rb', line 1377

# Converts a value to an XPath number.
#
# nil is NaN; numbers are themselves; true / false are 1 / 0. A String is
# stripped and must then look like an optionally negative decimal number
# ("12", "-3.5", "5.", ".25"); it becomes a Float when it contains a ".",
# an Integer otherwise, and NaN when it is empty or malformed. Every other
# value is first converted with +xpath_string+.
#
# @param v [Object] value to convert
# @return [Numeric] Integer, Float, or Float::NAN
def xpath_number(v)
  return Float::NAN if v.nil?
  return v if v.is_a?(Numeric)
  return 1 if v.equal?(true)
  return 0 if v.equal?(false)
  return xpath_number(xpath_string(v)) unless v.is_a?(String)

  text = v.strip
  return Float::NAN if text.empty?

  numeric_literal = text.match?(/\A-?\d+\.?\d*\z/) || text.match?(/\A-?\.\d+\z/)
  return Float::NAN unless numeric_literal

  text.include?(".") ? text.to_f : text.to_i
end

#xpath_string(v) ⇒ Object



1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
# File 'lib/scrapetor/xpath.rb', line 1344

# Converts a value to an XPath string.
#
# nil is ""; strings are themselves; booleans are "true" / "false". Floats
# render as "NaN", "Infinity" or "-Infinity" when not finite, without a
# fractional part when they are whole, and via +to_s+ otherwise; other
# numbers use +to_s+. An Array (node-set) takes the string-value of its
# first entry, and any other object is treated as a node.
#
# @param v [Object] value to convert
# @return [String]
def xpath_string(v)
  if v.nil?
    ""
  elsif v.is_a?(String)
    v
  elsif v.equal?(true)
    "true"
  elsif v.equal?(false)
    "false"
  elsif v.is_a?(Float)
    if v.nan?
      "NaN"
    elsif v.infinite?
      v.positive? ? "Infinity" : "-Infinity"
    else
      whole = v.to_i
      v == whole ? whole.to_s : v.to_s
    end
  elsif v.is_a?(Numeric)
    v.to_s
  else
    xpath_string_for_node(v.is_a?(Array) ? v.first : v)
  end
end

#xpath_string_for_node(n) ⇒ Object



1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
# File 'lib/scrapetor/xpath.rb', line 1365

# String-value of a single node.
#
# nil is ""; a String is itself; an object with +text+ yields +text.to_s+;
# otherwise +to_s+ is used when available, and "" as a last resort.
#
# @param n [Object, nil] node or string
# @return [String]
def xpath_string_for_node(n)
  case n
  when nil
    ""
  when String
    n
  else
    if n.respond_to?(:text)
      n.text.to_s
    elsif n.respond_to?(:to_s)
      n.to_s
    else
      ""
    end
  end
end