Module: SmarterJSON::Recovery

Includes:
Bytes
Defined in:
lib/smarter_json/parser.rb

Constant Summary

Constants included from Bytes

Bytes::BACKSLASH, Bytes::COLON, Bytes::COMMA, Bytes::CR, Bytes::DOLLAR, Bytes::DOT, Bytes::DQUOTE, Bytes::HASH, Bytes::LBRACE, Bytes::LBRACKET, Bytes::LF, Bytes::LOWER_E, Bytes::LOWER_F, Bytes::LOWER_N, Bytes::LOWER_T, Bytes::LOWER_U, Bytes::LOWER_X, Bytes::MINUS, Bytes::NINE, Bytes::PLUS, Bytes::RBRACE, Bytes::RBRACKET, Bytes::SLASH, Bytes::SPACE, Bytes::SQUOTE, Bytes::STAR, Bytes::TAB, Bytes::UNDERSCORE, Bytes::UPPER_E, Bytes::UPPER_F, Bytes::UPPER_I, Bytes::UPPER_N, Bytes::UPPER_T, Bytes::UPPER_X, Bytes::ZERO

Class Method Summary collapse

Class Method Details

.candidate_ranges(input) ⇒ Object



846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
# File 'lib/smarter_json/parser.rb', line 846

# Scans +input+ byte by byte and returns the byte ranges of every outermost,
# bracket-balanced `{...}` / `[...]` region: the candidates for an embedded payload.
#
# Brackets are ignored while the scanner is inside a double-quoted, single-quoted
# or '''-quoted string, a `//` or `#` line comment, or a `/* */` block comment.
# A backslash inside a double- or single-quoted string skips the following byte.
# A closer only pops the stack when it matches the innermost open bracket; a
# mismatched closer is left alone, so a region that never balances yields no range.
#
# All marker detection uses +getbyte+ comparisons, so the scan allocates nothing
# per byte (it may run over very large inputs).
#
# @param input [String] text to scan
# @return [Array<Range>] exclusive byte ranges (start...end) in order of appearance
def candidate_ranges(input)
  ranges = []
  stack = []       # byte values of the brackets that are currently open
  start_pos = nil  # byte offset where the current outermost bracket opened
  mode = nil       # nil, :double, :single, :triple, :line_comment or :block_comment
  size = input.bytesize
  i = 0
  while i < size
    b = input.getbyte(i)
    case mode
    when :double, :single
      if b == BACKSLASH
        i += 2 # skip the escaped byte as well
      else
        mode = nil if b == (mode == :double ? DQUOTE : SQUOTE)
        i += 1
      end
    when :triple
      if b == SQUOTE && input.getbyte(i + 1) == SQUOTE && input.getbyte(i + 2) == SQUOTE
        mode = nil
        i += 3
      else
        i += 1
      end
    when :line_comment
      # The line break itself ends the comment and is consumed here.
      mode = nil if b == LF || b == CR
      i += 1
    when :block_comment
      if b == STAR && input.getbyte(i + 1) == SLASH
        mode = nil
        i += 2
      else
        i += 1
      end
    else
      if b == SLASH && input.getbyte(i + 1) == SLASH
        mode = :line_comment
        i += 2
      elsif b == SLASH && input.getbyte(i + 1) == STAR
        mode = :block_comment
        i += 2
      elsif b == HASH
        mode = :line_comment
        i += 1
      elsif b == DQUOTE
        mode = :double
        i += 1
      elsif b == SQUOTE
        # Three quotes in a row open a triple-quoted string; otherwise a single-quoted one.
        if input.getbyte(i + 1) == SQUOTE && input.getbyte(i + 2) == SQUOTE
          mode = :triple
          i += 3
        else
          mode = :single
          i += 1
        end
      else
        if b == LBRACE || b == LBRACKET
          start_pos = i if stack.empty?
          stack << b
        elsif b == RBRACE || b == RBRACKET
          opener = b == RBRACE ? LBRACE : LBRACKET
          stack.pop if stack.last == opener
          if stack.empty? && start_pos
            ranges << (start_pos...(i + 1))
            start_pos = nil
          end
        end
        i += 1
      end
    end
  end
  ranges
end

.emit_wrapper_warnings(payloads, handler) ⇒ Object



751
752
753
754
755
756
757
758
759
# File 'lib/smarter_json/parser.rb', line 751

# Reports, through +handler+, which kinds of wrapper material were dropped around
# the recovered payloads. Only the first payload's :meta hash is consulted.
# Prefix, fence and wrapper warnings are positioned at meta[:first_pos]; the
# suffix warning is positioned at meta[:last_pos].
#
# @param payloads [Array<Hash>] recovered payloads, each carrying a :meta hash
# @param handler [#call, nil] warning callback; nothing happens when it is nil/false
# @return [Object, nil] result of the last warning emitted for the suffix, else nil
def emit_wrapper_warnings(payloads, handler)
  return unless handler

  meta = payloads.first[:meta]

  if meta[:prefix]
    warn(handler, :prefix_text_ignored, "ignored non-JSON text before the payload", *meta[:first_pos])
  end

  if meta[:fence]
    warn(handler, :code_fence_stripped, "stripped markdown code fences around the payload", *meta[:first_pos])
  end

  if meta[:wrapper]
    warn(handler, :wrapper_tag_stripped, "stripped wrapper tags around the payload", *meta[:first_pos])
  end

  return unless meta[:suffix]

  warn(handler, :suffix_text_ignored, "ignored non-JSON text after the payload", *meta[:last_pos])
end

.extract_payloads(input, options) ⇒ Object



761
762
763
764
765
766
767
768
769
770
771
772
773
774
# File 'lib/smarter_json/parser.rb', line 761

# Collects every candidate range of +input+ that parses on its own.
#
# Each candidate slice is trial-parsed via SmarterJSON.process_content with the
# warning handler disabled; a slice that raises ParseError is dropped. Every
# surviving payload receives the same :meta hash, computed once by wrapper_meta
# from the ranges of all surviving payloads.
#
# @param input [String] full text to search
# @param options [Hash] parse options (on_warning is overridden with nil for the trial parse)
# @return [Array<Hash>] hashes with :slice, :range and :meta keys
def extract_payloads(input, options)
  found = []
  candidate_ranges(input).each do |span|
    text = input.byteslice(span.begin, span.end - span.begin)
    begin
      # Trial parse only: the result is discarded and warnings are silenced.
      SmarterJSON.send(:process_content, text, options.merge(on_warning: nil))
    rescue ParseError
      next
    end
    found << { slice: text, range: span }
  end

  shared_meta = wrapper_meta(input, found.map { |entry| entry[:range] })
  found.each { |entry| entry[:meta] = shared_meta }
end

.leading_label?(input) ⇒ Boolean

Whether the input opens with a bare "JSON:" / "Final answer:" label (which would otherwise parse, wrongly, as an implicit-root object keyed by the label). We use String#start_with? with a Regexp rather than match?(/\A.../): start_with? checks only the beginning, whereas a \A-anchored match? still retries at every byte position and so scans the WHOLE input (≈0.3s on a 200 MB document) on every parse. (Caller has already established the input is valid_encoding?.)

Returns:

  • (Boolean)


724
725
726
# File 'lib/smarter_json/parser.rb', line 724

# Tells whether +input+ begins (after optional whitespace) with a "JSON" or
# "Final answer" label followed by a colon, matched case-insensitively.
#
# @param input [String]
# @return [Boolean]
def leading_label?(input)
  # start_with? only tests the beginning of the string.
  label_prefix = /[[:space:]]*(?:JSON|Final answer)[[:space:]]*:/i
  input.start_with?(label_prefix)
end

.line_col_for(input, offset) ⇒ Object



808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
# File 'lib/smarter_json/parser.rb', line 808

# Converts a byte +offset+ into a 1-based [line, column] pair by walking the
# bytes that precede it.
#
# LF, CR and CR+LF each count as one line break and reset the column to 1.
# Every other byte advances the column by one, so columns are counted in bytes.
# Walking stops at the end of +input+ when +offset+ lies beyond it.
#
# @param input [String]
# @param offset [Integer] byte offset into +input+
# @return [Array(Integer, Integer)] line and column, both starting at 1
def line_col_for(input, offset)
  line = 1
  col = 1
  stop = [offset, input.bytesize].min
  pos = 0
  while pos < stop
    byte = input.getbyte(pos)
    pos += 1
    case byte
    when LF
      line += 1
      col = 1
    when CR
      line += 1
      col = 1
      # Swallow the LF of a CR+LF pair, but only if it also lies before the offset.
      pos += 1 if pos < stop && input.getbyte(pos) == LF
    else
      col += 1
    end
  end
  [line, col]
end

.non_payload_text(input, ranges) ⇒ Object



797
798
799
800
801
802
803
804
805
806
# File 'lib/smarter_json/parser.rb', line 797

# Concatenates everything in +input+ that lies outside the given byte ranges:
# the text before the first range, between consecutive ranges, and after the last.
#
# @param input [String]
# @param ranges [Array<Range>] exclusive byte ranges, walked in the given order
# @return [String] the bytes not covered by any range
def non_payload_text(input, ranges)
  total = input.bytesize
  cursor = 0
  gaps = ranges.each_with_object(+"") do |span, buffer|
    gap = span.begin - cursor
    buffer << input.byteslice(cursor, gap) if gap.positive?
    cursor = span.end
  end
  gaps << input.byteslice(cursor, total - cursor) if cursor < total
  gaps
end

.process_string(input, options, &block) ⇒ Object



672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
# File 'lib/smarter_json/parser.rb', line 672

# Parses +input+, falling back to payload recovery when the straight parse fails.
#
# Flow, in order:
# 1. +input+ is passed through SmarterJSON.normalize_default_encoding.
# 2. If SmarterJSON.unscannable_encoding returns an encoding, a UTF-8 copy is parsed
#    by a recursive call and every resulting document is passed through
#    SmarterJSON.deep_encode with the target encoding.
# 3. Input that is not valid_encoding? is handed straight to process_content.
# 4. Input that opens with a bare label (see leading_label?) goes through payload
#    extraction first; if nothing is extracted it falls through to the normal parse.
# 5. Otherwise process_content parses the whole input.
#
# The method-level rescue applies to every step above: on a ParseError that is not
# an EncodingError, payloads are extracted from +input+ and replayed; when none are
# found the original error is re-raised.
#
# NOTE(review): with a block, if process_content yields some documents before it
# raises, the replay may yield documents again — confirm whether duplicates can occur.
#
# @param input [String] the text to parse
# @param options [Hash] parse options; :replace_char and :on_warning are read here or downstream
# @yield [doc] each parsed document, when a block is given
# @return [Object] whatever process_content / replay_payloads returns (the no-block
#   unscannable-encoding path calls +map+ on it)
# @raise [ParseError] when the parse fails and no payload can be recovered
def process_string(input, options, &block)
  input = SmarterJSON.send(:normalize_default_encoding, input, options)

  # UTF-16 / UTF-32 / Shift_JIS / ... cannot be byte-scanned for JSON structure. Parse
  # a UTF-8 copy and emit each document's strings back in the encoding the bytes arrived
  # in — the caller always gets values in the encoding they handed us, never UTF-8.
  if (target_enc = SmarterJSON.send(:unscannable_encoding, input))
    target_enc = SmarterJSON.send(:concrete_unicode_encoding, input, target_enc) # avoid per-string BOMs
    opts = options.merge(encoding: nil) # the working copy is UTF-8; don't re-label it downstream
    utf8 = SmarterJSON.send(:to_utf8_copy, input) # invalid bytes -> SmarterJSON::EncodingError
    replace = options[:replace_char]
    if block
      # Streaming form: re-encode each document just before handing it to the caller's block.
      return process_string(utf8, opts) { |doc| block.call(SmarterJSON.send(:deep_encode, doc, target_enc, replace)) }
    end

    # Non-streaming form: re-encode every document of the returned collection.
    return process_string(utf8, opts).map { |doc| SmarterJSON.send(:deep_encode, doc, target_enc, replace) }
  end

  # Invalid byte sequences skip the leading-label check below and are parsed directly.
  return SmarterJSON.send(:process_content, input, options, &block) unless input.valid_encoding?

  # Recovery is REACTIVE: parse first, and only fall back to wrapper extraction when
  # the parse actually fails (the rescue below). Every wrapper shape — code fences,
  # <json>/BEGIN_JSON tags, prose around the payload — makes the parse raise, so the
  # rescue catches it. Crucially this keeps clean input on the single-parse fast path
  # even when its string values legitimately contain ``` or <json> (real-world data
  # like GitHub event payloads is full of markdown), instead of dragging hundreds of
  # MB through the pure-Ruby candidate scan.
  #
  # The one exception is a bare leading label like "JSON: {...}", which parses
  # successfully but WRONGLY (as an implicit-root object keyed by the label), so it
  # must be intercepted before parsing.
  if leading_label?(input)
    payloads = extract_payloads(input, options)
    return replay_payloads(payloads, options, &block) unless payloads.empty?
  end

  SmarterJSON.send(:process_content, input, options, &block)
rescue ParseError => e
  # Encoding failures are not recoverable by extraction; propagate them untouched.
  raise if e.is_a?(EncodingError)

  payloads = extract_payloads(input, options)
  return replay_payloads(payloads, options, &block) unless payloads.empty?

  # Nothing recoverable: surface the original parse error.
  raise
end

.replay_payloads(payloads, options, &block) ⇒ Object



728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
# File 'lib/smarter_json/parser.rb', line 728

# Emits the wrapper warnings once, then parses every recovered payload slice
# with the caller's real options.
#
# Without a block, each payload's process_content result is flattened (one
# level) into a single Array of documents. With a block, every document is
# passed to the block and the number of documents yielded is returned.
#
# @param payloads [Array<Hash>] recovered payloads (each with :slice and :meta)
# @param options [Hash] parse options; :on_warning supplies the warning handler
# @yield [doc] each recovered document, when a block is given
# @return [Array, Integer] flat document list, or the yielded-document count
def replay_payloads(payloads, options, &block)
  emit_wrapper_warnings(payloads, options[:on_warning])

  unless block
    # flat_map keeps the result one flat list of documents rather than a list
    # of per-payload lists.
    return payloads.flat_map do |payload|
      SmarterJSON.send(:process_content, payload[:slice], options)
    end
  end

  yielded = 0
  payloads.each do |payload|
    SmarterJSON.send(:process_content, payload[:slice], options) do |doc|
      block.call(doc)
      yielded += 1
    end
  end
  yielded
end

.substantive_text?(text) ⇒ Boolean

Returns:

  • (Boolean)


833
834
835
836
837
838
839
840
# File 'lib/smarter_json/parser.rb', line 833

# Decides whether +text+ contains anything beyond comments, whitespace, a lone
# markdown fence marker, or a lone wrapper tag.
#
# Block comments (/* ... */) and lines that start with # or // are removed
# first. What remains, stripped, counts as substantive unless it is empty, is
# exactly a ``` fence with an optional language tag, or is exactly one of
# <json>, </json>, BEGIN_JSON or END_JSON (case-insensitive).
#
# @param text [String, nil]
# @return [Boolean]
def substantive_text?(text)
  return false if text.nil? || text.empty?

  remainder = text.gsub(%r{/\*.*?\*/}m, "").gsub(/^\s*(?:#|\/\/).*$/, "").strip
  return false if remainder.empty?
  return false if remainder.match?(/\A(?:```[a-zA-Z0-9_-]*)?\z/)

  !remainder.match?(/\A(?:<\/?json>|BEGIN_JSON|END_JSON)\z/i)
end

.warn(handler, type, message, line, col) ⇒ Object



842
843
844
# File 'lib/smarter_json/parser.rb', line 842

# Builds a Warning from the given details and hands it to +handler+.
#
# Because this method is named +warn+, receiver-less calls to +warn+ from the
# sibling methods resolve here rather than to Kernel#warn.
#
# @param handler [#call] warning callback
# @param type [Symbol] warning identifier
# @param message [String] human-readable description
# @param line [Integer] line of the reported position
# @param col [Integer] column of the reported position
# @return [Object] whatever the handler returns
def warn(handler, type, message, line, col)
  warning = Warning.new(type, message, line, col)
  handler.call(warning)
end

.wrapper_meta(input, ranges) ⇒ Object



776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
# File 'lib/smarter_json/parser.rb', line 776

# Describes what surrounds the recovered payload ranges within +input+.
#
# With no ranges, every flag is false and no position keys are present.
# Otherwise the result reports whether the text before the first range and
# after the last range is substantive, whether a ``` fence or a <json /
# BEGIN_JSON marker occurs anywhere outside the ranges, and two [line, col]
# positions. Both positions are taken at the START of a range: :first_pos for
# the first range and :last_pos for the last range.
#
# @param input [String]
# @param ranges [Array<Range>] exclusive byte ranges of the recovered payloads
# @return [Hash] :prefix, :suffix, :fence, :wrapper and, when ranges exist,
#   :first_pos and :last_pos
def wrapper_meta(input, ranges)
  if ranges.empty?
    return { prefix: false, suffix: false, fence: false, wrapper: false }
  end

  head_range = ranges.first
  tail_range = ranges.last
  leading = input.byteslice(0, head_range.begin)
  trailing = input.byteslice(tail_range.end, input.bytesize - tail_range.end)
  # Fence / wrapper markers are searched only in the text outside every payload,
  # so a ``` or <json> inside a payload's own string value is not reported.
  stripped_text = non_payload_text(input, ranges)

  meta = {}
  meta[:prefix] = substantive_text?(leading)
  meta[:suffix] = substantive_text?(trailing)
  meta[:fence] = stripped_text.include?("```")
  meta[:wrapper] = stripped_text.match?(/<json\b|BEGIN_JSON\b/i)
  meta[:first_pos] = line_col_for(input, head_range.begin)
  meta[:last_pos] = line_col_for(input, tail_range.begin)
  meta
end