Module: SmarterJSON::Recovery
- Includes:
- Bytes
- Defined in:
- lib/smarter_json/parser.rb
Constant Summary
Constants included
from Bytes
Bytes::BACKSLASH, Bytes::COLON, Bytes::COMMA, Bytes::CR, Bytes::DOLLAR, Bytes::DOT, Bytes::DQUOTE, Bytes::HASH, Bytes::LBRACE, Bytes::LBRACKET, Bytes::LF, Bytes::LOWER_E, Bytes::LOWER_F, Bytes::LOWER_N, Bytes::LOWER_T, Bytes::LOWER_U, Bytes::LOWER_X, Bytes::MINUS, Bytes::NINE, Bytes::PLUS, Bytes::RBRACE, Bytes::RBRACKET, Bytes::SLASH, Bytes::SPACE, Bytes::SQUOTE, Bytes::STAR, Bytes::TAB, Bytes::UNDERSCORE, Bytes::UPPER_E, Bytes::UPPER_F, Bytes::UPPER_I, Bytes::UPPER_N, Bytes::UPPER_T, Bytes::UPPER_X, Bytes::ZERO
Class Method Summary
collapse
-
.candidate_ranges(input) ⇒ Object
-
.emit_wrapper_warnings(payloads, handler) ⇒ Object
-
.extract_payloads(input, options) ⇒ Object
-
.leading_label?(input) ⇒ Boolean
Whether the input opens with a bare "JSON:" / "Final answer:" label (which would otherwise parse, wrongly, as an implicit-root object keyed by the label).
-
.line_col_for(input, offset) ⇒ Object
-
.non_payload_text(input, ranges) ⇒ Object
-
.process_string(input, options, &block) ⇒ Object
-
.replay_payloads(payloads, options, &block) ⇒ Object
-
.substantive_text?(text) ⇒ Boolean
-
.warn(handler, type, message, line, col) ⇒ Object
-
.wrapper_meta(input, ranges) ⇒ Object
Class Method Details
.candidate_ranges(input) ⇒ Object
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
|
# File 'lib/smarter_json/parser.rb', line 846
# Scans +input+ byte by byte and collects the byte ranges of every balanced
# top-level `{...}` / `[...]` group.
#
# While scanning, brackets are ignored inside:
# * double- and single-quoted strings (a backslash skips the following byte),
# * triple-single-quoted strings ('''...''', no escape handling),
# * line comments introduced by `//` or `#` (ended by LF or CR),
# * block comments (`/* ... */`).
#
# Lookahead is done with String#getbyte rather than String#byteslice so the
# loop does not allocate a String (or Array) for every byte it visits.
#
# @param input [String] text that may contain JSON-like payloads
# @return [Array<Range>] exclusive byte ranges (start...end), in input order
def candidate_ranges(input)
  found = []
  open_stack = []
  opened_at = nil
  state = nil
  pos = 0
  limit = input.bytesize

  while pos < limit
    byte = input.getbyte(pos)

    case state
    when :double, :single
      closer = state == :double ? DQUOTE : SQUOTE
      if byte == BACKSLASH
        # Skip the escaped byte so an escaped quote cannot close the string.
        pos += 2
      else
        state = nil if byte == closer
        pos += 1
      end
    when :triple
      if byte == SQUOTE && input.getbyte(pos + 1) == SQUOTE && input.getbyte(pos + 2) == SQUOTE
        state = nil
        pos += 3
      else
        pos += 1
      end
    when :line_comment
      state = nil if byte == LF || byte == CR
      pos += 1
    when :block_comment
      if byte == STAR && input.getbyte(pos + 1) == SLASH
        state = nil
        pos += 2
      else
        pos += 1
      end
    else
      following = input.getbyte(pos + 1) # nil at end of input; never equals a byte
      if byte == SLASH && following == SLASH
        state = :line_comment
        pos += 2
      elsif byte == SLASH && following == STAR
        state = :block_comment
        pos += 2
      elsif byte == HASH
        state = :line_comment
        pos += 1
      elsif byte == DQUOTE
        state = :double
        pos += 1
      elsif byte == SQUOTE
        if following == SQUOTE && input.getbyte(pos + 2) == SQUOTE
          state = :triple
          pos += 3
        else
          state = :single
          pos += 1
        end
      else
        if byte == LBRACE || byte == LBRACKET
          # Remember where the outermost group begins.
          opened_at = pos if open_stack.empty?
          open_stack << byte
        elsif byte == RBRACE || byte == RBRACKET
          partner = byte == RBRACE ? LBRACE : LBRACKET
          # A closer that does not match the innermost opener is ignored.
          open_stack.pop if open_stack.last == partner
          if open_stack.empty? && opened_at
            found << (opened_at...(pos + 1))
            opened_at = nil
          end
        end
        pos += 1
      end
    end
  end

  found
end
|
.emit_wrapper_warnings(payloads, handler) ⇒ Object
751
752
753
754
755
756
757
758
759
|
# File 'lib/smarter_json/parser.rb', line 751
# Emits one warning per kind of wrapping recorded in the first payload's
# :meta hash (prefix text, code fence, wrapper tag, suffix text).
#
# Does nothing when +handler+ is nil, when +payloads+ is empty, or when the
# first payload carries no :meta entry.
#
# @param payloads [Array<Hash>] payload hashes; only the first one's :meta is read
# @param handler [#call, nil] receiver of the warnings
def emit_wrapper_warnings(payloads, handler)
  return unless handler

  first_payload = payloads.first
  meta = first_payload && first_payload[:meta]
  # Guard: an empty payload list (or one without :meta) has nothing to report.
  return unless meta

  warn(handler, :prefix_text_ignored, "ignored non-JSON text before the payload", *meta[:first_pos]) if meta[:prefix]
  warn(handler, :code_fence_stripped, "stripped markdown code fences around the payload", *meta[:first_pos]) if meta[:fence]
  warn(handler, :wrapper_tag_stripped, "stripped wrapper tags around the payload", *meta[:first_pos]) if meta[:wrapper]
  warn(handler, :suffix_text_ignored, "ignored non-JSON text after the payload", *meta[:last_pos]) if meta[:suffix]
end
|
.extract_payloads(input, options) ⇒ Object
761
762
763
764
765
766
767
768
769
770
771
772
773
774
|
# File 'lib/smarter_json/parser.rb', line 761
# Finds the bracketed regions of +input+ that parse on their own.
#
# Every range from candidate_ranges is sliced out and trial-parsed with
# warnings silenced (on_warning: nil); ranges that raise ParseError are
# dropped. Each surviving payload is tagged with the same wrapper_meta
# hash, computed from the surviving ranges.
#
# @param input [String] the raw text
# @param options [Hash] parser options, forwarded to the trial parse
# @return [Array<Hash>] hashes with :slice, :range and :meta keys
def extract_payloads(input, options)
  payloads = candidate_ranges(input).filter_map do |range|
    slice = input.byteslice(range.begin, range.end - range.begin)
    begin
      # Trial parse only: the result is discarded, we just need to know it succeeds.
      SmarterJSON.send(:process_content, slice, options.merge(on_warning: nil))
      { slice: slice, range: range }
    rescue ParseError
      nil
    end
  end
  meta = wrapper_meta(input, payloads.map { |p| p[:range] })
  payloads.each { |payload| payload[:meta] = meta }
  payloads
end
|
.leading_label?(input) ⇒ Boolean
Whether the input opens with a bare "JSON:" / "Final answer:" label (which would
otherwise parse, wrongly, as an implicit-root object keyed by the label). We use
String#start_with? with a Regexp rather than match?(/\A.../): start_with? checks
only the beginning, whereas a \A-anchored match? still retries at every byte
position and so scans the WHOLE input (≈0.3s on a 200 MB document) on every parse.
(Caller has already established the input is valid_encoding?.)
724
725
726
|
# File 'lib/smarter_json/parser.rb', line 724
# True when +input+ begins (after optional whitespace) with a "JSON:" or
# "Final answer:" label, matched case-insensitively.
#
# String#start_with? is used with a Regexp so that only the head of the
# string is examined.
#
# @param input [String]
# @return [Boolean]
def leading_label?(input)
  label_prefix = /[[:space:]]*(?:JSON|Final answer)[[:space:]]*:/i
  input.start_with?(label_prefix)
end
|
.line_col_for(input, offset) ⇒ Object
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
|
# File 'lib/smarter_json/parser.rb', line 808
# Converts a byte offset into a 1-based [line, column] pair.
#
# LF, CR and CR LF each count as a single line break. Every other byte
# advances the column by one, so columns are counted in bytes. Scanning
# stops early if +offset+ lies beyond the end of +input+.
#
# @param input [String]
# @param offset [Integer] byte offset into +input+
# @return [Array(Integer, Integer)] line and column
def line_col_for(input, offset)
  row = 1
  column = 1
  cursor = 0

  while cursor < offset
    byte = input.getbyte(cursor)
    break unless byte

    cursor += 1
    case byte
    when LF
      row += 1
      column = 1
    when CR
      row += 1
      column = 1
      # Swallow the LF of a CR LF pair, but never read past +offset+.
      cursor += 1 if cursor < offset && input.getbyte(cursor) == LF
    else
      column += 1
    end
  end

  [row, column]
end
|
.non_payload_text(input, ranges) ⇒ Object
797
798
799
800
801
802
803
804
805
806
|
# File 'lib/smarter_json/parser.rb', line 797
# Returns the text of +input+ that lies outside the given byte ranges,
# concatenated in order: the gap before each range, then whatever follows
# the last range.
#
# @param input [String]
# @param ranges [Array<Range>] exclusive byte ranges, walked in the given order
# @return [String] a new string holding everything not covered by +ranges+
def non_payload_text(input, ranges)
  leftover = +""
  cursor = 0

  ranges.each do |span|
    gap = span.begin - cursor
    leftover << input.byteslice(cursor, gap) if gap.positive?
    cursor = span.end
  end

  tail = input.bytesize - cursor
  leftover << input.byteslice(cursor, tail) if tail.positive?
  leftover
end
|
.process_string(input, options, &block) ⇒ Object
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
|
# File 'lib/smarter_json/parser.rb', line 672
# Parses +input+, falling back to payload extraction when a direct parse
# is not possible.
#
# Flow:
# 1. Normalize the input's default encoding.
# 2. If the encoding is reported as unscannable, re-run on a UTF-8 copy
#    (with encoding: nil) and deep-encode each resulting document back to
#    the target encoding.
# 3. Input with an invalid encoding goes straight to process_content.
# 4. Input opening with a label ("JSON:", "Final answer:") is routed through
#    extract_payloads first; if any payload is found it is replayed.
# 5. Otherwise parse directly; on ParseError (other than EncodingError) try
#    extract_payloads and replay, re-raising if nothing was recovered.
#
# @param input [String]
# @param options [Hash] parser options (:replace_char and :on_warning are read in this module)
# @yield [doc] each parsed document, when a block is given
# @return [Object] whatever process_content / replay_payloads return
# @raise [ParseError] when parsing fails and no payload can be recovered
def process_string(input, options, &block)
  input = SmarterJSON.send(:normalize_default_encoding, input, options)
  if (target_enc = SmarterJSON.send(:unscannable_encoding, input))
    target_enc = SmarterJSON.send(:concrete_unicode_encoding, input, target_enc)
    opts = options.merge(encoding: nil)
    utf8 = SmarterJSON.send(:to_utf8_copy, input)
    replace = options[:replace_char]
    if block
      return process_string(utf8, opts) { |doc| block.call(SmarterJSON.send(:deep_encode, doc, target_enc, replace)) }
    end
    return process_string(utf8, opts).map { |doc| SmarterJSON.send(:deep_encode, doc, target_enc, replace) }
  end
  return SmarterJSON.send(:process_content, input, options, &block) unless input.valid_encoding?
  if leading_label?(input)
    # A leading label would otherwise be parsed as part of the document,
    # so look for embedded payloads before attempting a direct parse.
    payloads = extract_payloads(input, options)
    return replay_payloads(payloads, options, &block) unless payloads.empty?
  end
  SmarterJSON.send(:process_content, input, options, &block)
rescue ParseError => e
  raise if e.is_a?(EncodingError)
  # NOTE(review): if process_content yields documents to the block before
  # raising, the replay below may deliver them a second time — confirm.
  payloads = extract_payloads(input, options)
  return replay_payloads(payloads, options, &block) unless payloads.empty?
  raise
end
|
.replay_payloads(payloads, options, &block) ⇒ Object
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
|
# File 'lib/smarter_json/parser.rb', line 728
# Re-parses every extracted payload with the caller's real options, after
# reporting wrapper warnings to options[:on_warning].
#
# @param payloads [Array<Hash>] payload hashes; each :slice is parsed
# @param options [Hash] parser options
# @yield [doc] each document produced by process_content, when a block is given
# @return [Integer, Array] with a block, the number of documents yielded;
#   without one, the flattened results of process_content for every payload
def replay_payloads(payloads, options, &block)
  emit_wrapper_warnings(payloads, options[:on_warning])

  unless block
    return payloads.flat_map do |entry|
      SmarterJSON.send(:process_content, entry[:slice], options)
    end
  end

  delivered = 0
  payloads.each do |entry|
    SmarterJSON.send(:process_content, entry[:slice], options) do |doc|
      block.call(doc)
      delivered += 1
    end
  end
  delivered
end
|
.substantive_text?(text) ⇒ Boolean
833
834
835
836
837
838
839
840
|
# File 'lib/smarter_json/parser.rb', line 833
# Decides whether +text+ holds anything beyond comments, a lone markdown
# fence line, or a lone wrapper tag.
#
# Block comments and whole-line `#` / `//` comments are removed first; the
# remainder is stripped and then rejected if it is empty, is just a code
# fence opener/closer, or is just <json>, </json>, BEGIN_JSON or END_JSON.
#
# @param text [String, nil]
# @return [Boolean]
def substantive_text?(text)
  return false if text.nil? || text.empty?

  residue = text.gsub(%r{/\*.*?\*/}m, "").gsub(/^\s*(?:#|\/\/).*$/, "").strip
  return false if residue.empty?
  return false if residue.match?(/\A(?:```[a-zA-Z0-9_-]*)?\z/)

  !residue.match?(/\A(?:<\/?json>|BEGIN_JSON|END_JSON)\z/i)
end
|
.warn(handler, type, message, line, col) ⇒ Object
842
843
844
|
# File 'lib/smarter_json/parser.rb', line 842
# Builds a Warning from the given fields and hands it to +handler+.
#
# @param handler [#call] receiver of the warning
# @param type [Symbol] warning identifier
# @param message [String] human-readable description
# @param line [Integer] line reported with the warning
# @param col [Integer] column reported with the warning
# @return [Object] whatever handler.call returns
def warn(handler, type, message, line, col)
  notice = Warning.new(type, message, line, col)
  handler.call(notice)
end
|
.wrapper_meta(input, ranges) ⇒ Object
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
|
# File 'lib/smarter_json/parser.rb', line 776
# Summarises what surrounds the payload ranges inside +input+.
#
# With no ranges, every flag is false and no positions are included.
# Otherwise the result reports whether the text before the first range and
# after the last range is substantive, whether the non-payload text
# contains a markdown fence or a <json / BEGIN_JSON wrapper marker, and the
# [line, col] at which the first and the last range begin.
#
# @param input [String]
# @param ranges [Array<Range>] exclusive byte ranges of the payloads
# @return [Hash] :prefix, :suffix, :fence, :wrapper and, when ranges are
#   present, :first_pos and :last_pos
def wrapper_meta(input, ranges)
  if ranges.empty?
    return { prefix: false, suffix: false, fence: false, wrapper: false }
  end

  head = ranges.first
  tail = ranges.last
  leading = input.byteslice(0, head.begin)
  trailing = input.byteslice(tail.end, input.bytesize - tail.end)
  remainder = non_payload_text(input, ranges)

  meta = {}
  meta[:prefix] = substantive_text?(leading)
  meta[:suffix] = substantive_text?(trailing)
  meta[:fence] = remainder.include?("```")
  meta[:wrapper] = remainder.match?(/<json\b|BEGIN_JSON\b/i)
  meta[:first_pos] = line_col_for(input, head.begin)
  meta[:last_pos] = line_col_for(input, tail.begin)
  meta
end
|