Class: REXML::Parsers::BaseParser

Inherits:
Object
  • Object
show all
Defined in:
lib/compat/opal/rexml/parsers/baseparser.rb

Overview

Using the Pull Parser

This API is experimental, and subject to change.

parser = PullParser.new( "<a>text<b att='val'/>txet</a>" )
while parser.has_next?
  res = parser.next
  puts res[1]['att'] if res.start_tag? and res[0] == 'b'
end

See the PullEvent class for information on the content of the results. The data is identical to the arguments passed for the various events to the StreamListener API.

Notice that parse errors are delivered as events, so check for them while iterating:

parser = PullParser.new( "<a>BAD DOCUMENT" )
while parser.has_next?
  res = parser.next
  raise res[1] if res.error?
end

Nat Price gave me some good ideas for the API.

Constant Summary collapse

LETTER =
"A-Za-z"
DIGIT =
"0-9"
COMBININGCHAR =

TODO

""
EXTENDER =

TODO

""
NCNAME_STR =
"[#{LETTER}_][-A-Za-z0-9._#{COMBININGCHAR}#{EXTENDER}]*".freeze
QNAME_STR =
"(?:(#{NCNAME_STR}):)?(#{NCNAME_STR})".freeze
QNAME =
/(#{QNAME_STR})/
UNAME_STR =

Just for backward compatibility. For example, kramdown uses this. It’s not used in REXML.

"(?:#{NCNAME_STR}:)?#{NCNAME_STR}".freeze
NAMECHAR =
'[\-\w\.:]'
NAME =
"([\\w:]#{NAMECHAR}*)".freeze
NMTOKEN =
"(?:#{NAMECHAR})+".freeze
NMTOKENS =
"#{NMTOKEN}(\\s+#{NMTOKEN})*".freeze
REFERENCE =
"&(?:#{NAME};|#\\d+;|#x[0-9a-fA-F]+;)".freeze
REFERENCE_RE =
/#{REFERENCE}/
DOCTYPE_START =
/^\s*<!DOCTYPE\s/um
DOCTYPE_END =
/^\s*\]\s*>/um
ATTRIBUTE_PATTERN =
/\s*(#{QNAME_STR})\s*=\s*(["'])(.*?)\4/um
COMMENT_START =
/^<!--/u
COMMENT_PATTERN =
/<!--(.*?)-->/um
CDATA_START =
/^<!\[CDATA\[/u
CDATA_END =
/^\s*\]\s*>/um
CDATA_PATTERN =
/<!\[CDATA\[(.*?)\]\]>/um
XMLDECL_START =
/^<\?xml\s/u
XMLDECL_PATTERN =
/<\?xml\s+(.*?)\?>/um
INSTRUCTION_START =
/^<\?/u
INSTRUCTION_PATTERN =
/<\?#{NAME}(\s+.*?)?\?>/um
TAG_MATCH =
/^<((?:#{QNAME_STR}))/um
CLOSE_MATCH =
/^\s*<\/(#{QNAME_STR})\s*>/um
VERSION =
/\bversion\s*=\s*["'](.*?)['"]/um
ENCODING =
/\bencoding\s*=\s*["'](.*?)['"]/um
STANDALONE =
/\bstandalone\s*=\s*["'](.*?)['"]/um
ENTITY_START =
/^\s*<!ENTITY/
ELEMENTDECL_START =
/^\s*<!ELEMENT/um
ELEMENTDECL_PATTERN =
/^\s*(<!ELEMENT.*?)>/um
SYSTEMENTITY =
/^\s*(%.*?;)\s*$/um
ENUMERATION =
"\\(\\s*#{NMTOKEN}(?:\\s*\\|\\s*#{NMTOKEN})*\\s*\\)".freeze
NOTATIONTYPE =
"NOTATION\\s+\\(\\s*#{NAME}(?:\\s*\\|\\s*#{NAME})*\\s*\\)".freeze
ENUMERATEDTYPE =
"(?:(?:#{NOTATIONTYPE})|(?:#{ENUMERATION}))".freeze
ATTTYPE =
"(CDATA|ID|IDREF|IDREFS|ENTITY|ENTITIES|NMTOKEN|NMTOKENS|#{ENUMERATEDTYPE})".freeze
ATTVALUE =
"(?:\"((?:[^<&\"]|#{REFERENCE})*)\")|(?:'((?:[^<&']|#{REFERENCE})*)')".freeze
DEFAULTDECL =
"(#REQUIRED|#IMPLIED|(?:(#FIXED\\s+)?#{ATTVALUE}))".freeze
ATTDEF =
"\\s+#{NAME}\\s+#{ATTTYPE}\\s+#{DEFAULTDECL}".freeze
ATTDEF_RE =
/#{ATTDEF}/
ATTLISTDECL_START =
/^\s*<!ATTLIST/um
ATTLISTDECL_PATTERN =
/^\s*<!ATTLIST\s+#{NAME}(?:#{ATTDEF})*\s*>/um
TEXT_PATTERN =
/^([^<]*)/um
PUBIDCHAR =

Entity constants

"\x20\x0D\x0Aa-zA-Z0-9\\-()+,./:=?;!*@$_%#"
SYSTEMLITERAL =
%{((?:"[^"]*")|(?:'[^']*'))}
PUBIDLITERAL =
%{("[#{PUBIDCHAR}']*"|'[#{PUBIDCHAR}]*')}.freeze
EXTERNALID =
"(?:(?:(SYSTEM)\\s+#{SYSTEMLITERAL})|(?:(PUBLIC)\\s+#{PUBIDLITERAL}\\s+#{SYSTEMLITERAL}))".freeze
NDATADECL =
"\\s+NDATA\\s+#{NAME}".freeze
PEREFERENCE =
"%#{NAME};".freeze
ENTITYVALUE =
%{((?:"(?:[^%&"]|#{PEREFERENCE}|#{REFERENCE})*")|(?:'([^%&']|#{PEREFERENCE}|#{REFERENCE})*'))}.freeze
PEDEF =
"(?:#{ENTITYVALUE}|#{EXTERNALID})".freeze
ENTITYDEF =
"(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))".freeze
PEDECL =
"<!ENTITY\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>".freeze
GEDECL =
"<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>".freeze
ENTITYDECL =
/\s*(?:#{GEDECL})|\s*(?:#{PEDECL})/um
NOTATIONDECL_START =
/^\s*<!NOTATION/um
EXTERNAL_ID_PUBLIC =
/^\s*PUBLIC\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}\s*/um
EXTERNAL_ID_SYSTEM =
/^\s*SYSTEM\s+#{SYSTEMLITERAL}\s*/um
PUBLIC_ID =
/^\s*PUBLIC\s+#{PUBIDLITERAL}\s*/um
EREFERENCE =
/&(?!#{NAME};)/
DEFAULT_ENTITIES =
{
  "gt" => [/&gt;/, "&gt;", ">", />/],
  "lt" => [/&lt;/, "&lt;", "<", /</],
  "quot" => [/&quot;/, "&quot;", '"', /"/],
  "apos" => [/&apos;/, "&apos;", "'", /'/],
}.freeze

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(source) ⇒ BaseParser

Returns a new instance of BaseParser.



166
167
168
169
170
171
172
173
174
175
# File 'lib/compat/opal/rexml/parsers/baseparser.rb', line 166

# Creates a parser that reads from +source+.
#
# source:: anything accepted by the #stream= writer, which stores the
#          resulting object in @source.
def initialize(source)
  # Install the source first; the buffer call below needs @source.
  self.stream = source
  @source.ensure_buffer

  # Listener list and prefix bookkeeping start out empty.
  @listeners = []
  @prefixes = Set.new

  # Entity-expansion accounting; both limits are read from Security.
  @entity_expansion_count = 0
  @entity_expansion_limit = Security.entity_expansion_limit
  @entity_expansion_text_limit = Security.entity_expansion_text_limit

  @version = nil
end

Instance Attribute Details

#entity_expansion_count ⇒ Object (readonly)

Returns the value of attribute entity_expansion_count.



181
182
183
# File 'lib/compat/opal/rexml/parsers/baseparser.rb', line 181

# Reader for @entity_expansion_count.
def entity_expansion_count
  @entity_expansion_count
end

#entity_expansion_limit=(value) ⇒ Object (writeonly)

Sets the attribute entity_expansion_limit

Parameters:

  • value

    the value to set the attribute entity_expansion_limit to.



182
183
184
# File 'lib/compat/opal/rexml/parsers/baseparser.rb', line 182

# Writer for @entity_expansion_limit.
#
# value:: the new limit; it is stored as given, without validation.
def entity_expansion_limit=(value)
  @entity_expansion_limit = value
end

#entity_expansion_text_limit=(value) ⇒ Object (writeonly)

Sets the attribute entity_expansion_text_limit

Parameters:

  • value

    the value to set the attribute entity_expansion_text_limit to.



182
183
184
# File 'lib/compat/opal/rexml/parsers/baseparser.rb', line 182

# Writer for @entity_expansion_text_limit.
#
# value:: the new limit; it is stored as given, without validation.
def entity_expansion_text_limit=(value)
  @entity_expansion_text_limit = value
end

#source ⇒ Object (readonly)

Returns the value of attribute source.



181
182
183
# File 'lib/compat/opal/rexml/parsers/baseparser.rb', line 181

# Reader for @source, the object this parser reads its input from.
def source
  @source
end

Instance Method Details

#add_listener(listener) ⇒ Object



177
178
179
# File 'lib/compat/opal/rexml/parsers/baseparser.rb', line 177

# Registers +listener+ by appending it to @listeners.
#
# Returns the @listeners array.
def add_listener(listener)
  @listeners.push(listener)
end

#empty? ⇒ Boolean

Returns true if there are no more events

Returns:

  • (Boolean)


210
211
212
# File 'lib/compat/opal/rexml/parsers/baseparser.rb', line 210

# True only when both the source and the pending-event stack are empty.
def empty?
  @source.empty? && @stack.empty?
end

#entity(reference, entities) ⇒ Object



573
574
575
576
577
578
579
580
581
# File 'lib/compat/opal/rexml/parsers/baseparser.rb', line 573

# Looks up +reference+ in +entities+ and returns its unnormalized value.
#
# reference:: the key to look up.
# entities::  an object indexed with [] by +reference+; may be nil/false.
#
# Returns nil when +entities+ is falsy or holds no value for +reference+.
# Otherwise one expansion is recorded and the value is run through
# #unnormalize with the same +entities+.
def entity(reference, entities)
  replacement = entities[reference] if entities
  return if replacement.nil?

  record_entity_expansion
  unnormalize(replacement, entities)
end

#has_next? ⇒ Boolean

Returns true if there are more events. Synonymous with !empty?

Returns:

  • (Boolean)


215
216
217
# File 'lib/compat/opal/rexml/parsers/baseparser.rb', line 215

# True while the source or the pending-event stack still holds something.
def has_next?
  !@source.empty? || !@stack.empty?
end

#normalize(input, entities = nil, entity_filter = nil) ⇒ Object

Escapes all possible entities



584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
# File 'lib/compat/opal/rexml/parsers/baseparser.rb', line 584

# Escapes +input+ so that it can be embedded in XML text.
#
# input::         the String to escape; it is left untouched (frozen strings
#                 are accepted).
# entities::      optional name => value mapping; every occurrence of a value
#                 is replaced by "&name;".
# entity_filter:: optional collection of entity names that must NOT be
#                 substituted.
#
# Returns a new String.
def normalize(input, entities = nil, entity_filter = nil)
  # dup rather than clone: clone keeps the frozen flag, which would make the
  # in-place substitutions below raise on a frozen input.
  copy = input.dup
  # Doing it like this rather than in a loop improves the speed
  copy.gsub!(EREFERENCE, "&amp;")
  if entities
    entities.each do |key, value|
      # Filter on the entity name of this pair.
      next if entity_filter && entity_filter.include?(key)

      copy.gsub!(value, "&#{key};")
    end
  end
  # Values substituted above may have introduced new bare ampersands.
  copy.gsub!(EREFERENCE, "&amp;")
  DEFAULT_ENTITIES.each_value do |value|
    # value[3] matches the raw character, value[1] is its escaped form.
    copy.gsub!(value[3], value[1])
  end
  copy
end

#peek(depth = 0) ⇒ Object

Peek at the event at position depth in the stack. The first element on the stack is at depth 0. If depth is -1, the parser reads to the end of the input stream and returns the last event, which is always :end_document. Be aware that this causes the stream to be parsed up to the depth event, so you can effectively pre-parse the entire document (pull the entire thing into memory) using this method.



231
232
233
234
235
236
237
238
239
240
241
242
243
244
# File 'lib/compat/opal/rexml/parsers/baseparser.rb', line 231

# Returns the event at index +depth+ of @stack, pulling more events first
# when the stack does not reach that far yet.
#
# depth:: 0-based index into the stack; -1 pulls until #empty? is true and
#         yields the last stacked event.
#
# Raises RuntimeError for any depth below -1.
def peek(depth = 0)
  raise %[Illegal argument "#{depth}"] if depth < -1

  pulled = []
  if depth == -1
    pulled << pull until empty?
  else
    needed = depth + 1
    pulled << pull while @stack.size + pulled.size < needed
  end
  # Only rebuild the stack when something was actually pulled.
  @stack += pulled unless pulled.empty?
  @stack[depth]
end

#position ⇒ Object



200
201
202
203
204
205
206
207
# File 'lib/compat/opal/rexml/parsers/baseparser.rb', line 200

# Returns @source.position when the source responds to it, otherwise 0.
def position
  return @source.position if @source.respond_to?(:position)

  # FIXME
  0
end

#pull ⇒ Object

Returns the next event. This is a PullEvent object.



247
248
249
250
251
252
253
254
255
# File 'lib/compat/opal/rexml/parsers/baseparser.rb', line 247

# Returns the next event from #pull_event.
#
# Before the event is produced the source is told to drop its parsed
# content; afterwards every registered listener receives the event.
def pull
  @source.drop_parsed_content

  event = pull_event
  @listeners.each { |listener| listener.receive(event) }
  event
end

#reset ⇒ Object



189
190
191
192
193
194
195
196
197
198
# File 'lib/compat/opal/rexml/parsers/baseparser.rb', line 189

# Puts the parser's per-document state back to its initial values.
def reset
  # Status flags.
  @closed = @document_status = nil
  @have_root = false

  # Fresh, independent containers.
  @tags, @stack, @entities = [], [], []

  # Only the "xml" prefix is bound at the start.
  @namespaces = { "xml" => Private::XML_PREFIXED_NAMESPACE }
  @namespaces_restore_stack = []
end

#stream=(source) ⇒ Object



184
185
186
187
# File 'lib/compat/opal/rexml/parsers/baseparser.rb', line 184

# Replaces the input this parser reads from.
#
# source:: handed to SourceFactory.create_from; the result becomes @source.
#
# The parser state is reset after the new source has been installed.
def stream=(source)
  @source = SourceFactory.create_from(source)
  reset
end

#unnormalize(string, entities = nil, filter = nil) ⇒ Object

Unescapes all possible entities



603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
# File 'lib/compat/opal/rexml/parsers/baseparser.rb', line 603

# Unescapes the character and entity references found in +string+.
#
# string::   the text to unescape; a copy is modified, never the argument.
# entities:: passed to #entity to look up the value of each named reference.
# filter::   optional collection of entity names to leave unexpanded.
#
# Returns the unescaped copy.
#
# Raises RuntimeError ("entity expansion has grown too large") when the
# result grows beyond @entity_expansion_text_limit bytes.
def unnormalize(string, entities = nil, filter = nil)
  # Work on a copy; the gsub branch is only taken when a "\r" is present.
  # NOTE(review): presumably the pattern folds CR/CRLF into "\n" -- confirm
  # against Private::CARRIAGE_RETURN_NEWLINE_PATTERN.
  rv = if string.include?("\r")
         string.gsub(Private::CARRIAGE_RETURN_NEWLINE_PATTERN, "\n")
       else
         string.dup
       end
  # REFERENCE_RE has a single capture group (the entity name), so each match
  # is a one-element array holding the name, or nil for a numeric reference.
  matches = rv.scan(REFERENCE_RE)
  return rv if matches.empty?

  # Replace character references with the character for their code point;
  # a leading "x" in the captured text marks a hexadecimal value.
  rv.gsub!(Private::CHARACTER_REFERENCES) do
    m = $1
    code_point = if m.start_with?("x")
                   Integer(m[1..], 16)
                 else
                   Integer(m, 10)
                 end
    [code_point].pack("U*")
  end
  # Keep only the entity names; the nils left by numeric references go away.
  matches.collect! { |x| x[0] }.compact!
  if filter
    matches.reject! do |entity_reference|
      filter.include?(entity_reference)
    end
  end
  if matches.size.positive?
    # Handle each distinct name once; n is how often it occurred.
    matches.tally.each do |entity_reference, n|
      entity_expansion_count_before = @entity_expansion_count
      entity_value = entity(entity_reference, entities)
      if entity_value
        if n > 1
          # The lookup above was counted for a single occurrence; record the
          # same number of expansions for each of the other n - 1.
          entity_expansion_count_delta =
            @entity_expansion_count - entity_expansion_count_before
          record_entity_expansion(entity_expansion_count_delta * (n - 1))
        end
        re = Private::DEFAULT_ENTITIES_PATTERNS[entity_reference] || /&#{entity_reference};/
        rv.gsub!(re, entity_value)
        if rv.bytesize > @entity_expansion_text_limit
          raise "entity expansion has grown too large"
        end
      else
        # No value from #entity: fall back to the predefined entities, where
        # er[0] is the reference pattern and er[2] the literal character.
        er = DEFAULT_ENTITIES[entity_reference]
        rv.gsub!(er[0], er[2]) if er
      end
    end
    # The "amp" pattern is replaced with "&" only after every other entity.
    # NOTE(review): presumably so the new "&" cannot start another
    # reference -- confirm.
    rv.gsub!(Private::DEFAULT_ENTITIES_PATTERNS["amp"], "&")
  end
  rv
end

#unshift(token) ⇒ Object

Push an event back on the head of the stream. This method has (theoretically) infinite depth.



221
222
223
# File 'lib/compat/opal/rexml/parsers/baseparser.rb', line 221

# Puts +token+ back at the front of @stack.
#
# Returns the @stack array.
def unshift(token)
  @stack.insert(0, token)
end