Using the Pull Parser
This API is experimental, and subject to change.
# Walk the document one event at a time and print the value stored under
# 'att' for every start tag whose name is 'b'.
parser = PullParser.new("<a>text<b att='val'/>txet</a>")
while parser.has_next?
  event = parser.next
  # Only start tags named 'b' are of interest; skip every other event.
  next unless event.start_tag? && event[0] == 'b'

  # event[1] is indexed by attribute name here (presumably the attribute
  # hash of the start tag -- see the PullEvent class to confirm).
  puts event[1]['att']
end
See the PullEvent class for information on the content of the results. The data is identical to the arguments passed for the various events to the StreamListener API.
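Notice that a problem in the document is delivered as an event rather than raised by the parser, so check each event with error? and raise it yourself if you want an exception: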
# Parse a malformed document and turn the first error event into an exception.
parser = PullParser.new("<a>BAD DOCUMENT")
while parser.has_next?
  event = parser.next
  # For an error event, slot 1 holds the object handed to raise.
  raise event[1] if event.error?
end
Nat Price gave me some good ideas for the API.
- A
- E
- H
- N
- P
- S
- U
LETTER | = | '[:alpha:]' |
DIGIT | = | '[:digit:]' |
COMBININGCHAR | = | '' |
EXTENDER | = | '' |
NCNAME_STR | = | "[#{LETTER}_:][-[:alnum:]._:#{COMBININGCHAR}#{EXTENDER}]*" |
NAME_STR | = | "(?:(#{NCNAME_STR}):)?(#{NCNAME_STR})" |
UNAME_STR | = | "(?:#{NCNAME_STR}:)?#{NCNAME_STR}" |
NAMECHAR | = | '[\-\w\.:]' |
NAME | = | "([\\w:]#{NAMECHAR}*)" |
NMTOKEN | = | "(?:#{NAMECHAR})+" |
NMTOKENS | = | "#{NMTOKEN}(\\s+#{NMTOKEN})*" |
REFERENCE | = | "&(?:#{NAME};|#\\d+;|#x[0-9a-fA-F]+;)" |
REFERENCE_RE | = | /#{REFERENCE}/ |
DOCTYPE_START | = | /\A\s*<!DOCTYPE\s/um |
DOCTYPE_PATTERN | = | /\s*<!DOCTYPE\s+(.*?)(\[|>)/um |
ATTRIBUTE_PATTERN | = | /\s*(#{NAME_STR})\s*=\s*(["'])(.*?)\4/um |
COMMENT_START | = | /\A<!--/u |
COMMENT_PATTERN | = | /<!--(.*?)-->/um |
CDATA_START | = | /\A<!\[CDATA\[/u |
CDATA_END | = | /^\s*\]\s*>/um |
CDATA_PATTERN | = | /<!\[CDATA\[(.*?)\]\]>/um |
XMLDECL_START | = | /\A<\?xml\s/u; |
XMLDECL_PATTERN | = | /<\?xml\s+(.*?)\?>/um |
INSTRUCTION_START | = | /\A<\?/u |
INSTRUCTION_PATTERN | = | /<\?(.*?)(\s+.*?)?\?>/um |
TAG_MATCH | = | /^<((?>#{NAME_STR}))\s*((?>\s+#{UNAME_STR}\s*=\s*(["']).*?\5)*)\s*(\/)?>/um |
CLOSE_MATCH | = | /^\s*<\/(#{NAME_STR})\s*>/um |
VERSION | = | /\bversion\s*=\s*["'](.*?)['"]/um |
ENCODING | = | /\bencoding\s*=\s*["'](.*?)['"]/um |
STANDALONE | = | /\bstandalone\s*=\s*["'](.*?)['"]/um |
ENTITY_START | = | /^\s*<!ENTITY/ |
IDENTITY | = | /^([!\*\w\-]+)(\s+#{NCNAME_STR})?(\s+["'](.*?)['"])?(\s+['"](.*?)["'])?/u |
ELEMENTDECL_START | = | /^\s*<!ELEMENT/um |
ELEMENTDECL_PATTERN | = | /^\s*(<!ELEMENT.*?)>/um |
SYSTEMENTITY | = | /^\s*(%.*?;)\s*$/um |
ENUMERATION | = | "\\(\\s*#{NMTOKEN}(?:\\s*\\|\\s*#{NMTOKEN})*\\s*\\)" |
NOTATIONTYPE | = | "NOTATION\\s+\\(\\s*#{NAME}(?:\\s*\\|\\s*#{NAME})*\\s*\\)" |
ENUMERATEDTYPE | = | "(?:(?:#{NOTATIONTYPE})|(?:#{ENUMERATION}))" |
ATTTYPE | = | "(CDATA|ID|IDREF|IDREFS|ENTITY|ENTITIES|NMTOKEN|NMTOKENS|#{ENUMERATEDTYPE})" |
ATTVALUE | = | "(?:\"((?:[^<&\"]|#{REFERENCE})*)\")|(?:'((?:[^<&']|#{REFERENCE})*)')" |
DEFAULTDECL | = | "(#REQUIRED|#IMPLIED|(?:(#FIXED\\s+)?#{ATTVALUE}))" |
ATTDEF | = | "\\s+#{NAME}\\s+#{ATTTYPE}\\s+#{DEFAULTDECL}" |
ATTDEF_RE | = | /#{ATTDEF}/ |
ATTLISTDECL_START | = | /^\s*<!ATTLIST/um |
ATTLISTDECL_PATTERN | = | /^\s*<!ATTLIST\s+#{NAME}(?:#{ATTDEF})*\s*>/um |
NOTATIONDECL_START | = | /^\s*<!NOTATION/um |
PUBLIC | = | /^\s*<!NOTATION\s+(\w[\-\w]*)\s+(PUBLIC)\s+(["'])(.*?)\3(?:\s+(["'])(.*?)\5)?\s*>/um |
SYSTEM | = | /^\s*<!NOTATION\s+(\w[\-\w]*)\s+(SYSTEM)\s+(["'])(.*?)\3\s*>/um |
TEXT_PATTERN | = | /\A([^<]*)/um |
PUBIDCHAR | = | "\x20\x0D\x0Aa-zA-Z0-9\\-()+,./:=?;!*@$_%#" |
Entity constants |
||
SYSTEMLITERAL | = | %Q{((?:"[^"]*")|(?:'[^']*'))} |
PUBIDLITERAL | = | %Q{("[#{PUBIDCHAR}']*"|'[#{PUBIDCHAR}]*')} |
EXTERNALID | = | "(?:(?:(SYSTEM)\\s+#{SYSTEMLITERAL})|(?:(PUBLIC)\\s+#{PUBIDLITERAL}\\s+#{SYSTEMLITERAL}))" |
NDATADECL | = | "\\s+NDATA\\s+#{NAME}" |
PEREFERENCE | = | "%#{NAME};" |
ENTITYVALUE | = | %Q{((?:"(?:[^%&"]|#{PEREFERENCE}|#{REFERENCE})*")|(?:'([^%&']|#{PEREFERENCE}|#{REFERENCE})*'))} |
PEDEF | = | "(?:#{ENTITYVALUE}|#{EXTERNALID})" |
ENTITYDEF | = | "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))" |
PEDECL | = | "<!ENTITY\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>" |
GEDECL | = | "<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>" |
ENTITYDECL | = | /\s*(?:#{GEDECL})|(?:#{PEDECL})/um |
EREFERENCE | = | /&(?!#{NAME};)/ |
DEFAULT_ENTITIES | = | { 'gt' => [/&gt;/, '&gt;', '>', />/], 'lt' => [/&lt;/, '&lt;', '<', /</], 'quot' => [/&quot;/, '&quot;', '"', /"/], "apos" => [/&apos;/, "&apos;", "'", /'/] } |
MISSING_ATTRIBUTE_QUOTES | = | /^<#{NAME_STR}\s+#{NAME_STR}\s*=\s*[^"']/um |
These are patterns to identify common markup errors, to make the error messages more informative. |
[R] | source |
empty?: Returns true if there are no more events.
has_next?: Returns true if there are more events. Synonymous with !empty?
Escapes all possible entities
# File lib/rexml/parsers/baseparser.rb, line 457
#
# Escapes +input+ for output: bare ampersands, the values of the supplied
# +entities+ and the characters covered by DEFAULT_ENTITIES are replaced by
# entity references. Returns a new string; +input+ is never modified.
#
# input::         the text to escape.
# entities::      optional collection that yields (name, value) pairs from
#                 +each+; every occurrence of +value+ becomes "&name;".
# entity_filter:: optional collection of entity names to leave alone (only
#                 +include?+ is called on it).
def normalize(input, entities = nil, entity_filter = nil)
  # dup, not clone: clone preserves the frozen flag, which would make the
  # in-place gsub! calls below raise on a frozen argument.
  copy = input.dup
  # Doing it like this rather than in a loop improves the speed
  # Escape every "&" that does not already begin an entity reference.
  copy.gsub!(EREFERENCE, '&amp;')
  if entities
    entities.each do |key, value|
      # The filter is matched against the entity name of the current pair.
      next if entity_filter && entity_filter.include?(key)

      copy.gsub!(value, "&#{key};")
    end
  end
  copy.gsub!(EREFERENCE, '&amp;')
  # For each default entity, slot 3 is the pattern to look for and slot 1 is
  # the text written in its place.
  DEFAULT_ENTITIES.each_value do |forms|
    copy.gsub!(forms[3], forms[1])
  end
  copy
end
Peek at the `depth` event in the stack. The first element on the stack is at depth 0. If `depth` is -1, will parse to the end of the input stream and return the last event, which is always :end_document. Be aware that this causes the stream to be parsed up to the `depth` event, so you can effectively pre-parse the entire document (pull the entire thing into memory) using this method.
# File lib/rexml/parsers/baseparser.rb, line 167
#
# Returns the event at position +depth+ of the look-ahead stack without
# consuming it, pulling as many further events as needed to reach it.
#
# depth:: 0-based index into the stack (default 0). -1 pulls events until
#         +empty?+ reports true and returns the last buffered event.
#
# Raises RuntimeError when +depth+ is less than -1.
def peek(depth = 0)
  raise %Q[Illegal argument "#{depth}"] if depth < -1

  pulled = []
  if depth == -1
    # Drain the remaining input completely.
    pulled.push(pull) until empty?
  else
    # Pull just enough events so that index +depth+ exists.
    pulled.push(pull) while @stack.size + pulled.size < depth + 1
  end
  # Everything pulled is kept on the stack so later reads still see it.
  @stack += pulled unless pulled.empty?
  @stack[depth]
end
Returns the next event. This is a PullEvent object.
Unescapes all possible entities
# File lib/rexml/parsers/baseparser.rb, line 473
#
# Unescapes +string+: line endings are normalised to "\n", numeric
# character references are replaced by the character they denote, and named
# entity references are expanded. Returns a new string; +string+ is never
# modified.
#
# string::   the text to unescape.
# entities:: passed through to the +entity+ helper (defined elsewhere) to
#            look up the replacement text for a named reference.
# filter::   optional collection of entity names that must be left
#            unexpanded (only +include?+ is called on it).
def unnormalize(string, entities = nil, filter = nil)
  # dup, not clone: clone preserves the frozen flag, which would make the
  # in-place gsub! calls below raise on a frozen argument.
  rv = string.dup
  rv.gsub!(/\r\n?/, "\n")
  matches = rv.scan(REFERENCE_RE)
  return rv if matches.empty?

  # Numeric character references: decimal (&#65;) or hexadecimal (&#x41;).
  # Leading zeros are skipped by the pattern so Integer() never sees an
  # octal-looking literal.
  rv.gsub!(/&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/) do
    digits = Regexp.last_match(1)
    digits = "0#{digits}" if digits.start_with?('x')
    [Integer(digits)].pack('U*')
  end

  # scan returns one capture array per match; the first capture is the
  # entity name and is nil for numeric references.
  names = matches.map { |captures| captures[0] }.compact
  unless names.empty?
    names.each do |entity_reference|
      next if filter && filter.include?(entity_reference)

      entity_value = entity(entity_reference, entities)
      if entity_value
        # Literal pattern and block replacement: neither the entity name nor
        # its value is interpreted as regex / backreference syntax.
        rv.gsub!("&#{entity_reference};") { entity_value }
      else
        er = DEFAULT_ENTITIES[entity_reference]
        rv.gsub!(er[0], er[2]) if er
      end
    end
    # Ampersands go last so freshly produced "&" cannot form new references.
    rv.gsub!(/&amp;/, '&')
  end
  rv
end