// Lexer-only grammar that recognises a small XML subset as a single DOCUMENT
// token. Every other rule is a fragment, so DOCUMENT is the only token the
// lexer emits. While matching, the embedded actions write a line-oriented
// trace ("Start Tag: ...", "Attr: ...", ...) into self.outbuf.
lexer grammar t012lexerXMLLexer;
options {
    language = Python;
}

// cStringIO is the Python 2 in-memory buffer used to collect the trace.
@header {
from cStringIO import StringIO
}

// One trace buffer per lexer instance.
@lexer::init {
self.outbuf = StringIO()
}

// output(line): append one trace line, UTF-8 encoded and newline-terminated,
// to self.outbuf.
@lexer::members {
def output(self, line):
    self.outbuf.write(line.encode('utf-8') + "\n")
}

// Whole document: optional XML declaration, optional DOCTYPE, one root element.
DOCUMENT
    :   XMLDECL? WS? DOCTYPE? WS? ELEMENT WS?
    ;

// Document type declaration. Reports the root element name, the external
// identifier literals (if any) and the raw internal subset (if any).
// Whitespace after the name and after the internal subset is optional, as in
// XML 1.0 production [28]:
//   '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>'
// so that declarations such as <!DOCTYPE greeting> are accepted.
fragment DOCTYPE
    :   '<!DOCTYPE' WS rootElementName=GENERIC_ID
        {self.output("ROOTELEMENT: "+$rootElementName.text)}
        WS?
        (
            (   'SYSTEM' WS sys1=VALUE
                {self.output("SYSTEM: "+$sys1.text)}

            |   'PUBLIC' WS pub=VALUE WS sys2=VALUE
                {self.output("PUBLIC: "+$pub.text)}
                {self.output("SYSTEM: "+$sys2.text)}
            )
            WS?
        )?
        (   dtd=INTERNAL_DTD
            {self.output("INTERNAL DTD: "+$dtd.text)}
            WS?
        )?
        '>'
    ;

// Internal DTD subset: everything from '[' up to the first ']' (non-greedy).
fragment INTERNAL_DTD
    :   '[' (options {greedy=false;} : .)* ']'
    ;

// Processing instruction; its content is parsed as a list of attributes.
fragment PI
    :   '<?' target=GENERIC_ID WS?
        {self.output("PI: "+$target.text)}
        ( ATTRIBUTE WS? )* '?>'
    ;

// XML declaration; "xml" is matched case-insensitively.
fragment XMLDECL
    :   '<?' ('x'|'X') ('m'|'M') ('l'|'L') WS?
        {self.output("XML declaration")}
        ( ATTRIBUTE WS? )* '?>'
    ;

// An element: either a start tag, mixed content and an end tag, or an empty
// element tag. Tag names of start and end tag are not compared here.
fragment ELEMENT
    :   (   START_TAG
            (   ELEMENT
            |   t=PCDATA
                {self.output("PCDATA: \""+$t.text+"\"")}
            |   t=CDATA
                {self.output("CDATA: \""+$t.text+"\"")}
            |   t=COMMENT
                {self.output("Comment: \""+$t.text+"\"")}
            |   PI
            )*
            END_TAG
        |   EMPTY_ELEMENT
        )
    ;

fragment START_TAG
    :   '<' WS? name=GENERIC_ID WS?
        {self.output("Start Tag: "+$name.text)}
        ( ATTRIBUTE WS? )* '>'
    ;

fragment EMPTY_ELEMENT
    :   '<' WS? name=GENERIC_ID WS?
        {self.output("Empty Element: "+$name.text)}
        ( ATTRIBUTE WS? )* '/>'
    ;

// name = "value" or name = 'value'; the reported value keeps its quotes
// because VALUE matches them as part of its text.
fragment ATTRIBUTE
    :   name=GENERIC_ID WS? '=' WS? value=VALUE
        {self.output("Attr: "+$name.text+"="+$value.text)}
    ;

fragment END_TAG
    :   '</' WS? name=GENERIC_ID WS? '>'
        {self.output("End Tag: "+$name.text)}
    ;

// Comment: everything up to the first '-->' (non-greedy).
fragment COMMENT
    :   '<!--' (options {greedy=false;} : .)* '-->'
    ;

// CDATA section: everything up to the first ']]>' (non-greedy).
fragment CDATA
    :   '<![CDATA[' (options {greedy=false;} : .)* ']]>'
    ;

// Character data: any non-empty run of characters other than '<'.
fragment PCDATA
    :   (~'<')+
    ;

// Quoted literal, double- or single-quoted, including the quotes.
fragment VALUE
    :   (   '\"' (~'\"')* '\"'
        |   '\'' (~'\'')* '\''
        )
    ;

// Element, attribute and PI target names (ASCII letters only).
fragment GENERIC_ID
    :   ( LETTER | '_' | ':' )
        ( options {greedy=true;} : LETTER | '0'..'9' | '.' | '-' | '_' | ':' )*
    ;

fragment LETTER
    :   'a'..'z'
    |   'A'..'Z'
    ;

// One or more blanks, tabs or line terminators.
fragment WS
    :   (   ' '
        |   '\t'
        |   (   '\n'
            |   '\r\n'
            |   '\r'
            )
        )+
    ;