# $Id: N3Parser.py,v 1.17 2004/03/26 21:26:23 graham Exp $ # # GK version of N3 parser -- work in progress # # This parser follows a standard recursive-descent pattern, with a separate # tokenizer. The goal is that it will be easy to adapt to experiment with # new ideas for N3. # # TODO: # - add support for RDF containers (specifically rdf:li). # - recognize alternative directives # - add support for reading from web URI. # # THANKS TO: # - Sean Palmer for noticing some serious bugs in the initial release, and # pointing me to some useful test data. # #--------+---------+---------+---------+---------+---------+---------+---------+ # # Copyright (c) 2002, G. KLYNE # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # 3. The name of the author may not be used to endorse or promote products # derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
# IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#--------+---------+---------+---------+---------+---------+---------+---------+
# $Source: /file/cvsdev/PythonN3/N3Parser.py,v $
# $Author: graham $
# $Revision: 1.17 $ $Date: 2004/03/26 21:26:23 $
#--------+---------+---------+---------+---------+---------+---------+---------+
#        1         2         3         4         5         6         7         8

import sys
import string
import StringIO

from N3Exception import N3Exception
from N3Node import N3Node, NameStartChar, NameChar
from N3Statement import N3Statement

"""
N3document     = directive* statement-list
directive      = "@prefix" prefix ":" uri-ref2 "."  // Namespace declaration
               | "@prefix" ":" uri-ref2 "."         // Default namespace
               | "@equivalence" uri-ref2 "."        // Alternative to daml:equivalent
               | "@listfirst" uri-ref2 "."          // Alternative to rdf:first
               | "@listrest" uri-ref2 "."           // Alternative to rdf:rest
               | "@listnull" uri-ref2 "."           // Alternative to rdf:nil
               | "@plus" uri-ref2 "."               // Alternative to operator:plus
               | "@minus" uri-ref2 "."              // Alternative to operator:minus
               | "@slash" uri-ref2 "."              // Alternative to operator:slash
               | "@star" uri-ref2 "."               // Alternative to operator:star
statement-list = [ statement ( "." statement )* ]
statement      = subject property-list
subject        = node
object         = lit-node
property-list  = [ property ( ";" property )* ]
property       = verb object-list
               | ":-" anon-node     // Creates anon-node aongside the current node
verb           = ">-" prop "->"     // has 'prop' of
               | "<-" prop "<-"     // is 'prop' of
               | operator           // has operator:'operator' of (???)
               | prop               // has 'prop' of -- shorthand
               | "has" prop "of"    // has 'prop' of
               | "is" prop "of"     // is 'prop' of
               | "a"                // has rdf:type of
               | "="                // has daml:equivalent of
object-list    = object | object "," object-list
anon-node      = "[" property-list "]"      // Something with given properties
               | "{" statement-list "}"     // List of reified statements as resource
               | "(" node-list ")"          // Construct list with
                                            // rdf:first, rdf:rest, rdf:nil
prop           = uri-ref2
operator       = "+"    // >- operator:plus ->
               | "-"    // >- operator:minus ->
               | "/"    // >- operator:slash ->
               | "*"    // >- operator:star->
node-list      = lit-node*
lit-node       = node
               | '"' constant-value '"'
               | '"""' constant value '"""' // Including single or double occurences of
                                            // quotes and/or newlines
node           = uri-ref2 | nodeid | anon-node
uri-ref2       = qname | "<" URI-reference ">" | "this"
qname          = prefix ":" local-name
prefix         = name   // Namespace prefix
local-name     = name   // Local name (namespace qualified)
name           = alpha alphanumeric*
alpha          = "a"-"z" | "A"-"Z" | "_"
alphanumeric   = alpha | "0"-"9"
URI-reference  = (conforming to syntax in RFC2396)
"""

#--------+---------+---------+---------+---------+---------+---------+---------+

# Define N3 package global values

true  = 1
false = 0

MaxInputLen = 1000      # Input buffer length

#--------+---------+---------+---------+---------+---------+---------+---------+

# Define N3Parser package exception conditions
#
# N3 lexical error (invalid token)
#
class N3LexicalError(N3Exception):
    """Exception raised for lexical errors in the input.

    Attributes:
        message -- explanation of the error
    """
    def __init__(self, message):
        self.message = message

# N3 syntax error
#
class N3SyntaxError(N3Exception):
    """Exception raised for syntax errors in the N3 input.

    Attributes:
        message -- explanation of the error
    """
    def __init__(self, message):
        self.message = message

# N3 directive error
#
class N3DirectiveError(N3Exception):
    """Exception raised for directive error in the N3 input.

    Attributes:
        message -- explanation of the error
    """
    def __init__(self, message):
        self.message = message

# N3 namespace error
#
class N3NamespaceError(N3Exception):
    """Exception raised for namespace prefix error in the N3 input.

    Attributes:
        message -- explanation of the error
    """
    def __init__(self, message):
        self.message = message

# N3 statement error
#
class N3StatementError(N3Exception):
    """Exception raised for statement error in the N3 input.

    Attributes:
        message -- explanation of the error
    """
    def __init__(self, message):
        self.message = message

#--------+---------+---------+---------+---------+---------+---------+---------+

# Main parser class
#
class N3Parser:
    """
    Read and parse N3 to a parse tree.

    This class builds an in-memory parse tree but the tree handling
    methods may be overridden to do other things with the N3 code,
    including creation and addition of RDF triples to some database.

    N3 code is read from a supplied file-like object using the 'read()'
    method, with or without size parameter.

    As the N3 code is read, a left-to-right parse tree traversal is
    performed and corresponding methods are called.

    Normal behaviour of this class is to store the parse tree in memory,
    returned as a result of the Parse() method.  The tree is made from
    N3Node and N3Statement objects.

    Method Parse accepts an input stream and returns a formula node
    that contains the parsed N3 formula from the input stream.

    Parameters:
        DefaultPrefix   specify 0 or None if no default prefixes are to be
                        supplied, or 1 if only those prefixes corresponding
                        to predefined URIs are to be supplied.
                        By default, a range of default prefix values are
                        supplied.
    """

    def __init__( self, DefaultPrefix=2 ):
        """
        Initialize N3Parser object: build the namespace prefix table
        selected by DefaultPrefix, and the tables of special URIs,
        reserved keywords and directive names used while parsing.
        """
        # Default default prefix
        ### self.DefaultPrefix = "#"
        # Table of defined prefix values
        if ( DefaultPrefix is None ) or ( DefaultPrefix == 0 ):
            self.PrefixTable = {}
        elif DefaultPrefix == 1:
            self.PrefixTable = \
                { "rdf":      "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
                  "rdfs":     "http://www.w3.org/2000/01/rdf-schema#",
                  "n3":       "http://id.ninebynine.org/wip/2002/n3/",
                  "operator": "http://id.ninebynine.org/wip/2002/operator/"
                }
        else:
            self.PrefixTable = \
                { "rdf":      "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
                  "rdfs":     "http://www.w3.org/2000/01/rdf-schema#",
                  "n3":       "http://id.ninebynine.org/wip/2002/n3/",
                  "rdfc":     "http://id.ninebynine.org/wip/2002/rdfc/",
                  "operator": "http://id.ninebynine.org/wip/2002/operator/",
                  "daml":     "http://www.daml.org/2000/10/daml-ont#",
                  "logic":    "http://www.w3.org/2000/10/swap/log.n3#",
                  "":         "http://id.ninebynine.org/default/"
                }
        # Table for default-setting directives
        # (values are QNames, resolved through the prefix table by MakeNode)
        self.DefaultURI = \
            { "a":         "rdf:type",
              "equals":    "n3:equivalent",
              "listfirst": "rdf:first",
              "listrest":  "rdf:rest",
              "listnull":  "rdf:nil",
              "contains":  "rdfs:contains",
              "plus":      "operator:plus",
              "minus":     "operator:minus",
              "slash":     "operator:slash",
              "star":      "operator:star"
            }
        # Table of reserved property keywords
        self.Keywords = \
            ( "a", "has", "is", "of", "this" )
        # Table of directive names
        self.Directives = \
            ( "@prefix", "@equivalence",
              "@listfirst", "@listrest", "@listnull", "@contains",
              "@plus", "@minus", "@slash", "@star" )
        self.SpecialURI = self.DefaultURI.copy()    # Init copy of special URIs
        return

    def Parse( self, Input ):
        """
        Parse the supplied Input file using the current parser state.

        Input is a file-like object; it is read with Input.read(size).

        Errors are not raised to the caller: they are reported on
        self.ErrOut (sys.stdout), and the document parser then returns
        None instead of a formula.

        Returns a formula node corresponding to the parsed document
        """
        self.SpecialURI  = self.DefaultURI.copy()   # Reset special URIs
        self.ErrOut      = sys.stdout   # Default stream for error reports
        self.LineNum     = 1            # Line counter
        self.CharPos     = 0            # Character position
        self.LineBuf     = []           # Line buffer (so far)
        self.LinePrv     = []           # Previous line buffer
        self.Input       = Input        # Save input file object
        self.EndInput    = None         #
        self.LineNo      = 0            # Initialize line counter
        self.CharNo      = 0            # Initialise character counter
        self.EOFCount    = 0            # (counter to break EOF loops)
        self.NextToken   = None         # Initialize next (current) token
        self.TokenType   = None         # Initialize token type
        self.TermToken   = ' '          # Initialize token terminator
        self.InputBuffer = ""           # Initialize input buffer
        self.InputCursor = 0            # Initialize input buffer cursor
        # The error flag must be cleared BEFORE the first token is scanned:
        # a lexical error reported while priming the tokenizer would
        # otherwise be wiped out and the document treated as error-free.
        self.ErrorSeen   = None         # No error seen
        self.ScanToken()                # Get next token
        return self.ParseN3Document()   # Parse document

    def getPrefixTable( self ):
        """
        Return prefix table, as a dictionary.
        """
        return self.PrefixTable

    def getSpecialURI( self ):
        """
        Return table of special URIs, as a dictionary
        """
        return self.SpecialURI

    def MakeNode( self, QName ):
        """
        Construct a node given a QName

        QName has the form 'prefix:local-name'.  An undefined prefix is
        reported as a namespace error and a placeholder namespace string
        is used, so that a node is still returned.
        """
        i = QName.find( ":" )
        p = QName[:i]
        l = QName[i+1:]
        if p not in self.PrefixTable:
            self.NamespaceError( "Undefined prefix: "+p )
            n = "//??Unknown-namespace//prefix="+p+"//"
        else:
            n = self.PrefixTable[p]
        return N3Node( ns=n, name=l )

    def getQName( self, Node ):
        """
        Return QName for a node, or None
        """
        return Node.getQName( self.PrefixTable )

    def getNodeName( self, Node ):
        """
        Return convenient description for a node:
        QName, string or blank node id
        """
        return Node.getNodeName( self.PrefixTable )

    #---------------------------------------------------------------------------
    #  Syntax analysis methods
    #---------------------------------------------------------------------------

    # This method parses a full N3 document from the input stream.
# # N3document = directive* statement-list # # Return: # A formula node containing the parsed N3 document # def ParseN3Document( self ): while ( self.TokenType == "@directive" ) and ( not self.EndInput ): self.ParseN3Directive() Formula = None while not self.EndInput: Formula = self.ParseN3StatementList( Formula ) if ( self.TokenType != "." ) and ( not self.EndInput ): self.SyntaxError( "Expected '.' or end of input" ) while ( self.TokenType != "." ) and ( not self.EndInput ): self.ScanToken() if ( self.TokenType == "." ): self.ScanToken() if self.ErrorSeen: Formula = None return Formula # Parse an N3 directive. # # Currently, only @prefix is recognized # # directive = "@prefix" prefix ":" uri-ref2 "." # | "@prefix" ":" uri-ref2 "." # # This method advances over the directive and concluding period. # def ParseN3Directive( self ): try: Token = self.NextToken self.ScanToken() if Token == "@prefix": if self.TokenType != ":": raise N3DirectiveError( "Prefix ('name:') expected" ) PrefixName = self.NextToken self.ScanToken() if self.TokenType != "@uriref": raise N3DirectiveError( "URI reference expected" ) PrefixURI = self.NextToken self.PrefixTable[PrefixName] = PrefixURI ### self.debug( "@prefix "+PrefixName+": <"+PrefixURI+">" ) else: raise N3DirectiveError( "Unrecognized: " + Token ) self.ScanToken() if self.TokenType != ".": raise N3DirectiveError( "'.' expected" ) except N3DirectiveError, e: # Report error, scan to next "." self.SyntaxError( "N3 directive error: " + e.message ) while ( self.TokenType != "." ) and ( not self.EndInput ): self.ScanToken() # Scan past '.' self.ScanToken() return # Parse an N3 statement list, and return a formula node. # # statement-list = [ statement ( "." statement )* ] # # Parameters: # Formula a formula node to which the statements are collected, # or None. # # Return: # A formula node for the statement list -- the supplied parameter # if it is non-null. 
# def ParseN3StatementList( self, Formula ): if Formula == None: Formula = N3Node() Formula.makeFormula() while ( ( self.TokenType == ":" ) or ( self.TokenType == "@name" ) or ( self.TokenType == "@uriref" ) or ( self.TokenType == "this" ) or ( self.TokenType in ('{','[','(') ) ): self.ParseN3Statement( Formula ) if ( self.TokenType != "." ): break self.ScanToken() return Formula # Parse an N3 statement, which may contain several RDF statements. # # statement = subject property-list # subject = node # property-list = [ property ( ";" property )* ] # property = verb object-list # | ":-" anon-node # verb = prop // has 'prop' of # | ">-" prop "->" // has 'prop' of # | "has" prop "of" // has 'prop' of # | "<-" prop "<-" // is 'prop' of # | "is" prop "of" // is 'prop' of # | "a" // has rdf:type of # | "=" // has daml:equivalent of # | operator // has operator:'operator' of # # This method advances over the statement, # but not any concluding period. # # Parameters: # Formula receives new statements constructed with parsed objects # # Return: # An formula node corresponding to the parsed statements. # def ParseN3Statement( self, Formula ): try: Subject = self.ParseN3Node( Formula ) if Subject == None: raise N3StatementError( "invalid subject" ) self.ParseN3PropertyList( Formula, Subject ) except N3StatementError, e: # Report error, scan to likely end of statement self.SyntaxError( "Error in statement: " + e.message ) while ( ( self.TokenType not in ('.',')',']','}') ) and ( not self.EndInput ) ): self.ScanToken() return # Parse an N3 property list, adding statements to a supplied formula. # # property-list = [ property ( ";" property )* ] # property = verb object-list # | ":-" anon-node # # Parameters: # Formula is the formula to which parsed properties are added # Subject is the subject for making statements from the parsed # properties. 
# def ParseN3PropertyList( self, Formula, Subject ): try: while not self.EndInput: if self.TokenType == ":-": self.ScanToken() self.ParseN3AnonNode( Formula, Subject ) else: Property = self.ParseN3Verb() if Property: self.ParseN3ObjectList( Formula, Subject, Property ) else: raise N3StatementError( "invalid property" ) if self.TokenType != ";": break self.ScanToken() except N3StatementError, e: # Report error, scan to likely end of property self.SyntaxError( "Error in statement: " + e.message ) while ( ( self.TokenType not in (';','.',')',']','}') ) and ( not self.EndInput ) ): self.ScanToken() return # Parse an N3 object list # # object-list = object # | object "," object-list # object = lit-node # subject = node # # This method advances over the list of objects, # but not the terminating token. # # Parameters: # Formula receives new statements constructed with parsed objects # Subject of statements to which object values are applied # Property of statements to which object values are applied # def ParseN3ObjectList( self, Formula, Subject, Property ): try: while not self.EndInput: Object = self.ParseN3LitNode( Formula ) if Object: Statement = N3Statement( Subject, Property, Object ) Formula.addStatement( Statement ) else: raise N3StatementError( "invalid object" ) if self.TokenType != ",": break self.ScanToken() except N3StatementError, e: # Report error, scan to likely end of object self.SyntaxError( "Error in statement: " + e.message ) while ( ( self.TokenType not in (',',';','.',')',']','}') ) and ( not self.EndInput ) ): self.ScanToken() return # Parse an anon node and return its value # # anon-node = "[" property-list "]" // Something with given properties # | "{" statement-list "}" // List of statements as resource # | "(" node-list ")" // Construct list with # // rdf:first, rdf:rest, rdf:nil # property-list = [ property ( ";" property )* ] # node-list = lit-node* # # Parameters: # Formula receives new statements constructed with parsed objects # Subject of 
a statement to which the properties are applied, # or None. If supplied, this value is returned. # # Return: # The node corresponding to the anonymouse resource. If the supplied # Subject value is not None, that is the value returned. # # Raises: # N3StatementError if a syntax error is detected. Reporting and # recovery is passed to the calling routine. # def ParseN3AnonNode( self, Formula, Subject ): if self.TokenType == "{": # statement list as formula at subject node # (The new statements are collected at the subject node, # no new statements are added to the containing formula.) if Subject == None: Subject = N3Node() Subject.makeFormula() self.ScanToken() self.ParseN3StatementList( Subject ) if self.TokenType != '}': raise N3StatementError( "expected '}'" ) elif self.TokenType == "[": # Property list applied to subject node, # with new statements added to the containing formula if Subject == None: Subject = N3Node() self.ScanToken() self.ParseN3PropertyList( Formula, Subject ) if self.TokenType != ']': raise N3StatementError( "expected ']'" ) elif self.TokenType == "(": # Value list as first/rest list with subject as head # For each list member, add appropriate first/rest statements # to the containing formula. The supplied subject is ignored # (i.e. new list nodes are always created). 
PropertyFirst = self.MakeNode( self.SpecialURI['listfirst'] ) PropertyRest = self.MakeNode( self.SpecialURI['listrest'] ) NodeNil = self.MakeNode( self.SpecialURI['listnull'] ) self.ScanToken() Prev, Next = None, Subject while ( ( self.TokenType not in (',',';','.',')',']','}') ) and ( not self.EndInput ) ): # On each pass: # Member is new list element # Next is new blank list node # (may be original subject on first pass) # Prev is previous list node to link from # (always null on 1st pass) Member = self.ParseN3LitNode( Formula ) if Member == None: raise N3StatementError( "invalid list member" ) if not Next: Next = N3Node() s1 = N3Statement( Next, PropertyFirst, Member ) Formula.addStatement( s1 ) if Prev: s2 = N3Statement( Prev, PropertyRest, Next ) Formula.addStatement( s2 ) Subject = Subject or Next Prev = Next Next = None if Prev: s2 = N3Statement( Prev, PropertyRest, NodeNil ) Formula.addStatement( s2 ) else: Subject = NodeNil if self.TokenType != ')': raise N3StatementError( "expected ')'" ) else: raise N3StatementError( "expected anon node '{', '[' or '('" ) self.ScanToken() return Subject # Parse a literal or node, and return its description # # lit-node = node # | '"' constant-value '"' # | '"""' constant value '"""' # # Parameters: # Formula receives new statements constructed with parsed objects # # Return: # An N3Node corresponding to the parsed value, or None. # # Raises: # N3StatementError if a syntax error is detected. Reporting and # recovery is passed to the calling routine. # def ParseN3LitNode( self, Formula ): if self.TokenType == "@string": Node = N3Node( lit=self.NextToken ) self.ScanToken() else: Node = self.ParseN3Node( Formula ) return Node # Parse a node and return its description # # node = uri-ref2 # | nodeid # | anon-node # # Parameters: # Formula receives new statements constructed with parsed objects # # Return: # An N3Node corresponding to the parsed value, or None. # # Raises: # N3StatementError if a syntax error is detected. 
Reporting and # recovery is passed to the calling routine. # def ParseN3Node( self, Formula ): if ( self.TokenType in ('{','[','(') ): Node = self.ParseN3AnonNode( Formula, None ) elif (self.TokenType == ":") and (self.NextToken == "_"): # nodeid here; name must follow. self.ScanToken( KeywordTable=None ) if ( self.TokenType == "@name" ): ####self.debug( "nodeid: "+self.NextToken ) Node = N3Node( extid=self.NextToken ) self.ScanToken() else: raise N3StatementError( "Expected node identifier string (_:id)" ) else: Node = self.ParseN3Uriref( Formula ) return Node # Parse a verb and return its description # # verb = ">-" prop "->" // has 'prop' of # | "<-" prop "<-" // is 'prop' of # | operator // has operator:'operator' of (???) # | "has" prop "of" // has 'prop' of # | "is" prop "of" // is 'prop' of # | "a" // has rdf:type of # | "=" // has daml:equivalent of # | prop // has 'prop' of -- shorthand # operator = "+" // >- operator:plus -> # | "-" // >- operator:minus -> # | "/" // >- operator:slash -> # | "*" // >- operator:star-> # prop = uri-ref2 # # Return: # An N3Node corresponding to the parsed value # # Raises: # N3StatementError if a syntax error is detected. Reporting and # recovery is passed to the calling routine. 
# def ParseN3Verb( self ): if self.TokenType == '>-': self.ScanToken() Verb = self.ParseN3Uriref( None ) if Self.TokenType != '->': raise N3StatementError( "expected '->'" ) self.ScanToken() elif self.TokenType == '<-': self.ScanToken() Verb = self.ParseN3Uriref( None ) if Self.TokenType != '<-': raise N3StatementError( "expected '<-'" ) self.ScanToken() elif self.TokenType == '+': Verb = self.MakeNode( self.SpecialURI['plus'] ) self.ScanToken() elif self.TokenType == '-': Verb = self.MakeNode( self.SpecialURI['minus'] ) self.ScanToken() elif self.TokenType == '*': Verb = self.MakeNode( self.SpecialURI['star'] ) self.ScanToken() elif self.TokenType == '/': Verb = self.MakeNode( self.SpecialURI['slash'] ) self.ScanToken() elif self.TokenType == '=': Verb = self.MakeNode( self.SpecialURI['equals'] ) self.ScanToken() elif self.TokenType == 'has': self.ScanToken() Verb = self.ParseN3Uriref( None ) self.CheckVerbKeyword() if Self.TokenType != 'of': raise N3StatementError( "expected 'of'" ) self.ScanToken() elif self.TokenType == 'is': self.ScanToken() Verb = self.ParseN3Uriref( None ) Verb.setReversed( 1 ) # self.CheckVerbKeyword() if Self.TokenType != 'of': raise N3StatementError( "expected 'of'" ) self.ScanToken() elif self.TokenType == 'a': Verb = self.MakeNode( self.SpecialURI['a'] ) self.ScanToken() else: Verb = self.ParseN3Uriref( None ) if Verb == None: raise N3StatementError( "invalid property" ) return Verb # Parse a qname or URI node name and return its description # # uri-ref2 = qname # | "<" URI-reference ">" # | "this" # # qname = [ prefix ] ":" local-name # # Parameters: # Formula a node that contains the formula currently being # parsed (is represented by occurrences of 'this'). # # Return: # An N3Node corresponding to the parsed identifier, or None # # Raises: # N3StatementError if a syntax error is detected. Reporting and # recovery is passed to the calling routine. 
# def ParseN3Uriref( self, Formula ): if ( self.TokenType == '@uriref' ): Node = N3Node( uri=self.NextToken ) self.ScanToken() elif self.TokenType == ':': pre = self.NextToken self.ScanToken( KeywordTable=None ) if self.TokenType != '@name': raise N3StatementError( "Invalid node ref: name expected" ) Node = self.MakeNode( pre + ':' + self.NextToken ) self.ScanToken() elif self.TokenType == 'this': if not Formula: self.SyntaxError( "'this' used where or qname expected" ) Node = Formula self.ScanToken() else: raise N3StatementError( "Invalid node: or qname expected" ) return Node #--------------------------------------------------------------------------- # Lexical scanning methods #--------------------------------------------------------------------------- # Scan text token from input # # Token is: # alpha alphanumeric* ':' // (prefix) alpha includes "_" # ':' // (default prefix) # alphanumeric+ # reserved-keyword # string # '>-' # '->' # '<-' # '-<' # ':-' # char # # Parameters: # KeywordTable is a table of name values that may be recognized # as reserved keywords, '1' to use the default keytword table # for this class, or None if no keywords are to be recognized # for this call. # This parameter allows a degree of context sensitivity for keyword # recognition. Note that a name immediately followed by a ':' is # always recognized as a prefix, not a keyword. # # The token string or value is loaded into self.NextToken. # The value "@name", "@number", "@string", "@directive", # "@uriref", "@end", or the token string itself is # loaded into self.TokenType. # If the token is a QName prefix, the token type is ':' and # the token value is the prefix name not including the ':'. # If the token is a recognized keyword, then the token type is # the keyword. 
# The token terminator character is loaded into self.TokenTerm # # The value returned is the token type # def ScanToken( self, KeywordTable=1 ): # Skip whitespace c = self.SkipWhitespace( self.TermToken ) self.NextToken = None if self.EndInput: self.TokenType = "@end" elif c in NameStartChar: # Name or keyword or prefix here self.NextToken = "" ### while c in NameChar: while ( c in NameChar ) or ( c == '-' ): self.NextToken += c c = self.NextChar() if c == ':': c = self.NextChar() self.TokenType = ":" elif ( KeywordTable == 1 ): if self.NextToken in self.Keywords: self.TokenType = self.NextToken elif KeywordTable: ### self.debug( "KeywordTable: "+str(KeywordTable) ) if ( self.NextToken in KeywordTable ): self.TokenType = self.NextToken else: self.TokenType = "@name" elif c in string.digits: # Name here self.NextToken = "" while c in NameChar: self.NextToken += c c = self.NextChar() self.TokenType = "@name" elif c == ':': c = self.NextChar() if c == '-': c = self.NextChar() self.NextToken = ":-" self.TokenType = self.NextToken else: self.NextToken = "" self.TokenType = ":" elif c == '"': # String here self.NextToken = "" c = self.AppendNextStringChar() if c == '"': # Null or triple-quote c = self.AppendNextStringChar() if c == '"': # triple-quote while not self.EndInput: c = self.AppendNextStringChar() while ( c != '"' ) and ( not self.EndInput ): c = self.AppendNextStringChar() c = self.NextStringChar() if c == '"': c = self.AppendNextStringChar() if c == '"': break ; else: self.NextToken += '""' else: self.NextToken += '"' self.NextToken += c # Closing triple-quote seen: skip last quote c = self.NextChar() else: # null string self.NextToken = "" else: # single quote while ( c != '"' ) and ( not self.EndInput ): c = self.AppendNextStringChar() c = self.NextChar() self.TokenType = "@string" elif c == '<': # scan uriref or '<-' c = self.NextChar() if c == '-': self.NextToken = "<-" self.TokenType = self.NextToken else: self.NextToken = "" while ( c != '>' ) and ( not 
self.EndInput ): if ( ord(c) < 32 ) or ( ord(c) >= 127 ) or self.EndInput: self.LexicalError( "Invalid character in URI" ) break self.NextToken += c c = self.NextChar() self.TokenType = "@uriref" c = self.NextChar() elif c == '>': c = self.NextChar() if c == '-': self.NextToken = ">-" c = self.NextChar() else: self.NextToken = ">" self.TokenType = self.NextToken elif c == '-': c = self.NextChar() if c == '<': self.NextToken = "<-" c = self.NextChar() elif c == '>': self.NextToken = ">-" c = self.NextChar() else: self.NextToken = "-" self.TokenType = self.NextToken elif c == '@': # scan directive self.NextToken = "@" c = self.NextChar() while c in NameChar: self.NextToken += c c = self.NextChar() if self.NextToken in self.Directives: self.TokenType = "@directive" else: self.LexicalError( "Unrecognized directive" ) else: self.NextToken = c self.TokenType = c c = self.NextChar() # common wrap-up self.TermToken = self.SkipWhitespace( c ) ### self.debug( "Next token : '"+str(self.NextToken)+"', type '"+str(self.TokenType)+"'" ) return self.TokenType # Scan next string character and append to self.NextToken, # decoding escape sequences as appropriate, # unless it is a quote in which case just return it. # The value returned is the character scanned, or # a two-character excape sequence. def AppendNextStringChar( self ): c = self.NextStringChar() if c == '"': return c self.AppendStringChar( c ) return c # Append supplied string character to self.NextToken. # The leading '\' of any escape sequence is dropped. def AppendStringChar( self, c ): if c[0] == '\\': c = c[1:] self.NextToken += c return # Scan next string character and return it, # decoding escape sequences as appropriate. # The value returned is the character scanned, or # a two-character excape sequence, or a null string. 
def NextStringChar( self ): c = self.NextChar() if c == '\\': # Process escape e = self.NextChar() if e == '\n': NextTokenChar = '' # ignore \, newline elif e == '\\': NextTokenChar = '\\'+e elif e == "'": NextTokenChar = '\\'+e elif e == '"': NextTokenChar = '\\'+e elif e == 'n': NextTokenChar = '\\\n' elif e == 'r': NextTokenChar = '\\\r' elif e == 't': NextTokenChar = '\\\t' elif e == 'u': v = ScanHexNumber( 4 ) NextTokenChar = '\\'+unichr( v ) elif e == 'U': v = ScanHexNumber( 8 ) NextTokenChar = '\\'+unichr( v ) else: NextTokenChar = '\\'+e else: if ( c == "\n" ) or ( c == "\r" ) or ( c == "\t" ): NextTokenChar = c elif ( ord(c) < 32 ) or ( ord(c) >= 127 ) or self.EndInput: self.LexicalError( "Invalid character in string (" + str(ord(c)) + ")" ) while ( ( c != "'" ) and ( c != '"' ) and ( not self.EndInput ) ): c = self.NextChar() NextTokenChar = '' else: NextTokenChar = c return NextTokenChar # Scan hexadecimal number of exact number of digits and return its value def ScanHexNumber( self, count ): v = 0 for i in range( count ): d = self.NextChar() n = string.find( string.hexdigits, d ) if n < 0 : self.LexicalError( "Invalid escape sequence in string" ) break if n >= 16: n -= 6 v = v*16 + n return v # Skip whitespace and return terminating character # # Parameter: # c the next input character # # Returns: # The next non-whitespace, non-comment input character # def SkipWhitespace( self, c ): while ( c in string.whitespace ) and ( not self.EndInput ): c = self.NextChar() if ( c == '#' ): # Skip comment while ( c != '\n' ) and ( not self.EndInput ): c = self.NextChar() return c # Return next character from input stream # set self.EndInput and return a space when end seen # Keep track of line number, character position and current # line for reporting purposes. 
def NextChar( self ):
    # NextChar full implementation follows,
    # to deal fully with input and line buffer management.
    #
    # Returns the next input character, refilling self.InputBuffer from
    # self.Input when the cursor runs off the end of the buffer.  At end
    # of file self.EndInput is set and a single space is returned (again
    # on every later call).  Each character returned is also appended to
    # self.LineBuf, and line/character counters are kept up to date, for
    # use in error reports.
    #
    # Raises IOError if the refill loop runs more than 1000000 times.
    #
    # NOTE(review): 'true' and 'MaxInputLen' are not defined in this part
    # of the file -- presumably module-level names; confirm.
    i = self.InputCursor
    while ( i >= len( self.InputBuffer ) ):
        i = 0
        if not self.EndInput:
            self.InputBuffer = self.Input.read( MaxInputLen )
        if len( self.InputBuffer ) == 0:
            # end of file
            self.EndInput    = true
            self.InputBuffer = ' '
        # Hack to break out of indefinite looping at end of file
        # (This only happens if there are code bugs)
        # NOTE(review): counted once per pass through this refill loop,
        # which also includes ordinary buffer refills -- confirm this
        # placement against the original layout.
        self.EOFCount += 1
        if self.EOFCount > 1000000:
            raise IOError( "End of file" )
    c = self.InputBuffer[i]
    self.InputCursor = i+1
    self.LineBuf.append( c )
    self.CharPos += 1
    if ( c == '\n' ):
        # Start a new current line, keeping the one just completed so
        # that error reports can show it
        self.LinePrv  = self.LineBuf
        self.LineBuf  = []
        self.CharPos  = 0
        self.LineNum += 1
    return c

#---------------------------------------------------------------------------
#  Error reporting methods
#---------------------------------------------------------------------------

# Report a syntax error at the current input position
def SyntaxError( self, msg ):
    self.ReportError( "Syntax error", msg )
    return

# Report a lexical (tokenizer) error at the current input position
def LexicalError( self, msg ):
    self.ReportError( "Lexical error", msg )
    return

# Report a namespace error at the current input position
def NamespaceError( self, msg ):
    self.ReportError( "Namespace error", msg )
    return

# Write an error report to self.ErrOut and record the error kind
#
# Parameters:
#   err     the kind of error, e.g. "Syntax error"
#   msg     descriptive text for this particular error
#
# The report gives the line and character position, the message, the
# previous input line (if any) and the current line as far as it has been
# read, marked with "<<--", followed by the current token and its type.
# self.ErrorSeen is set to 'err'.
#
def ReportError( self, err, msg ):
    out = self.ErrOut
    out.write( err+" at line "+str(self.LineNum)+
               ", char "+str(self.CharPos)+"\n" )
    out.write( msg+"\n" )
    # The line buffers are lists of single characters
    if self.LinePrv:
        out.write( "".join( self.LinePrv ) )
    out.write( "".join( self.LineBuf ) )
    out.write( "<<-- \n" )
    out.write( "NextToken: "+str(self.NextToken)+
               ", TokenType: "+str(self.TokenType)+"\n" )
    out.write( "--------------------\n" )
    self.ErrorSeen = err
    return

# Write a diagnostic message to standard output
def debug( self, msg ):
    sys.stdout.write( "N3Parser: "+msg+"\n" )
    return

# End of N3Parser

# Stand-alone test code follows:
# parse a short N3 document and display the statements found.

if __name__ == '__main__':
    N3ShortInput = ''' @prefix rdf: . @prefix rdfs: . @prefix foaf: . @prefix hdr: . # temporary ???
a hdr:HeaderField ; hdr:fieldName "Content-features" ; hdr:protocol [ hdr:protocolName "mail" ; hdr:specification [ = ; hdr:document ] ] . _:foo hdr:prop "o1" . _:foo hdr:prop "o2" . '''

    # Longer test document (not used by default)
    N3Input = ''' @prefix rdf: . @prefix rdfs: . @prefix foaf: . @prefix hdr: . # temporary ??? @prefix log: . this log:forAll , ; log:forSome , . hdr:form11 hdr:prop11 hdr:form11 ; :- { hdr:subj12 hdr:prop12 hdr:obj12 ; hdr:prop12 this } . hdr:subj21 hdr:prop21 { hdr:subj22 hdr:prop22 hdr:obj22 , this } . a hdr:HeaderField ; hdr:fieldName "Content-features" ; rdfs:label "Indicates content features of a MIME body part" ; hdr:protocol [ hdr:protocolName "mail" ; hdr:specification [ = ; hdr:document ] ] ; hdr:status "standards-track" ; hdr:author [ foaf:name "Graham Klyne" ; foaf:mbox "GK-headers@ninebynine.org" ; foaf:workplacePostal [ foaf:building "Clearswift Corporation" ; foaf:street """1310 Waterside, Arlington Business Park, Theale""" ; foaf:city "Reading" ; foaf:area "Berks" ; foaf:postcode "RG7 4SA" ; foaf:country "UK" ] ; foaf:workplaceTel "011 8903 8903" ; foaf:workplaceFax "011 8903 9000" ; foaf:workplaceHomepage ] ; hdr:specification [ = ; hdr:document ; hdr:section "3" ] ; rdfs:comment """The 'Content-features:' header can be used to annotate a MIME body part with a media feature expression, to indicate features of the body part content. "quoted" word. See also: RFC 2533, RFC 2506, RFC 2045.""" .
'''

    InputStr = StringIO.StringIO( N3ShortInput )
    sys.stdout.write( "*"*60 + "\n" )
    sys.stdout.write( "Calling parser\n" )
    Parser  = N3Parser()
    Formula = Parser.Parse( InputStr )
    sys.stdout.write( "Displaying\n" )
    for s in Formula.getStatements():
        sys.stdout.write( Parser.getNodeName( s.getSubject() )  + " " )
        sys.stdout.write( Parser.getNodeName( s.getProperty() ) + " " )
        sys.stdout.write( Parser.getNodeName( s.getObject() )   + " .\n" )
    p = Parser.getPrefixTable()
    # Display nested formulae: any subject or object (in that order) that
    # is itself a formula, other than the top-level one
    for s in Formula.getStatements():
        for getNode in ( s.getSubject, s.getObject ):
            f = getNode()
            if f.isFormula() and f != Formula:
                sys.stdout.write( "Formula "+f.getNodeName(p)+"\n  { " )
                for s1 in f.getStatements():
                    sys.stdout.write( Parser.getNodeName( s1.getSubject() )  + " " )
                    sys.stdout.write( Parser.getNodeName( s1.getProperty() ) + " " )
                    sys.stdout.write( Parser.getNodeName( s1.getObject() )   + " . " )
                sys.stdout.write( " }\n" )
    sys.stdout.write( "Exiting\n" )

#--------+---------+---------+---------+---------+---------+---------+---------+
#
# $Log: N3Parser.py,v $
# Revision 1.17  2004/03/26 21:26:23  graham
# Bug-fixes imn report generation software to support new header
# field registry document generation.  Also added some generic
# variable binding options to N3GenReport.
# # Revision 1.16 2002/12/11 13:51:49 graham # Fix up some URI initialization problems # # Revision 1.15 2002/09/19 09:37:08 graham # Added facility to scan contents of list (using rdf:first, rdf:rest) # # Revision 1.14 2002/09/12 14:53:44 graham # Fixed quoted string in triple-quoted string lexer bug # # Revision 1.13 2002/05/07 15:55:39 graham # Fixed up handling of 'this' (refers to containing formula node) # # Revision 1.12 2002/05/07 10:54:16 graham # Fixes to accept N3 output: options to control default prefix table, accept 'this' as node name (treated as <#>), allow '-' in qnames (not strictly valid N3?) # # Revision 1.11 2002/05/03 14:50:47 graham # Fixed error recovery bug in lexical scanner # # Revision 1.10 2002/05/01 15:43:48 graham # Cleanup small details # # Revision 1.9 2002/04/28 17:01:45 graham # Partial port to Jython-21 (but having problems with os module support) # # Revision 1.8 2002/04/28 11:12:19 graham # Further improvements to N3 parser, and extend options for N3SyntaxCheck. # # Revision 1.7 2002/04/27 18:06:48 graham # Fix bugs in N3 parser, add N3 syntax checker program # # Revision 1.6 2002/04/26 09:43:52 graham # Updated TODO # # Revision 1.5 2002/04/25 19:00:13 graham # Update CVS keyword fields # # Revision 1.4 2002/04/25 18:53:26 graham # Add copyright and disclaimer notices # # Revision 1.3 2002/04/24 14:36:33 graham # Save edits # # Revision 1.2 2002/04/23 17:09:57 graham # Basic query and report generation works # # Revision 1.1 2002/04/19 22:07:39 graham # Separated into smaller modules; basic N3Model functions working # (Modules renamed, hence version number reset) # # Revision 1.3 2002/04/18 07:39:42 graham # Added TODO # # Revision 1.2 2002/04/17 23:41:31 graham # Basic N3 parser is working #