-- $Id: Lex.hs,v 1.20 2004/06/24 17:48:36 graham Exp $ -------------------------------------------------------------------------------- -- | You don't normally need to use this Lex module directly - it is -- called automatically by the parser. (This interface is only exposed -- for debugging purposes.) -- -- This is a hand-written lexer for tokenising the text of an XML -- document so that it is ready for parsing. It attaches position -- information in (line,column) format to every token. The main -- entry point is 'xmlLex'. A secondary entry point, 'xmlReLex', is -- provided for when the parser needs to stuff a string back onto -- the front of the text and re-tokenise it (typically when expanding -- macros). -- -- As one would expect, the lexer is essentially a small finite -- state machine. module Text.XML.HaXml.Lex ( -- * Entry points to the lexer xmlLex -- :: String -> String -> [Token] , xmlReLex -- :: Posn -> String -> [Token] , xmlLexTextDecl -- :: String -> Maybe Posn -> String -> [Token] , xmlLexGESub -- :: String -> String -> [Token] , xmlLexGESubAttr -- :: String -> String -> [Token] , xmlLexPESub -- :: Posn -> String -> [Token] , xmlLexEntitySub -- :: String -> Maybe Posn -> String -> [Token] , xmlLexEntity -- :: Posn -> String -> [Token] , xmlLexString -- :: Posn -> String -> [Token] -- * Token and position types , XMLToken, makeXMLToken, tokenPos, tokenVal, tokenSat , Posn(..), testPosn , TokenT(..), tokEQci , Special(..) , Section(..) ) where import Text.XML.HaXml.Unicode ( isXmlChar , isXmlNameChar , isXmlNameStartChar , isXmlSpaceChar ) import Text.ParserCombinators.Token ( Token(..), Position(..), TokenPos(..) ) import Char -- A list of 'Where' values are used to indicate the current -- context of token processing. In the table below, the -- 'where' column indicates the list value before encountering -- any of the tokens in the 'document' column. 
-- The final token in each line is the one that triggers a change in the
-- 'where' value, noted on the next line.
--
-- NOTE(review): the table below appears to have lost part of its text
-- (the markup-opening lexemes and the 'InTag' labels that went with
-- them); confirm against the upstream file before relying on it.
--
--      Where                       Document
--      -----                       --------
--      []                          "] DOCTYPE blah blah >
--      []                          <
--      [InTag "<...>",NotInTag]    selfclosingtag />
--      []                          <
--      [InTag "<...>",NotInTag]    elementtag >
--      [NotInTag]                  freetext
--      []
--
data Where = InTag String | NotInTag
    deriving (Eq)

-- | All tokens are paired up with a source position.
--   Lexical errors are passed back through the @Either@ type.
type XMLToken = (Posn, Either String TokenT)

-- The "next token" queries look only at the head of the token list; the
-- second argument of 'nextF' is the answer given for an empty list.
instance TokenPos XMLToken where
    tokenValEq t1 t2 = tokenVal t1 == tokenVal t2
    nextTokenValid   = nextF (either (const False) tokenValid . snd) False
    nextTokenEof     = nextF (either (const False) tokenEof . snd) True
    nextTokenPosEq   = nextF2 (nextF tokenPos posEof) (==)
    nextTokenPosShow = nextF showPos "(EOF)"
    nextTokenErrShow = nextF (either id tokenErrShow . snd) "(at:EOF)"

-- | Build a (non-error) token at the given position.
makeXMLToken :: Posn -> TokenT -> XMLToken
makeXMLToken p t = (p, Right t)

-- | The source position of a token.
tokenPos :: XMLToken -> Posn
tokenPos = fst

-- | The value of a token.  Calls 'error' with the lexer's message when
--   the token is a lexical error.
tokenVal :: XMLToken -> TokenT
tokenVal = either error id . snd

-- | Lift a predicate on token values to tokens; lexical errors never
--   satisfy it.
tokenSat :: (TokenT -> Bool) -> (XMLToken -> Bool)
tokenSat p (_, Right t) = p t
tokenSat _ _            = False

-- Render a token's position for diagnostics, together with the token
-- that was found there when it is not a lexical error.
showPos :: XMLToken -> String
showPos (p, Left _)  = show p   -- " (" ++ e ++ ")"
                                -- lexical error becomes part of message
showPos (p, Right t) = show p ++ " (found: " ++ show t ++ ")"

{-
showTok :: Either String TokenT -> String
showTok (Left e)  = e
showTok (Right t) = show t
-}

-- Local helper function: apply @f@ to the head of a list, or return the
-- default when the list is empty.
nextF :: (a -> b) -> b -> [a] -> b
nextF f _   (a:_) = f a
nextF _ def _     = def

-- Local helper function: apply @f@ to both lists and combine the results.
nextF2 :: ([a] -> b) -> (b -> b -> c) -> [a] -> [a] -> c
nextF2 f comb a1 a2 = comb (f a1) (f a2)

-- | Source positions contain a filename, line, column, and an
--   inclusion point, which is itself another source position,
--   recursively.
data Posn = Pn String !Int !Int (Maybe Posn)
          | PEOF
    deriving (Eq)

instance Show Posn where
    showsPrec _ (Pn f l c i) =
        showString f .
        showString " line " . shows l .
        showString " col " . shows c .
        maybe id
              (\incl -> showString "(used by " . shows incl . showString ")")
              i
    showsPrec _ PEOF = showString "EOF"

instance Position Posn where
    posEof = PEOF

-- | A fixed position (line 1, column 1 of a source named "Test").
testPosn :: Posn
testPosn = Pn "Test" 1 1 Nothing

-- | The basic token type.
--
-- The closing and compound constructors ('TokCommentClose', 'TokPIClose',
-- 'TokSectionClose', 'TokSpecial', 'TokEndOpen', 'TokEndClose') are all
-- emitted by the lexer in this module, so they must be declared here.
data TokenT =
      TokCommentOpen            -- ^   \<!--
    | TokCommentClose           -- ^   -->
    | TokPIOpen                 -- ^   \<?
    | TokPIClose                -- ^   ?>
    | TokSectionOpen            -- ^   \<![
    | TokSectionClose           -- ^   ]]>
    | TokSection Section        -- ^   CDATA INCLUDE IGNORE etc
    | TokSpecialOpen            -- ^   \<!
    | TokSpecial Special        -- ^   DOCTYPE ELEMENT ATTLIST etc
    | TokEndOpen                -- ^   \<\/
    | TokEndClose               -- ^   \/>
    | TokAnyOpen                -- ^   \<
    | TokAnyClose               -- ^   >
    | TokSqOpen                 -- ^   \[
    | TokSqClose                -- ^   \]
    | TokEqual                  -- ^   =
    | TokQuery                  -- ^   ?
    | TokStar                   -- ^   \*
    | TokPlus                   -- ^   +
    | TokAmp                    -- ^   &
    | TokSemi                   -- ^   ;
    | TokHash                   -- ^   #
    | TokBraOpen                -- ^   (
    | TokBraClose               -- ^   )
    | TokPipe                   -- ^   |
    | TokPercent                -- ^   %
    | TokComma                  -- ^   ,
    | TokQuote                  -- ^   \'\' or \"\"
    | TokName      String       -- ^   begins with letter
    | TokFreeText  String       -- ^   any character data
    | TokNull                   -- ^   fake token
    deriving (Eq)

data Special =
      DOCTYPEx
    | ELEMENTx
    | ATTLISTx
    | ENTITYx
    | NOTATIONx
    deriving (Eq,Show)

data Section =
      CDATAx
    | INCLUDEx
    | IGNOREx
    deriving (Eq,Show)

-- Punctuation tokens are shown as the lexeme the lexer recognises for
-- them; names and free text are shown wrapped in their constructor name.
instance Show TokenT where
    showsPrec _ TokCommentOpen    = showString "<!--"
    showsPrec _ TokCommentClose   = showString "-->"
    showsPrec _ TokPIOpen         = showString "<?"
    showsPrec _ TokPIClose        = showString "?>"
    showsPrec _ TokSectionOpen    = showString "<!["
    showsPrec _ TokSectionClose   = showString "]]>"
    showsPrec p (TokSection s)    = showsPrec p s
    showsPrec _ TokSpecialOpen    = showString "<!"
    showsPrec p (TokSpecial s)    = showsPrec p s
    showsPrec _ TokEndOpen        = showString "</"
    showsPrec _ TokEndClose       = showString "/>"
    showsPrec _ TokAnyOpen        = showString "<"
    showsPrec _ TokAnyClose       = showString ">"
    showsPrec _ TokSqOpen         = showString "["
    showsPrec _ TokSqClose        = showString "]"
    showsPrec _ TokEqual          = showString "="
    showsPrec _ TokQuery          = showString "?"
    showsPrec _ TokStar           = showString "*"
    showsPrec _ TokPlus           = showString "+"
    showsPrec _ TokAmp            = showString "&"
    showsPrec _ TokSemi           = showString ";"
    showsPrec _ TokHash           = showString "#"
    showsPrec _ TokBraOpen        = showString "("
    showsPrec _ TokBraClose       = showString ")"
    showsPrec _ TokPipe           = showString "|"
    showsPrec _ TokPercent        = showString "%"
    showsPrec _ TokComma          = showString ","
    showsPrec _ TokQuote          = showString "' or \""
    showsPrec _ (TokName s)       = showString ("TokName("++s++")")
    showsPrec _ (TokFreeText s)   = showString ("TokFreeText("++s++")")
    -- NOTE(review): the text shown for TokNull is empty here; it may have
    -- been lost along with the other markup strings -- confirm upstream.
    showsPrec _ TokNull           = showString ""

-- | Case-insensitive matching of token value.  Only names and free text
--   are compared case-insensitively; every other token uses '=='.
tokEQci :: TokenT -> TokenT -> Bool
tokEQci (TokName s1)     (TokName s2)     = map toLower s1 == map toLower s2
tokEQci (TokFreeText s1) (TokFreeText s2) = map toLower s1 == map toLower s2
tokEQci t1               t2               = t1 == t2

-- This Token instance allows for a future move of information out of the
-- TokenPos instance into the Token instance.
instance Token TokenT where
    tokenValid     = not . tokenError
    tokenEof       = const False
    tokenError     = (== TokNull)
    tokenErrShow t = "(valid token: " ++ show t ++ ")"

--trim, revtrim :: String -> String
--trim    = f . f         where f = reverse . dropWhile isSpace
--revtrim = f.reverse.f   where f = dropWhile isSpace
--revtrim = reverse . dropWhile (=='\n')  -- most recently used defn.

-- | Pair a token with its source position.  The position's line and
--   column are forced first (via 'forcep') so that the column arithmetic
--   is evaluated as tokens are produced.
emit :: TokenT -> Posn -> XMLToken
emit tok p = forcep p `seq` (p, Right tok)

-- | A token stream consisting of a single lexical error at position @p@.
--   Lexing stops at the first error.
lexerror :: String -> Posn -> [XMLToken]
lexerror s p = [(p, Left s)]

-- Force the line and column of a position.  'PEOF' carries no counters,
-- so there is nothing to force.
forcep :: Posn -> Int
forcep (Pn _ n m _) = m `seq` n
forcep PEOF         = 0

-- | Advance a position by @n@ columns.  'PEOF' is left unchanged.
addcol :: Int -> Posn -> Posn
addcol n (Pn f r c i) = Pn f r (c+n) i
addcol _ PEOF         = PEOF

-- | 'newline' moves to column 1 of the next line; 'tab' moves to the next
--   tab stop.  Columns are 1-based (see 'newline'), so with a tab width
--   of 8 the stops are at columns 1, 9, 17, ...
--   'PEOF' is left unchanged by both.
newline, tab :: Posn -> Posn
newline (Pn f r _ i) = Pn f (r+1) 1 i
newline PEOF         = PEOF
tab (Pn f r c i)     = Pn f r ((((c-1) `div` 8) + 1) * 8 + 1) i
tab PEOF             = PEOF

-- | Position update for a single whitespace character.  Carriage return
--   does not move the position; any character not listed explicitly is
--   counted as one column wide rather than causing a match failure.
white :: Char -> Posn -> Posn
white ' '    = addcol 1
white '\n'   = newline
white '\r'   = id
white '\t'   = tab
white '\xa0' = addcol 1
white _      = addcol 1
-- Debug: white _      = (\p -> error ("white: match failure at: "++show p))

-- | Drop @n@ characters from the input, advance the position by @n@
--   columns, and hand both to the continuation @k@.
skip :: Int -> Posn -> String -> (Posn->String->[XMLToken]) -> [XMLToken]
skip n p s k = k (addcol n p) (drop n s)

-- | Skip leading whitespace (tracking the position) and then continue
--   with @k@.  Running out of input while the innermost context is a tag
--   is reported as a lexical error; otherwise end of input simply ends
--   the token stream.
blank :: ([Where]->Posn->String->[XMLToken])
         -> [Where]-> Posn-> String-> [XMLToken]
blank _ (InTag t:_) p []     = lexerror ("unexpected EOF within "++t) p
blank _ _ _ []               = []
blank k w p (' ': s)         = blank k w (addcol 1 p) s
blank k w p ('\t':s)         = blank k w (tab p) s
blank k w p ('\n':s)         = blank k w (newline p) s
blank k w p ('\r':s)         = blank k w p s
blank k w p ('\xa0': s)      = blank k w (addcol 1 p) s
blank k w p s                = k w p s

-- | @xs \`prefixes\` ys@ is True when @xs@ is a prefix of @ys@.
prefixes :: String -> String -> Bool
[]     `prefixes`   _    = True
(x:xs) `prefixes` (y:ys) = x==y && xs `prefixes` ys
(_:_)  `prefixes`   []   = False  --error "unexpected EOF in prefix"

-- | Accumulate free text up to (but not including) the terminator string.
--
-- Arguments, in order: terminator string, token to emit for the
-- terminator, text accumulated so far (reversed), position where the
-- accumulated text starts, current position, remaining input, and the
-- continuation to run after the terminator.
--
-- On finding the terminator this emits a 'TokFreeText' for the
-- accumulated text (even if it is empty), then the terminator token, then
-- continues with @k@ after the terminator.  End of input before the
-- terminator, or a character rejected by 'isXmlChar', is a lexical error.
accumulateUntil :: String -> TokenT -> String -> Posn -> Posn -> String
                   -> (Posn -> String -> [XMLToken]) -> [XMLToken]
accumulateUntil target _ _ pos p [] _ =
    lexerror ("unexpected EOF while looking for closing token "++target
              ++" to match the opening token in "++show pos) p
accumulateUntil target tok acc pos p input@(s:ss) k
    -- Matching on the whole terminator (rather than head and tail
    -- separately) also covers an empty terminator.
    | target `prefixes` input = emit (TokFreeText (reverse acc)) pos:
                                emit tok p:
                                skip (length target) p input k
    | isXmlSpaceChar s = accumulateUntil target tok (s:acc) pos (white s p) ss k
    | isXmlChar s      = accumulateUntil target tok (s:acc) pos (addcol 1 p) ss k
    | otherwise        = lexerror ("(accumulateUntil) illegal character") p

-- | @posInNewCxt name pos@ creates a new source position from an old one.
--   It is used when opening a new file (e.g. a DTD inclusion), to denote
--   the start of the file @name@, but retain the stacked information that
--   it was included from the old @pos@.
posInNewCxt :: String -> Maybe Posn -> Posn
posInNewCxt name pos = Pn name 1 1 pos

------------------------------------------------------------

-- | The first argument to 'xmlLex' is the filename (used for source positions,
--   especially in error messages), and the second is the string content of
--   the XML file.
xmlLex :: String -> String -> [XMLToken]
xmlLex filename = xmlAny [] (posInNewCxt ("file "++filename) Nothing)

-- | 'xmlReLex' is like 'xmlLex', except that it is used to re-tokenize
--   nested content starting from a specified position, and skips any
--   leading whitespace in the supplied input string.
xmlReLex :: Posn -> String -> [XMLToken]
xmlReLex p s = blank xmlAny [] p s

------------------------------------------------------------

-- | 'xmlLexTextDecl' is used to isolate a text declaration of an external
--   entity from the rest of the input file, which is returned as a free
--   text value.
--
-- When the input does not start with a processing-instruction opener the
-- whole input is returned as a single free text token.
xmlLexTextDecl :: String -> Maybe Posn -> String -> [XMLToken]
xmlLexTextDecl name pos s = textdecl s
  where
    -- NOTE(review): the context string passed to xmlName looks truncated;
    -- it is reproduced unchanged.
    textdecl ('<':'?':ss) = emit TokPIOpen p1 :
                            xmlName p2 ss " or PI"
                                (accCont "?>" TokPIClose xmlLexFreeText)
    textdecl ss           = xmlLexFreeText p1 ss
    p1 = posInNewCxt name pos
    p2 = addcol 2 p1       -- position just after the two-character opener
    accCont txt tok k p rest = accumulateUntil txt tok "" p p rest k

-- Return the entire remaining input as one free text token.
xmlLexFreeText :: Posn -> String -> [XMLToken]
xmlLexFreeText p s = [(p,Right $ TokFreeText s)]

------------------------------------------------------------

-- | 'xmlLexGESub' is a variation of 'xmlLex' used to tokenize general entity
--   replacement text when the entity reference occurs in document free text.
xmlLexGESub :: String -> String -> [XMLToken]
xmlLexGESub entname =
    xmlAny [NotInTag] (posInNewCxt ("entity "++entname) Nothing)

-- | 'xmlLexGESubAttr' is used to tokenize general entity replacement text
--   when the entity reference occurs in element attribute text.
xmlLexGESubAttr :: String -> String -> [XMLToken]
xmlLexGESubAttr entname =
    xmlLexString (posInNewCxt ("entity "++entname) Nothing)

-- | 'xmlLexPESub' is used when the parser expands a macro (PE reference),
--   where the parameter reference appears in the DTD outside of an
--   entity value.
--
-- A replacement text starting with INCLUDE or IGNORE yields the
-- corresponding section token first; the rest is re-lexed with 'xmlReLex'.
xmlLexPESub :: String -> Maybe Posn -> String -> [XMLToken]
xmlLexPESub name pos s
    | "INCLUDE" `prefixes` s = emit (TokSection INCLUDEx) p: k 7
    | "IGNORE"  `prefixes` s = emit (TokSection IGNOREx)  p: k 6
    | otherwise              = blank xmlAny [] p s
  where
    k n = skip n p s xmlReLex
    p   = posInNewCxt name pos

------------------------------------------------------------

-- | 'xmlLexEntitySub' is called by the parser when performing substitution
--   in the body of an entity definition in the DTD.
--
--   The name parameter is provided for type compatibility with 'xmlLexPESub'.
--   When a definition position is supplied it is used directly and the
--   name is ignored; when none is supplied, a fresh position named after
--   the entity is used instead of failing with a pattern-match error.
--
xmlLexEntitySub :: String -> Maybe Posn -> String -> [XMLToken]
xmlLexEntitySub _name (Just p) s = xmlEntityContent "" p p s
xmlLexEntitySub name  Nothing  s = xmlEntityContent "" p p s
  where
    p = posInNewCxt name Nothing

-- | 'xmlLexEntity' is called by the parser when it processes
--   the entity value in an entity definition.
--
--   It is essentially an alternative interface to xmlEntityContent logic,
--   for which the position of the tokens is relative to the supplied
--   position.
--
xmlLexEntity :: Posn -> String -> [XMLToken]
xmlLexEntity p s = xmlEntityContent "" p p s

--  Tokenize entity content, which is similar to string content (see below),
--  except that the entity content may contain parameter entity references.
--
--  Arguments: reversed accumulated text, position of that text, current
--  position, remaining input.
--
--  NOTE: it is assumed that XML character validity checking has already been
--  performed on the supplied string.
--
xmlEntityContent :: String -> Posn -> Posn -> String -> [XMLToken]
xmlEntityContent ""  _   _ []     = []
xmlEntityContent acc pos _ []     = [emit (TokFreeText (reverse acc)) pos]
xmlEntityContent acc pos p (s:ss)
    | s == '&'         = xmlEntityRef acc pos restart p TokAmp ss
    | s == '%'         = xmlEntityRef acc pos restart p TokPercent ss
    | isXmlSpaceChar s = xmlEntityContent (s:acc) pos (white s p) ss
    | otherwise        = xmlEntityContent (s:acc) pos (addcol 1 p) ss
  where
    -- after a reference, start a fresh accumulation at the new position
    restart q = xmlEntityContent "" q q

------------------------------------------------------------

-- | 'xmlLexString' is called by the parser when it processes an attribute
--   or other string value that may contain entity references and free text,
--   but no elements.
xmlLexString :: Posn -> String -> [XMLToken]
xmlLexString p s = xmlStringContent "" p p s

--  Tokenize string content, which means (a) detecting entity references
--  (&...;), and (b) disallowing '<' characters (that test might be omitted,
--  since the XML requirement disallowing '<' in attributes seems to be a nod
--  to certain styles of implementation).  'acc' is accumulated reverse text,
--  and 'pos' is the position at which the 'acc' value appears.
--
--  NOTE: it is assumed that XML character validity checking has already been
--  performed on the supplied string.
--
xmlStringContent :: String -> Posn -> Posn -> String -> [XMLToken]
xmlStringContent ""  _   _ []     = []
xmlStringContent acc pos _ []     = [emit (TokFreeText (reverse acc)) pos]
xmlStringContent acc pos p (s:ss)
    | s == '<'         = lexerror ("unexpected '<' in string") p
    | s == '&'         = xmlEntityRef acc pos restart p TokAmp ss
    | isXmlSpaceChar s = xmlStringContent (s:acc) pos (white s p) ss
    | otherwise        = xmlStringContent (s:acc) pos (addcol 1 p) ss
  where
    -- after a reference, start a fresh accumulation at the new position
    restart q = xmlStringContent "" q q
    {- Assuming this check not needed:
    | isXmlChar s      = xmlStringContent (s:acc) pos (addcol 1 p) ss
    | otherwise        = ("(xmlStringContent) illegal character") p
    -}

------------------------------------------------------------

--  Tokenize an entity reference
--  (common code for xmlEntityContent and xmlStringContent)
--
--  acc     reverse accumulated text preceding entity reference
--  pos     position of acc text
--  k       continuation tokenizer following ';'
--  p       position of first token
--  t       initial token (TokAmp or TokPercent)
--  ss      remaining string to tokenize
--
--  The preceding text is emitted as a free text token (even when empty),
--  followed by the introducing token and then the reference name up to
--  the terminating ';'.
--
xmlEntityRef :: String -> Posn -> (Posn->String->[XMLToken])
                -> Posn -> TokenT -> String
                -> [XMLToken]
xmlEntityRef acc pos k p t ss =
    emit (TokFreeText (reverse acc)) pos :
    emit t p :
    accumulateUntil ";" TokSemi "" p' p' ss k
  where
    p' = addcol 1 p     -- the reference name starts after '&' or '%'

------------------------------------------------------------

-- Main tokenizer

-- The context stack is popped with @drop 1@ rather than @tail@ throughout:
-- closing tokens are also accepted at document top level, where the stack
-- is empty and @tail@ would fail.

-- Lex the target name of a processing instruction, then its body.
-- NOTE(review): the context string looks truncated; reproduced unchanged.
xmlPI :: [Where] -> Posn -> String -> [XMLToken]
xmlPI w p s = xmlName p s "name of processor in " (blank xmlPIEnd w)

-- Lex the body of a processing instruction up to its closing "?>", then
-- leave the processing-instruction context.
xmlPIEnd :: [Where] -> Posn -> String -> [XMLToken]
xmlPIEnd w p s =
    accumulateUntil "?>" TokPIClose "" p p s (blank xmlAny (drop 1 w))

-- Lex the body of a comment up to its closing "-->".
xmlComment :: [Where] -> Posn -> String -> [XMLToken]
xmlComment w p s =
    accumulateUntil "-->" TokCommentClose "" p p s (blank xmlAny w)

-- Note: the order of the clauses in xmlAny is very important.
-- Some matches must precede the NotInTag test, the rest must follow it.
--
-- NOTE(review): several 'InTag' labels below are empty strings, so the
-- "unexpected EOF within ..." message built from them names no construct;
-- the labels look as if they have been lost.  They are reproduced
-- unchanged here -- confirm the intended text upstream.
xmlAny :: [Where] -> Posn -> String -> [XMLToken]
xmlAny (InTag t:_) p [] = lexerror ("unexpected EOF within "++t) p
xmlAny _           _ [] = []
xmlAny w p s@('<':ss)
    | "?"   `prefixes` ss = emit TokPIOpen p:
                            skip 2 p s (xmlPI (InTag "":w))
    | "!--" `prefixes` ss = emit TokCommentOpen p:
                            skip 4 p s (xmlComment w)
    | "!["  `prefixes` ss = emit TokSectionOpen p:
                            skip 3 p s (xmlSection w)
    | "!"   `prefixes` ss = emit TokSpecialOpen p:
                            skip 2 p s (xmlSpecial (InTag "":w))
    | "/"   `prefixes` ss = emit TokEndOpen p:
                            skip 2 p s (xmlTag (InTag "":drop 1 w))
    | otherwise           = emit TokAnyOpen p:
                            skip 1 p s (xmlTag (InTag "<...>":NotInTag:w))
xmlAny w p ('&':ss) = emit TokAmp p :
                      accumulateUntil ";" TokSemi "" p1 p1 ss (xmlAny w)
  where
    -- the reference name starts one column after the '&', as in
    -- xmlEntityRef
    p1 = addcol 1 p
xmlAny w@(NotInTag:_) p s = xmlContent "" w p p s
-- Match "/>" only when it occurs in a tag in the document body:
xmlAny (_:NotInTag:w) p s@('/':'>':_) =
    emit TokEndClose p: skip 2 p s (xmlAny w)
-- The following tokens are recognized only when they appear in a tag
-- i.e. not in an entity reference or free text.
-- (They are also recognized at document top level;
-- i.e. outside both the prolog and the document root element.
-- This might be the body of an external parameter entity.)
xmlAny w p ('>':ss) = emit TokAnyClose p: xmlAny (drop 1 w) (addcol 1 p) ss
xmlAny w p ('[':ss) = emit TokSqOpen p:
                      blank xmlAny (InTag "[...]":w) (addcol 1 p) ss
xmlAny w p (']':ss)
    | "]>" `prefixes` ss = emit TokSectionClose p:
                           skip 3 p (']':ss) (xmlAny (drop 1 w))
    | otherwise          = emit TokSqClose p:
                           blank xmlAny (drop 1 w) (addcol 1 p) ss
xmlAny w p ('(':ss) = emit TokBraOpen p:
                      blank xmlAny (InTag "(...)":w) (addcol 1 p) ss
xmlAny w p (')':ss) = emit TokBraClose p:
                      blank xmlAny (drop 1 w) (addcol 1 p) ss
xmlAny w p ('=':ss) = emit TokEqual p:   blank xmlAny w (addcol 1 p) ss
xmlAny w p ('*':ss) = emit TokStar p:    blank xmlAny w (addcol 1 p) ss
xmlAny w p ('+':ss) = emit TokPlus p:    blank xmlAny w (addcol 1 p) ss
xmlAny w p ('?':ss) = emit TokQuery p:   blank xmlAny w (addcol 1 p) ss
xmlAny w p ('|':ss) = emit TokPipe p:    blank xmlAny w (addcol 1 p) ss
xmlAny w p ('%':ss) = emit TokPercent p: blank xmlAny w (addcol 1 p) ss
xmlAny w p (';':ss) = emit TokSemi p:    blank xmlAny w (addcol 1 p) ss
xmlAny w p (',':ss) = emit TokComma p:   blank xmlAny w (addcol 1 p) ss
xmlAny w p ('#':ss) = emit TokHash p:    blank xmlAny w (addcol 1 p) ss
xmlAny w p ('"':ss) = emit TokQuote p:
                      accumulateUntil "\"" TokQuote "" p1 p1 ss (xmlAny w)
  where
    p1 = addcol 1 p
xmlAny w p ('\'':ss) = emit TokQuote p:
                       accumulateUntil "'" TokQuote "" p1 p1 ss (xmlAny w)
  where
    p1 = addcol 1 p
xmlAny w p s@(s1:_)
    | isXmlNameChar s1  = xmlName p s "some kind of name" (blank xmlAny w)
    | isXmlSpaceChar s1 = blank xmlAny w p s
    | isXmlChar s1      = lexerror ("unrecognised token: "++take 4 s) p
    | otherwise         = lexerror ("(xmlAny) illegal character") p

-- Lex the name that follows a tag opener.
xmlTag :: [Where] -> Posn -> String -> [XMLToken]
xmlTag w p s = xmlName p s "tagname for element in < >" (blank xmlAny w)

-- Lex the keyword that follows a section opener, after skipping blanks.
-- A CDATA section's content is accumulated up to "]]>"; for the other
-- keywords (or a parameter-entity '%') lexing resumes with xmlAny.
xmlSection :: [Where] -> Posn -> String -> [XMLToken]
xmlSection = blank xmlSection0
  where
    xmlSection0 w p s
      | "CDATA["  `prefixes` s = emit (TokSection CDATAx) p:   accum w p s 6
      | "INCLUDE" `prefixes` s = emit (TokSection INCLUDEx) p: k w p s 7
      | "IGNORE"  `prefixes` s = emit (TokSection IGNOREx) p:  k w p s 6
      | "%"       `prefixes` s = emit TokPercent p:            k w p s 1
      | otherwise = lexerror ("expected CDATA, IGNORE, or INCLUDE") p
    accum w p s n =
      let p0 = addcol n p in
      accumulateUntil "]]>" TokSectionClose "" p0 p0 (drop n s)
                      (blank xmlAny w)
    k w p s n = skip n p s (xmlAny w)

-- Lex the keyword that follows a declaration opener.
xmlSpecial :: [Where] -> Posn -> String -> [XMLToken]
xmlSpecial w p s
    | "DOCTYPE"  `prefixes` s = emit (TokSpecial DOCTYPEx) p:  k 7
    | "ELEMENT"  `prefixes` s = emit (TokSpecial ELEMENTx) p:  k 7
    | "ATTLIST"  `prefixes` s = emit (TokSpecial ATTLISTx) p:  k 7
    | "ENTITY"   `prefixes` s = emit (TokSpecial ENTITYx) p:   k 6
    | "NOTATION" `prefixes` s = emit (TokSpecial NOTATIONx) p: k 8
    | otherwise = lexerror
                    "expected DOCTYPE, ELEMENT, ENTITY, ATTLIST, or NOTATION"
                    p
  where
    k n = skip n p s (blank xmlAny w)

--  NOTE: this matches XML 'Name' or 'Nmtoken' productions
--  To confirm that a name is matched, the first character
--  of the token returned must also satisfy isXmlNameStartChar.
--
--  Arguments: current position, input, a description of what was expected
--  (used in the error message), and the continuation to run after the
--  name.  Running out of input before the name starts is reported as a
--  lexical error; running out of input inside a name ends the name.
--
xmlName :: Posn -> String -> String -> (Posn -> String -> [XMLToken])
           -> [XMLToken]
xmlName p [] cxt _ =
    lexerror ("expected a "++cxt++", but got end of input") p
xmlName p (s:ss) cxt k
    | isXmlNameChar s = gatherName [s] p (addcol 1 p) ss
    | otherwise       = lexerror ("expected a "++cxt++", but got char "
                                  ++show s) p
  where
    -- acc is the reversed name so far; pos is where the name started
    gatherName acc pos q [] =
        emit (TokName (reverse acc)) pos: k q []
        -- lexerror ("unexpected EOF in name at "++show pos) q
    gatherName acc pos q (c:cs)
        | isXmlNameChar c = gatherName (c:acc) pos (addcol 1 q) cs
        | otherwise       = emit (TokName (reverse acc)) pos: k q (c:cs)

--  Accumulate free text between tags, up to the next '<' or '&' or end of
--  input.  Text consisting only of whitespace is dropped rather than
--  emitted.
--
--  Arguments: reversed accumulated text, context stack, position of the
--  accumulated text, current position, remaining input.
--
xmlContent :: String -> [Where] -> Posn -> Posn -> String -> [XMLToken]
xmlContent acc _ pos _ [] =
    -- alternative:
    -- [emit (TokFreeText (reverse acc)) pos]
    -- Original:
    if all isXmlSpaceChar acc
      then []
      else [emit (TokFreeText (reverse acc)) pos]
      -- else lexerror "unexpected EOF between tags" p
xmlContent acc w pos p (s:ss)
    | elem s "<&"      =
        -- alternative:
        -- emit (TokFreeText (reverse acc)) pos
        -- : xmlAny w p (s:ss)
        -- Original:
        if all isXmlSpaceChar acc
          then xmlAny w p (s:ss)
          else emit (TokFreeText (reverse acc)) pos : xmlAny w p (s:ss)
    | isXmlSpaceChar s = xmlContent (s:acc) w pos (white s p) ss
    | isXmlChar s      = xmlContent (s:acc) w pos (addcol 1 p) ss
    | otherwise        = lexerror ("(xmlContent) illegal character") p

--------------------------------------------------------------------------------
--
$Log: Lex.hs,v $ -- Revision 1.20 2004/06/24 17:48:36 graham -- Include document filename/URI in parsed document prolog, -- for subsequent use as a base URI. -- -- Revision 1.19 2004/06/24 15:06:40 graham -- Prune some dead code from Parse and Lex modules. -- -- Revision 1.18 2004/06/24 14:06:57 graham -- Rearranged various lexing functions to be slightly less obscure in their usage. -- Factored out common code from entity value and attribute value parsing as -- a new function 'parseString'. -- -- Revision 1.17 2004/06/22 15:44:34 graham -- Updated parser to combine adjacent and eliminate blank free text entries -- in element content and attribute values. -- -- Revision 1.16 2004/06/17 15:11:35 graham -- Pass test cases for general entity substitution in attribute values. -- -- Revision 1.15 2004/06/16 18:17:15 graham -- Parameter entity and lexical phases re-worked to better support -- general entity substitution. -- Passes all but two tricky GE substitution regression tests. -- -- Revision 1.14 2004/06/15 20:01:39 graham -- First steps of internal general entity substitution filter are working. -- Some of the parsing has been re-worked to support this. -- All regression tests still pass. -- -- Revision 1.13 2004/06/08 11:35:59 graham -- External paremeter entity substitution test passes. -- -- Revision 1.12 2004/06/04 21:59:13 graham -- Wortk-in-progress: creating intermediate filter to handle parameter -- entity replacement. Separated common features from parse module. -- Created new module based on simplified use of parsing utilities -- to dtect and substitute PEs. The result is a modifed token sequence -- passed to the main XML parser. -- -- Revision 1.11 2004/06/03 14:55:37 graham -- Re-arrange parameter entity handling to distinguish internal subset usage -- in the syntax, and to leave parameter entities un-substituted in the parse -- tree. Test case testXmlFormat21 changes as a result. 
-- -- Revision 1.10 2004/06/02 19:34:18 graham -- Various small XML conformance improvements. -- -- Revision 1.9 2004/06/02 13:49:18 graham -- Fixed Lex.hs to reject illegal XML characters. This also fixes some -- run-time failures occurring when documents containing -- formfeed -- characters are presented. -- -- Revision 1.8 2004/06/02 11:00:43 graham -- Fixed up some comments and code layout. -- -- Revision 1.7 2004/06/02 08:39:05 graham -- Re-worked handling of attribute values so that entitry references -- can be recognized. -- -- Revision 1.6 2004/05/28 15:28:16 graham -- Improved conformance with XML, per conformance tests. -- All but one of the xmltext/valid/sa tests now pass. -- There are still several xmltext/not-wf/sa tests that are not detected as -- incorrect XML, notably problems with attribute value handling. -- -- Revision 1.5 2004/05/28 10:47:48 graham -- Changed test harness to report error diagnostics on failure (foir debugging). -- Fixed lexing problem for names beginning with ':' and '_'. -- Two additional test cases (012,013) passed. -- -- Revision 1.4 2004/05/27 17:01:29 graham -- Updated HTML parser and tools to compile with revised XML parser, -- data types and combinators. The XML regression test still passes, but -- the other components are not currently tested. -- -- Revision 1.3 2004/05/26 14:06:19 graham -- Added parser function for case-insensitive word matching. -- Added document name/URI to parser state. -- All regression tests passed. -- -- Revision 1.2 2004/05/25 21:29:48 graham -- Refactored parser diagnostics handling. -- Added new type classes to isolate token details. -- All previous conformance tests still passed. -- -- Revision 1.1 2004/05/24 11:54:03 graham -- Add HaXml 1.12 to local CVS repository, prior to refactoring. -- Added CVS tags to source files to help track changes. --