-- $Id: SubstitutePE.hs,v 1.15 2004/06/24 16:48:23 graham Exp $ -------------------------------------------------------------------------------- module Text.XML.HaXml.SubstitutePE ( subIntParamEntities , subExtParamEntities ) where import Text.XML.HaXml.EntityHelpers ( XMLTokens, mkXMLTokens, noXMLTokens , doParse, xsat, xnot, xtok , mkXmlToken, mkXmlError , getExtEntityText, stripXMLdecl , passXmlDecl ) import Text.XML.HaXml.Types ( SymTab, emptyST, addST, lookupST , EntityValue(..), EV(..) , ExternalID(..), PubidLiteral(..), SystemLiteral(..) , Reference(..), EntityRef, CharRef ) import Text.XML.HaXml.Lex ( xmlLexPESub , xmlLexEntitySub , xmlLexEntity , XMLToken , Posn(..) , TokenT(..) , Special(..) ) import Text.XML.HaXml.XParserUtils ( fst3, snd3, thd3 , XParserGen , maybe, either , posn , xmlitem, xmlpeek, xmlsat, xmltok, notxmltok , name, entName, string, freetext , word, wordci, nmtoken , readWith , externalid ) import Text.XML.HaXml.Unicode ( isXmlCharCode , isXmlPubidChar ) import Text.ParserCombinators.HuttonMeijerWallace ( Parser(..) , item, tryitem, eof , (+++), sat, tok, nottok, many, many1 , sepby, sepby1, chainl, chainl1, chainr, chainr1, ops, bracket , fatalerror, parseerror, elserror , stupd, stquery, stget ) import Numeric (readDec, readHex) import Prelude hiding (either,maybe,sequence) import qualified Prelude (either,maybe) ------------------------------------------------------------ -- Local parser type. -- -- Minimal parsing is used, sufficient to recognize the parameter -- entity definitions and references. Syntax error detection is -- left to the "real" parser. -- type XParser a = XParserGen XParserSt a -- Local parser state and parameter entities symbol table. -- -- Note that external entity resolution is handled via the symbol -- table, using the resolver function in the parser state value. -- See function 'pedef' below. -- (Hmmm... can this work in the IO monad, without using unsafePerformIO?) -- Thus, the symbol table effectively becomes a "memoization" of -- external parameter entities. -- (This looks like a reason that unsafePerformIO cannot be avoided.) -- data XParserSt = PS { docName :: String -- Document name or URI , peResolve :: String -> Posn -> ExternalID -> PEValue , peDefs :: PSymTab } type PEValue = (Posn,Either String String) type PSymTab = SymTab PEValue emptyState = PS "" peNullResolver emptyST peNullResolver :: String -> Posn -> ExternalID -> PEValue peNullResolver bas pos eid = (pos,Left "Reference to external parameter entity") peExtResolver :: String -> Posn -> ExternalID -> PEValue peExtResolver bas pos eid = (pos,getExtEntityText bas eid) addPE :: String -> PEValue -> XParserSt -> XParserSt addPE n v st = st { peDefs = addST n v (peDefs st) } ------------------------------------------------------------ -- |Function to substitute internal parameter entities in a document. -- -- (Note that the main document body tokens (dts) are passed through -- without processing.) -- subIntParamEntities :: String -> [XMLToken] -> [XMLToken] subIntParamEntities name ts = (xts . pts) dts where (xts,xtr) = passXmlDecl ts (pts,_,dts) = processDtd name initState xtr initState = PS name peNullResolver emptyST -- |Function to substitute internal and external parameter entities -- in a document. -- subExtParamEntities :: String -> [XMLToken] -> [XMLToken] subExtParamEntities name ts = (xts . pts) dts where (xts,xtr) = passXmlDecl ts (pts,_,dts) = processDtd name initState xtr initState = PS name peExtResolver emptyST ------------------------------------------------------------ -- Generic function to deal with Parameter Entity substitution -- -- namefn function to extract the entity name from a token: -- returns Just name or Nothing. -- petoks function to return a list of tokens corresponding to the -- parameter entity content -- -- These options vary depending upon whether the parameter entity -- reference is encountered at the DTD top level, or in an entity -- value in an entity declaration. (Cf. XML spec section 4.4.) -- substitutePE_gen :: (TokenT -> Maybe String) -> (String -> Maybe Posn -> String -> [XMLToken]) -> SymTab PEValue -> XMLTokens -> XMLTokens substitutePE_gen namefn petoks symtab ts = sub1 [] (ts []) where sub1 acc (t1@(_,Right TokPercent):ts) = sub2 acc t1 ts sub1 acc (t1:ts) = sub1 (t1:acc) ts sub1 acc [] = mkXMLTokens $ reverse acc sub2 acc t1 ts@((_,Right tok):(_,Right TokSemi):tr) = sub3 acc t1 (namefn tok) tr sub2 acc t1 ts = sube acc t1 ts "% not used for Parameter Entity reference" sub3 acc t1 Nothing ts = sube acc t1 ts "% no name for Parameter Entity reference" -- Parameter entity substitution -- new content is processed recursively (invocation of sub1) -- cf. XML spec section 4.5 sub3 acc t1 (Just nam) tr = case lookupST nam symtab of Just epv -> (mkXMLTokens $ reverse acc) . sub1 [] (lexpe nam epv $ tr) otherwise -> sube acc t1 tr ("Reference to undefined Parameter Entity: "++nam) sube acc t1@(p,_) ts err = sub1 (t1:(p,Left err):acc) ts lexpe nam (p,Right s) = mkXMLTokens (petoks nam (Just p) s) lexpe nam (p,Left e) = mkXMLTokens [(p,Left $ "%"++nam++"; "++e++";")] -- Function to deal with PE substitution in top-level DTD context -- substitutePE_dtd :: SymTab PEValue -> XMLTokens -> XMLTokens substitutePE_dtd = substitutePE_gen namefn xmlLexPESub where namefn (TokName nam) = Just nam namefn _ = Nothing ------------------------------------------------------------ -- Function to build PE symbol table from DTD, and substitute -- internal PEs that appear in the DTD content, returning the -- updated symbol table, subtsituted DTD token sequence and -- the input token sequence following the DTD. -- processDtd :: String -> XParserSt -> [XMLToken] -> (XMLTokens,PSymTab,[XMLToken]) processDtd name st ts = (dtdtoks,peDefs st',moretoks) where (dtdtoks,st',moretoks) = doParse dtd noXMLTokens st ts dtd :: XParser XMLTokens dtd = do { t0s <- misc0 ; t1 <- xtok TokSpecialOpen -- ' ; t4s <- markupdecls +++ (return noXMLTokens) -- [...] ; t5 <- tryitem ; xmltok TokAnyClose -- '>' ; return $ t0s . mkXMLTokens (t1:t2:t3s) . t4s . mkXMLTokens [t5] } markupdecls :: XParser XMLTokens markupdecls = do { t1 <- xtok TokSqOpen ; t2s <- many markupdecl ; t3 <- xtok TokSqClose ; return $ mkXMLTokens [t1] . foldr (.) id t2s . mkXMLTokens [t3] } markupdecl :: XParser XMLTokens markupdecl = peref +++ pedecl +++ gedecl +++ otherdecl +++ misc1 -- Parameter entity reference appearing as markup declaration separator: peref :: XParser XMLTokens peref = do { t1 <- xtok TokPercent ; t2 <- item ; t3 <- xtok TokSemi ; stb <- stquery peDefs ; let sub = substitutePE_dtd stb $ mkXMLTokens [t1,t2,t3] ; return sub } -- As PEs are added to the symbol table, they are removed from -- the token sequence. (Is this OK?) pedecl ::XParser XMLTokens pedecl = do { t1 <- xtok TokSpecialOpen ; t2 <- xtok (TokSpecial ENTITYx) ; t3 <- xtok TokPercent ; t4 <- tryitem ; n <- name ; e <- pedef ; t6 <- xtok TokAnyClose ; stupd (addPE n e) ; return id } pedef :: XParser PEValue pedef = ( do pos <- posn xmltok TokQuote xts <- entityvalue xmltok TokQuote return (pos,Right xts) ) +++ ( do pos <- posn eid <- externalid bas <- stquery docName res <- stquery peResolve return (res bas pos eid) ) -- GE declarations need special processing to replace parameter entity -- references in the entity value. This requires some re-processing -- of the entity value body. -- -- This parser matches only GE declarations with internally defined -- replacement text. External GE declarations are not recognized, -- and are picked up the the 'otherdecl' production (below). -- gedecl ::XParser XMLTokens gedecl = do { t1 <- xtok TokSpecialOpen ; t2 <- xtok (TokSpecial ENTITYx) ; t3 <- tryitem ; n <- name ; t4 <- xtok TokQuote ; p <- posn ; e <- entityvalue ; let t5 = mkXmlToken p (TokFreeText e) ; t6 <- xtok TokQuote ; t7 <- xtok TokAnyClose ; return $ mkXMLTokens [t1,t2,t3,t4,t5,t6,t7] } -- Entity value content is presented initially as a freetext value. -- Character and parameter entity substitution is required, with -- the result returned as a string value. -- -- Parameter entity substitution is applied before character -- entity substitution. -- Cf. XML spec section, appendix D, 2nd example. -- entityvalue :: XParser String entityvalue = do evpos <- posn evstr <- freetext stb <- stquery peDefs let lexev = mkXMLTokens (xmlLexEntity evpos evstr) return $ flattenEvToks $ substituteCE $ substitutePE_ent stb $ lexev -- Reconstruct a string from a sequence of tokens that can appear in an -- entity value after character references have been substituted. -- Tokens handled are those that can be generated by xmlLexEntity. flattenEvToks :: XMLTokens -> String flattenEvToks = concatMap flatten . ($[]) where flatten (_,Right (TokPercent)) = "%" flatten (_,Right (TokAmp)) = "&" flatten (_,Right (TokSemi)) = ";" flatten (_,Right (TokFreeText s)) = s flatten (_,Right tok) = "%unexpected! "++show tok++";" flatten (_,Left err) = "%error! err;" -- Function to deal with PE substitution in entity values -- substitutePE_ent :: SymTab PEValue -> XMLTokens -> XMLTokens substitutePE_ent = substitutePE_gen namefn xmlLexEntitySub where namefn (TokFreeText nam) = Just nam namefn _ = Nothing -- Function to deal with CE substitution in entity values -- substituteCE :: XMLTokens -> XMLTokens substituteCE ts = sub1 [] (ts []) where sub1 acc (t1@(_,Right TokAmp):ts) = sub2 acc t1 ts sub1 acc (t1:ts) = sub1 (t1:acc) ts sub1 acc [] = mkXMLTokens $ reverse acc sub2 acc t1@(pos,_) (t2@(_,Right (TokFreeText ent)):t3@(_,Right TokSemi):tr) = case ent of ('#':'x':i) -> readChar readHex i ('#':i) -> readChar readDec i otherwise -> emitToks [TokAmp,TokFreeText ent,TokSemi] where readChar f i = case readWith f i of (Just c) | isXmlCharCode c -> emitChar c otherwise -> emitCharErr ent emitChar c = emitAll [mkXmlToken pos (TokFreeText [(toEnum c)])] emitCharErr ent = emitAll [mkXmlError pos ("Invalid char ref: "++ent)] emitToks ts = emitAll (map (mkXmlToken pos) ts) emitAll ts = (mkXMLTokens $ reverse acc) . mkXMLTokens ts . sub1 [] tr sub2 acc t1 ts = sub1 (t1:acc) ts -- Any declaration in a DTD starts with '', -- and has no internal '>' tokens otherdecl :: XParser XMLTokens otherdecl = do { t1 <- xtok TokSpecialOpen ; t2s <- many (xnot (TokAnyClose==)) ; t3 <- xtok TokAnyClose ; stb <- stquery peDefs ; return $ substitutePE_dtd stb $ mkXMLTokens ((t1:t2s)++[t3]) } -- One or more comment/PI misc1 = do { t1 <- comment +++ xmlpi ; t2s <- misc0 ; return $ t1 . t2s } -- Zero or more comment/PI misc0 :: XParser XMLTokens misc0 = do { t1s <- many ( comment +++ xmlpi ) ; return $ foldr (.) id t1s } comment :: XParser XMLTokens comment = do { t1 <- xtok TokCommentOpen ; t2 <- item ; t3 <- xtok TokCommentClose ; return $ mkXMLTokens [t1,t2,t3] } xmlpi :: XParser XMLTokens xmlpi = do { t1 <- xtok TokPIOpen ; t2 <- item ; t3 <- item ; t4 <- xtok TokPIClose ; return $ mkXMLTokens [t1,t2,t3,t4] } -------------------------------------------------------------------------------- -- $Log: SubstitutePE.hs,v $ -- Revision 1.15 2004/06/24 16:48:23 graham -- Merged common code from Parse.cpphs and SubstitutePE.hs, -- moved to module XParserUtils.hs. -- -- Revision 1.14 2004/06/24 14:43:31 graham -- References to parameter entity handling removed from the parser, -- now -- this is handled by a separate module. -- -- Revision 1.13 2004/06/24 14:06:57 graham -- Rearranged various lexing functions to be slightly less obscure in their usage. -- Factored out common code from entity value and attribute value parsing as -- a new function 'parseString'. -- -- Revision 1.12 2004/06/17 17:20:37 graham -- Substitution of external general entities tested. -- -- Revision 1.11 2004/06/17 17:08:38 graham -- Refactored SubstitutePE.hs into SubstitutePE.hs and EntityHelpers.hs, -- so common functions can be shared between PE and GE substitution code. -- -- Revision 1.10 2004/06/17 15:11:35 graham -- Pass test cases for general entity substitution in attribute values. -- -- Revision 1.9 2004/06/17 11:40:43 graham -- Internal general entity substitution now passes all test cases. -- -- Revision 1.8 2004/06/16 18:17:15 graham -- Parameter entity and lexical phases re-worked to better support -- general entity substitution. -- Passes all but two tricky GE substitution regression tests. -- -- Revision 1.7 2004/06/15 20:01:39 graham -- First steps of internal general entity substitution filter are working. -- Some of the parsing has been re-worked to support this. -- All regression tests still pass. -- -- Revision 1.6 2004/06/09 11:01:14 graham -- Strip declaration when subsitutinf external entity. -- -- Revision 1.5 2004/06/08 11:35:59 graham -- External paremeter entity substitution test passes. -- -- Revision 1.4 2004/06/08 10:42:50 graham -- Parameter entity definition body submitted to full reLex when defined. -- -- Revision 1.3 2004/06/07 16:42:28 graham -- Substitution logic now compiles, but not yet built into PE handling code. -- Two non-well-formed test cases now fail. -- Not yet decided if they're important enough to fix. -- -- Revision 1.2 2004/06/07 12:29:00 graham -- SubstitutePE compiles. Substitution logic still missing. -- -- Revision 1.1 2004/06/04 21:59:13 graham -- Wortk-in-progress: creating intermediate filter to handle parameter -- entity replacement. Separated common features from parse module. -- Created new module based on simplified use of parsing utilities -- to dtect and substitute PEs. The result is a modifed token sequence -- passed to the main XML parser. --