-- ------------------------------------------------------------ -- -- input functions -- implemented as filer -- -- Version : $Id: XmlInput.hs,v 1.37 2003/09/25 16:08:36 hxml Exp $ module XmlInput ( getXmlContents , getXmlEntityContents , getUrlContents , getContentLength , guessDocEncoding , runInLocalURIContext , getBaseURI , setBaseURI , getAbsolutURI , isStandaloneDocument ) where import XmlTree import XmlState import XmlParser ( parseXmlDocEncodingSpec , parseXmlEntityEncodingSpec ) import Unicode ( getEncodingFct ) import XmlOutput ( traceTree , traceSource , traceMsg ) import Util ( stringTrim , stringToUpper ) import Parsec ( Parser , parse , anyChar , char , digit , getInput , many , many1 , manyTill , noneOf , oneOf , spaces , string , (<|>) ) import qualified Parsec ( try ) import POpen ( popen ) import Char ( toLower ) -- ------------------------------------------------------------ -- -- http modules import Network.HTTP -- http modules -- [GK] import Network.Browser -- [GK] -- ------------------------------------------------------------ -- -- file io and network import IO import Directory ( doesFileExist , getPermissions , getCurrentDirectory , readable ) import Network.URI ( URI , parseURI , relativeTo , scheme , path , escapeChar, isUnescapedInURI ) import Network.Socket ( withSocketsDo ) -- ------------------------------------------------------------ -- -- utilities import Maybe -- ------------------------------------------------------------ -- -- filter for reading the content -- input: a root node with arguments in tag attribute list -- and without any children. -- attribute a_source ("source") for input file or uri -- -- attribute a_encoding ("encoding") for encoding scheme -- -- attribute a_proxy ("proxy") for proxy tob be used in http access getXmlContents' :: XmlFilter -> XmlStateFilter a getXmlContents' parseEncodingSpec = isRoot `guardsM` getContent' where getContent' t' = ( liftM (setStatus c_ok "in getXmlContents") .>> getUrlContents .>> liftF parseEncodingSpec .>> guessDocEncoding .>> traceMsg 1 ("getXmlContents: content read and decoded from " ++ show input') .>> traceTree .>> traceSource ) t' where input' = valueOf a_source t' -- | -- filter for reading the content of a XML document -- -- input is a root node with the source as an attibute -- the text is read, the encoding scheme is parsed and selected -- and the input is translated into the internal UTF-8 string representation getXmlContents :: XmlStateFilter a getXmlContents = getXmlContents' parseXmlDocEncodingSpec .>> setBaseURIFilter .>> setStandAloneFilter -- | -- filter for reading the content of an external XML entity -- -- -- see also : 'getXmlContents' getXmlEntityContents :: XmlStateFilter a getXmlEntityContents = getXmlContents' parseXmlEntityEncodingSpec .>> setBaseURIFilter -- ------------------------------------------------------------ setBaseURIFilter :: XmlStateFilter a setBaseURIFilter = isRoot `guardsM` performAction (\ t -> setBaseURI (valueOf transferURI t)) -- | -- filter command for saving and restoring -- the base URI -- -- * 1.parameter f : the filter that possible changes the base URI -- -- -- - returns : a filter with the same effect as f, that restores the base URI after application of f runInLocalURIContext :: XmlStateFilter a -> XmlStateFilter a runInLocalURIContext f t = do oldContext <- getBaseURI trace 2 ("runInLocalURIContext: save base URI " ++ show oldContext) res <- f t setBaseURI oldContext trace 2 ("runInLocalURIContext: restore base URI " ++ show oldContext) return res -- ------------------------------------------------------------ -- | -- guessEncoding uses encoding attribute and content -- to determine the encoding scheme. -- -- it's assumed that an encoding spec has been tried to parse before guessing the encoding. -- -- UTF-8 is the default encoding -- -- other supported encodings are ISO-8859-1 (also known as ISO-Latin-1), -- US-ASCII, UTF-16 or ISO-10646-UCS-2, UTF-16BE, UTF-16LE guessDocEncoding :: XmlStateFilter a guessDocEncoding = isRoot `guardsM` addDocEncoding where addDocEncoding n' = do trace 2 ( "guessDocEncoding: encoding is " ++ show guess) ( encFilter (getEncodingFct guess) .>> liftM (addAttr transferEncoding guess) ) n' where guess :: String guess = head . filter (not . null) $ [ (guessEncoding . showXText . getChildren) n' , valueOf transferEncoding n' , valueOf a_encoding n' , "UTF-8" ] encFilter (Just fct) = liftM (processChildren (modifyText (normalizeNL . fct))) encFilter Nothing = addFatal ("encoding scheme not supported: " ++ show guess) -- ------------------------------------------------------------ -- -- White Space (2.3) -- end of line handling (2.11) -- \#x0D and \#x0D\#x0A are mapped to \#x0A normalizeNL :: String -> String normalizeNL ('\r' : '\n' : rest) = '\n' : normalizeNL rest normalizeNL ('\r' : rest) = '\n' : normalizeNL rest normalizeNL (c : rest) = c : normalizeNL rest normalizeNL [] = [] -- ------------------------------------------------------------ {- original: setDefaultURI :: XState state () setDefaultURI = do wd <- io getCurrentDirectory setSysParam transferDefaultURI ("file://localhost" ++ wd ++ "/") -} -- Revised version to allow Windows directory strings. [[[GK]]] -- -- If the current directory starts with 'd:', it is assumed to be a Windows -- directory, and all '\' characters are mapped to '/'. -- -- In any case, any non-URI or reserved character is escaped. setDefaultURI :: XState state () setDefaultURI = do wd <- io getCurrentDirectory let wd1 = case wd of d:':':_ | driveLetter d -> '/':concatMap win32ToUriChar wd otherwise -> concatMap escapeNonUriChar wd setSysParam transferDefaultURI ("file://localhost" ++ wd1 ++ "/") -- [[[I'd prefer to leave 'localhost' as null, but this raises -- awkward questions about whether it's OK to remove a -- null authority from a URI]]] where win32ToUriChar '\\' = "/" win32ToUriChar c = escapeNonUriChar c escapeNonUriChar c = escapeChar isUnescapedInURI c -- from Network.URI driveLetter d = d `elem` ['A'..'Z'] -- to test: -- run () $ do { uri <- getDefaultURI ; io (putStrLn uri) } {- Excerpt from: http://www.ietf.org/internet-drafts/draft-hoffman-rfc1738bis-01.txt 2.7 FILES The file URL scheme is used to designate files accessible on a particular host computer. This scheme, unlike most other URL schemes, does not designate a resource that is universally accessible over the Internet. A file URL takes the form: file:/// where is the fully qualified domain name of the system on which the is accessible, and is a hierarchical directory path of the form //.../. As a special case, can be the string "localhost" or the empty string; this is interpreted as "the machine from which the URL is being interpreted". However, this part of the syntax has been ignored on many systems. That is, for some systems, the following are considered equal, while on others they are not: file://localhost/path/to/file.txt file:///path/to/file.txt Some systems allow URLs to point to directories. In this case, there is usually (but not always) a terminating "/" character, such as in: file://usr/local/bin/ On systems running some versions of Microsoft Windows, the local drive specification is preceded by a "/" character. Thus, for a file called "example.ini" in the "windows" directory on the "c:" drive, the URL would be: file:///c:/windows/example.ini For Windows shares, there is an additional "/" prepended to the name. Thus, the file "example.doc" on the shared directory "department" would have the URL: file:////department/example.doc The file URL scheme is unusual in that it does not specify an Internet protocol or access method for such files; as such, its utility in network protocols between hosts is limited. -} getDefaultURI :: XState state String getDefaultURI = do uri <- getSysParam transferDefaultURI if null uri then do setDefaultURI getDefaultURI else return uri -- | -- set the base URI, all other URIs are handled relative to this base URI -- -- the default base URI is @file:\/\/localhost\/<current-working-dir>\/@ -- -- see also : 'getBaseURI' setBaseURI :: String -> XState state () setBaseURI str = do trace 2 ("setBaseURI: new base URI: " ++ show str) setSysParam transferURI str -- | -- read the current base URI -- -- see also : 'setBaseURI' getBaseURI :: XState state String getBaseURI = do uri <- getSysParam transferURI if null uri then do res <- getDefaultURI setBaseURI res getBaseURI else return uri -- | -- transform an URI into an absolut URI using the current base URI -- -- -- * 1.parameter uri : the URI as string -- -- -- - returns : the absolut URI as string or \"\" in case of an error getAbsolutURI :: String -> XState state String getAbsolutURI uri = do baseUri <- getBaseURI return $ expandURI uri baseUri expandURI :: String -> String -> String expandURI uri base = fromMaybe "" $ expand where expand = do base' <- parseURI base uri' <- parseURI uri abs' <- relativeTo uri' base' return $ show abs' -- ------------------------------------------------------------ setStandAloneFilter :: XmlStateFilter a setStandAloneFilter = performAction setStandAlone where setStandAlone t = do if null standalone then return () else do trace 2 ("setStandAloneFilter: standalone=" ++ show standaloneVal) setSysParam a_standalone standaloneVal where standalone = getValue a_standalone t standaloneVal = showXText standalone -- | -- predicate for testing the standalone document attribute isStandaloneDocument :: XState state Bool isStandaloneDocument = do val <- getSysParam a_standalone return (val == "yes") -- ------------------------------------------------------------ -- | -- the hard io operations -- -- for reading a file or accessing a document via http -- input must be a root node with a @source@ attribute specifying the URI getUrlContents :: XmlStateFilter a getUrlContents = isRoot `guardsM` getCont where getCont n' = do trace 1 ("getUrlContent: reading " ++ show src) uri <- getAbsolutURI src if null uri then urlErr ( "illegal URI for input: " ++ show src ) else let uri' = fromJust $ parseURI uri proto = scheme uri' handler = lookup proto protocolHandler handler' = fromMaybe getUnsupported handler in ( liftM (addAttr transferProtocol proto .> addAttr transferURI uri ) .>> handler' uri' ) $ n' where src = valueOf a_source n' urlErr msg = addFatal msg n' -- ------------------------------------------------------------ -- | -- compute the length of the data for a document read previously -- by a call of 'getUrlContents. The result is stored as an attribute -- value in the document root node. the attribute name is 'a_contentLength' getContentLength :: XmlFilter getContentLength = isRoot `guards` addAttrl contentLengthAttr where http_contentLength = httpPrefix ++ "Content-Length" contentLengthAttr :: XmlFilter contentLengthAttr t = choice [ hasAttr a_contentLength :-> none , hasAttr http_contentLength :-> mkXAttr a_contentLength (getValue http_contentLength) , this :-> mkXAttr a_contentLength getLength ] t getLength :: XmlFilter getLength t = xtext (show . length . xmlTreesToString . getChildren $ t) -- ------------------------------------------------------------ -- -- the table of potocol handlers -- looked up in getUrlContents protocolHandler :: [(String, URI -> XmlStateFilter a)] protocolHandler = [ ("file", getFileContents) , ("http", getHttpContents) ] -- -- the fall back protocol handler getUnsupported :: URI -> XmlStateFilter a getUnsupported uri = addFatal ( "unsupported protocol " ++ show (scheme uri) ++ " in URI: " ++ show uri ) -- ------------------------------------------------------------ -- -- attribute names for transfer protocol attributes transferPrefix , transferProtocol , transferMimeType , transferEncoding , transferURI , transferDefaultURI , transferStatus , transferMessage , transferVersion :: String transferPrefix = "transfer-" transferProtocol = transferPrefix ++ "Protocol" transferVersion = transferPrefix ++ "Version" transferMimeType = transferPrefix ++ "MimeType" transferEncoding = transferPrefix ++ "Encoding" transferDefaultURI = transferPrefix ++ "DefaultURI" transferStatus = transferPrefix ++ "Status" transferMessage = transferPrefix ++ "Message" transferURI = transferPrefix ++ "URI" httpPrefix :: String httpPrefix = "http-" -- ------------------------------------------------------------ -- -- the file protocol handler getFileContents :: URI -> XmlStateFilter a getFileContents uri n = do trace 2 ("getFileContent: reading file " ++ show source) exists <- io $ doesFileExist source if exists then do perm <- io $ getPermissions source if readable perm then do h <- io $ try ( openFile source ReadMode ) case h of Left e -> readErr ( "system error when reading file " ++ show source ++ ": " ++ ioeGetErrorString e ) Right h' -> do c <- io $ hGetContents h' return ( ( addAttrInt transferStatus 200 .> addAttr transferMessage "OK" .> replaceChildren (xtext c) ) n ) else readErr ("file " ++ show source ++ " not readable") else readErr ("file " ++ show source ++ " not found") where -- [[[GK]]] strip off leading '/' from Windows drive name source = fileuripath (path uri) fileuripath ('/':file@(d:':':more)) | driveLetter d = file fileuripath file = file readErr msg = addFatal msg n -- ------------------------------------------------------------ -- -- the http protocol handler switch for internal / external access getHttpContents :: URI -> XmlStateFilter a getHttpContents uri n = do curl <- getSysParamInt "withCurl" 0 ( if curl /= 0 then getHttpContentsWithCurl else getHttpContentsWithHttp ) uri n -- ------------------------------------------------------------ -- -- the http protocol handler, haskell implementation getHttpContentsWithHttp :: URI -> XmlStateFilter a getHttpContentsWithHttp uri n = do traceLevel <- getTraceLevel trace 2 ("getHttpContent: reading from URL " ++ show uri) proxy <- getSysParam a_proxy res <- io $ try (getHttp traceLevel uri proxy) case res of Left e -> readErr ( "http error when requesting URI " ++ show uri ++ ": " ++ ioeGetErrorString e ++ " (perhaps server does not understand HTTP/1.1) " ) Right response -> let al = convertResponseHeaders response cs = xtext (rspBody response) st = convertResponseStatus (rspCode response) in if st >= 200 && st < 300 then return $ (addAttrl (const al) .> replaceChildren cs) n else readErr ( "http error when accessing URI " ++ show (show uri) ++ ": " ++ show st ++ " " ++ rspReason response ) where readErr msg = addFatal msg n getHttp :: Int -> URI -> String -> IO Response getHttp trc' uri' proxy' = withSocketsDo $ browse ( do setOutHandler (trcFct) setErrHandler (trcFct) setProxy' proxy' (_ruri, rsp) <- request rq return rsp ) where trcFct s | trc' >= 5 = hPutStrLn stderr ("-- (" ++ show trc' ++ ") http: " ++ s) | otherwise = return () rq = defaultGETRequest uri' setProxy' "" = return () setProxy' p = setProxy (Proxy p Nothing) convertResponseStatus :: (Int, Int, Int) -> Int convertResponseStatus (a, b, c) = 100 * a + 10 * b + c convertResponseHeaders :: Response -> XmlTrees convertResponseHeaders r' = cvResponseCode (rspCode r') ++ cvResponseReason (rspReason r') ++ cvResponseHeaders (rspHeaders r') where cvResponseCode :: (Int, Int, Int) -> XmlTrees cvResponseCode st' = xattr transferStatus (show (convertResponseStatus st')) ++ xattr transferVersion httpVersion cvResponseReason :: String -> XmlTrees cvResponseReason r'' = xattr transferMessage (stringTrim r'') cvResponseHeaders :: [Header] -> XmlTrees cvResponseHeaders = concatMap cvResponseHeader cvResponseHeader :: Header -> XmlTrees cvResponseHeader (Header name value) | name == HdrContentType = ( case (parse parseContentType (show HdrContentType) value) of Right res -> res Left _ -> [] ) ++ addHttpAttr | otherwise = addHttpAttr where addHttpAttr = xattr (httpPrefix ++ (show name)) value -- ------------------------------------------------------------ -- -- the http protocol handler implemented by calling external program curl getHttpContentsWithCurl :: URI -> XmlStateFilter a getHttpContentsWithCurl uri n = do trace 2 ( "getHttpContentWithCurl: reading from URL " ++ show uri ) proxy <- getSysParam a_proxy curlOptions <- getSysParam "curlOptions" let allArgs = args ++ proxyArgs proxy ++ words curlOptions trace 4 ( "getHttpContentWithCurl: running " ++ show (unwords (cmd : allArgs)) ) (res, errs, pid) <- io $ popen cmd allArgs Nothing trace 4 ( "getHttpContentWithCurl: PID: " ++ show pid ) trace 4 ( "getHttpContentWithCurl: stdin: " ++ show res ) trace 4 ( "getHttpContentWithCurl: stderr: " ++ show errs ) if null res then addFatal ( "http error whenrequesting URI " ++ show uri ++ ": " ++ errs ) n else let (st, al, contents) = parseResponse res in liftM ( addAttrl (const al) .> replaceChildren (xtext contents) ) .>> ( if st >= 200 && st < 300 then thisM else addFatal ( "http error when accessing URI " ++ show uri ++ ": " ++ show st ++ " " ++ (valueOf transferMessage $ newRoot al) ) ) $ n where cmd = "curl" args = [ "--silent" , "--show-error" , "--dump-header", "-" , show uri ] proxyArgs "" = [] proxyArgs prx = [ "--proxy", prx ] parseResponse :: String -> (Int, XmlTrees, String) parseResponse inp = case (parse parseHttpResponse "HTTP Header" inp) of Right res -> res Left _ -> (999, xattr transferMessage "illegal HTTP response", inp) -- ------------------------------------------------------------ -- -- naming conflicts GHC.Extensions.try and Parsec.try tryP :: Parser a -> Parser a tryP = Parsec.try -- ------------------------------------------------------------ -- -- parsers for HTTP response -- and for mime type and encoding in content type header -- ------------------------------------------------------------ -- -- try to extract charset spec from Content-Type header -- e.g. "text/html; charset=ISO-8859-1" parseContentType :: Parser XmlTrees parseContentType = tryP ( do mimeType <- ( do mt <- many (noneOf ";") return (xattr transferMimeType mt) ) charset <- ( do char ';' many (oneOf " \t'") string "charset=" cs <- many1 anyChar return (xattr transferEncoding (stringToUpper cs)) ) return (mimeType ++ charset) ) <|> ( do mimeType <- many anyChar return (xattr transferMimeType mimeType) ) -- ------------------------------------------------------------ -- parseHttpResponse :: Parser (Int, XmlTrees, String) parseHttpResponse = do (rc, rh) <- parseResp rhs <- parseHeaders content <- getInput return (rc, rh ++ rhs, content) where crlf :: Parser () crlf = do ( tryP (string "\r\n") <|> string "\n" ) return () parseResp :: Parser (Int, XmlTrees) parseResp = do vers <- ( do http <- string "HTTP/" mav <- many1 digit char '.' miv <- many1 digit return (http ++ mav ++ "." ++ miv) ) spaces ds <- many1 digit spaces reason <- manyTill anyChar crlf return ( read ds, xattr transferMessage reason ++ xattr transferVersion vers ) parseHeaders :: Parser XmlTrees parseHeaders = ( do crlf return [] ) <|> ( do header1 <- parse1Header rest <- parseHeaders return $ header1 ++ rest ) <|> ( do return $ xattr (httpPrefix ++ "IllegalHeaders") "" ) parse1Header :: Parser XmlTrees parse1Header = do header <- manyTill anyChar (char ':') spaces value <- manyTill anyChar crlf let ct = parseCT header value return $ ct ++ xattr (httpPrefix ++ header) value where parseCT h v | map toLower h == "content-type" = ( case (parse parseContentType h v) of Right res -> res Left _ -> [] ) | otherwise = [] -- ------------------------------------------------------------ -- -- gues the encoding scheme by looking at the first few characters -- -- see XML Standard F.1 guessEncoding :: String -> String guessEncoding ('\xFF':'\xFE':'\x00':'\x00':_) = "UCS-4LE" -- with byte order mark guessEncoding ('\xFF':'\xFE':_) = "UTF-16LE" -- with byte order mark guessEncoding ('\xFE':'\xFF':'\x00':'\x00':_) = "UCS-4-3421" -- with byte order mark guessEncoding ('\xFE':'\xFF':_) = "UTF-16BE" -- with byte order mark guessEncoding ('\xEF':'\xBB':'\xBF':_) = "UTF-8" -- with byte order mark guessEncoding ('\x00':'\x00':'\xFE':'\xFF':_) = "UCS-4BE" -- with byte order mark guessEncoding ('\x00':'\x00':'\xFF':'\xFE':_) = "UCS-4-2143" -- with byte order mark guessEncoding ('\x00':'\x00':'\x00':'\x3C':_) = "UCS-4BE" -- "<" of "