-------------------------------------------------------------------------------- -- $Id: CSVParser.hs,v 1.4 2004/04/21 12:46:05 graham Exp $ -- -- Copyright (c) 2003, G. KLYNE. All rights reserved. -- See end of this file for licence information. -------------------------------------------------------------------------------- -- | -- Module : CSVParser -- Copyright : (c) 2003, Graham Klyne -- License : GPL V2 -- -- Maintainer : Graham Klyne -- Stability : provisional -- Portability : H98 -- -- This module scrapes an RDF graph from an annotated CSV string, as follows: -- @prefix pref: -- @rowtype qname (pref:local or ) -- @columns -- qname, qname, ... (pref:local or ) -- @coltypes -- qname, qname, ... (pref:local or or @resource or @plain - default @plain) -- @data -- xxx, xxx, ... (blanks not included in graph) -- : -- [@end] -- -- This is intended to be used as a tool for scraping RDF from spreadsheet -- or database data. The CSV format used is based on that exported by -- Microsoft Excel. The annotation can be generated separately from the -- CSV data, and prefixed before calling this routine. Alternatively, the -- annotation can be added to spreadsheet data before exporting it as CSV. -- -- -------------------------------------------------------------------------------- module CSVParser ( ParseResult , parseCSVfromString ) where import N3Parser ( ParseResult(..) , N3Parser , parseAllFromString , prefixname, localname, absUriRef , symbol ) import ParseCSV ( parseCSV ) import RDFGraph ( RDFTriple, RDFGraph, RDFLabel(..) , NamespaceMap , LookupFormula(..), Formula, FormulaMap , setArcs, getArcs, addArc, add, delete, extract, labels , setFormula , setNamespaces , emptyRDFGraph, toRDFGraph ) import GraphClass ( arcSubj, arcPred, arcObj, arc ) import LookupMap ( LookupEntryClass(..), LookupMap(..) , mapFind, mapReplace, mapReplaceOrAdd ) import Namespace ( Namespace(..) , makeNamespaceQName , ScopedName(..) , getScopePrefix, getScopeURI , getQName, getScopedNameURI , matchName , makeScopedName, makeQNameScopedName, makeUriScopedName , nullScopedName ) import Vocabulary ( namespaceRDF , namespaceRDFS , namespaceXSD , rdf_type , xsd_type, xsd_string, xsd_boolean ) import Network.URI ( nullUri, isUri ) import Parsec import ErrorM ( ErrorM(Error,Result) ) import Maybe ( isJust, fromMaybe ) import Monad ( unless ) -- {- import TraceHelpers ( trace, traceShow, traceVal ) -- -} ---------------------------------------------------------------------- -- Define parser state and helper functions ---------------------------------------------------------------------- -- CSV parser state data CSVState = CSVState { prefixUris :: NamespaceMap -- Namespace prefix mapping table , graphState :: RDFGraph -- Graph under construction , rowType :: Maybe ScopedName -- Type represented by row , colProps :: [Maybe ScopedName] -- Properties for columns , colTypes :: [Maybe ScopedName] -- Type of each column value , nodeGen :: Int -- blank node id generator } -- Default table of namespaces prefixTable = [ namespaceRDF , namespaceRDFS , namespaceXSD ] -- Functions to update CSVState vector (use with Parsec updateState) setPrefix :: String -> String -> CSVState -> CSVState setPrefix pre uri st = st { prefixUris=p' } where p' = mapReplaceOrAdd (Namespace pre uri) (prefixUris st) -- Return function to update graph in CSV parser state, -- using the supplied function of a graph -- (use returned function with Parsec updateState) updateGraph :: ( RDFGraph -> RDFGraph ) -> ( CSVState -> CSVState ) updateGraph f s = s { graphState = f (graphState s) } -- Set type represented by each row. -- If not specified, no type is recorded. setRowType :: Maybe ScopedName -> CSVState -> CSVState setRowType rtyp st = st { rowType = rtyp } -- Set property associated with each column -- Only columns for which properties are recorded are written to the graph setColProps :: [Maybe ScopedName] -> CSVState -> CSVState setColProps cprops st = st { colProps = cprops } -- Set literal type represented by each column. -- Use xsd:string for plain literal values. -- If no type is specified, the column value is a resource URI. setColTypes :: [Maybe ScopedName] -> CSVState -> CSVState setColTypes ctypes st = st { colTypes = ctypes } -- Lookup prefix in table and return URI or 'prefix:' mapPrefix :: NamespaceMap -> String -> String mapPrefix ps pre = mapFind (pre++":") pre ps -- Functions to access state: -- Map prefix to namespace getPrefixNs :: CSVState -> String -> Namespace getPrefixNs st pre = Namespace pre (mapPrefix (prefixUris st) pre) -- Allocate a new blank node newBlankNode :: CSVParser RDFLabel newBlankNode = do { s <- getState ; let n = (nodeGen s) + 1 ; setState ( s { nodeGen = n } ) ; return (Blank (show n)) } ---------------------------------------------------------------------- -- Define top-level parser function: -- accepts a string and returns a graph or error ---------------------------------------------------------------------- type CSVParser a = GenParser [String] CSVState a parseCSVfromString :: String -> String -> ParseResult parseCSVfromString file input = case parseAnyFromToks csvdocument Nothing file csvtoks of Left err -> Error err Right gr -> Result gr where csvtoks = parseCSV input -- Function to supply initial context and parse supplied term -- -- This is a generalized form of parser that works with an arbitrary token -- sequence, and returns an arbitrary value. -- It is based on parseAnyfromString from module N3Parser.hs. -- -- parser is parser to apply. -- base is base URI of the input, or Nothing to use default base value. -- file is the name of the file from which input is taken, or null. -- This is used for diagnostics. -- toks is the sequence of input tokens to be parsed. -- parseAnyFromToks :: GenParser tok CSVState a -> Maybe String -> String -> [tok] -> (Either String a) parseAnyFromToks parser base file input = let pmap = LookupMap prefixTable pstate = CSVState { graphState = emptyRDFGraph , prefixUris = pmap , rowType = Nothing , colProps = [] , colTypes = [] , nodeGen = 0 } result = runParser parser pstate file input in case result of Left err -> Left (show err) Right res -> Right res ---------------------------------------------------------------------- -- Token matching primitives ---------------------------------------------------------------------- -- Primitive token parser: returns next row if predicate is satisfied row_test :: ([String] -> Bool) -> CSVParser [String] row_test p = tokenPrim (showRow "") nextRow testRow where nextRow pos _ _ = incSourceLine pos 1 testRow row | p row = Just row | otherwise = Nothing -- Returns next row unconditionally row_next :: CSVParser [String] row_next = row_test (const True) -- Returns next row if blank row_blank :: CSVParser [String] row_blank = row_test null -- Returns next row if comment (i.e. starts with '#') row_comment :: CSVParser [String] row_comment = row_test isComment where isComment [] = True isComment ([]:cs) = isComment cs isComment ((c:cs):cols) = c == '#' -- Returns next row if blank row_data :: CSVParser [String] row_data = row_test isData where isData [] = True isData ([]:cols) = True isData ((c:cs):cols) = c /= '@' -- Directive token parser: matches directive and returns entire row row_directive :: String -> CSVParser [String] row_directive dir = row_test ((==dir) . head) -- Helper function to format a row as a string showRow lab row = lab ++ showRow1 row showRow1 (str:strs) = str ++ (concatMap (", "++) strs) showRow1 [] = "(empty row)" ---------------------------------------------------------------------- -- Syntax productions ---------------------------------------------------------------------- csvdocument :: CSVParser RDFGraph csvdocument = do { many directive -- ; s1 <- getState -- ; trace ("ColTypes: " ++ (show $ colTypes s1)) $ return () ; startData ; many rowData ; endData ; s <- getState -- ; trace ("Namespaces: " ++ (show $ prefixUris s)) $ return () ; return $ setNamespaces (prefixUris s) (graphState s) } directive :: CSVParser () directive = {- do { row <- row_blank ; return () } <|> -} do { row <- row_comment ; return () } <|> do { row <- row_directive "@prefix" ; ( defaultPrefix row <|> namedPrefix row ) } <|> do { row <- row_directive "@rowtype" ; muri <- makeQNameUri (row!!1) ; updateState $ setRowType muri } <|> do { row_directive "@columns" ; row <- row_next ; cpr <- sequence (map makeColProp row) ; updateState $ setColProps cpr } <|> do { row_directive "@coltypes" ; row <- row_next ; cty <- sequence (map makeColType row) ; updateState $ setColTypes cty } "Directive: @prefix, @rowtype, @columns or @coltypes" defaultPrefix :: [String] -> CSVParser () defaultPrefix row = do { unless ((row!!1) == ":") $ fail "" ; muri <- makeQNameUri (row!!2) ; case muri of Nothing -> fail ("URI expected for default @prefix ") Just u -> updateState $ setPrefix "" (getScopeURI u) } namedPrefix :: [String] -> CSVParser () namedPrefix row = do { let pref = (parseAllFromString prefName Nothing (row!!1)) ; pn <- case pref of Left _ -> fail "Prefix name expected" Right n -> return n ; muri <- makeQNameUri (row!!2) ; case muri of Nothing -> fail ("URI expected for @prefix "++pn++":") Just u -> updateState $ setPrefix pn (getScopeURI u) } startData :: CSVParser () startData = do { row_directive "@data" ; return () } "@data directive: start of data" rowData :: CSVParser () rowData = do { row <- row_data ; st <- getState ; let cpr = colProps st ; let cty = colTypes st ; unless (null row) $ processRowData (zip3 row cpr cty) } endData :: CSVParser () endData = do { row_directive "@end" ; return () } <|> eof "@end directive, or end of input" ---------------------------------------------------------------------- -- Auxiliary functions ---------------------------------------------------------------------- -- Match qname or URI and return corresponding ScopedName, -- taking account of the current namespace prefix list. -- makeQNameUri :: String -> CSVParser (Maybe ScopedName) makeQNameUri uri = do { let uref = parseAllFromString uriRef Nothing uri ; qn <- case uref of Left _ -> return Nothing Right ("",u,"") -> return $ Just (makeUriScopedName u) Right ("","",l) -> do { ns <- getNamespace "" -- defaultprefix ; return $ Just (ScopedName ns l) } Right (p,"",l) -> do { ns <- getNamespace p -- prefix p ; return $ Just (ScopedName ns l) } ; return qn } getNamespace :: String -> CSVParser Namespace getNamespace pref = do { st <- getState ; return (getPrefixNs st pref) } -- Match string to column property, being a qname or URI corresponding to -- a property URI, or @about if the column is to be used as a subject -- URI for generated statements. -- -- Result is a scoped name for a property URI -- or Nothing if the column is a subject or unused. -- -- Note: if ColProp and ColType are both Nothing, then column is used -- as subject URIs (specify @about and @resource respectively); -- otherwise, if ColProp is Nothing, the column is unused. -- makeColProp :: String -> CSVParser (Maybe ScopedName) makeColProp prop = case prop of "@about" -> return Nothing "" -> return Nothing otherwise -> do { muri <- makeQNameUri prop ; unless (isJust muri) (fail $ "@columns "++prop++": QName or expected") ; return muri } -- Match string to column type, being a qname or URI corresponding to -- a datatype URI, @resource (mapped to Nothing) for a resource value, -- or blank (mapped to xsd:string) for a plain literal. -- -- Result is a scoped name for a literal datatype URI -- (xsd:string for plain literals), or Nothing if the column -- contains resource URIs. -- makeColType :: String -> CSVParser (Maybe ScopedName) makeColType typ = case typ of "@resource" -> return Nothing "@string" -> return (Just xsd_string) "" -> return (Just xsd_string) otherwise -> do { muri <- makeQNameUri typ ; unless (isJust muri) (fail $ "@coltypes "++typ++": QName or expected") ; return muri } -- Given a row of data, including properties and object type information, -- add appropariate statements to the graph under construction -- processRowData :: [(String,Maybe ScopedName,Maybe ScopedName)] -> CSVParser () processRowData rowdata = do { (subj,props) <- scanColumns rowdata ; unless (null props) $ do { subjLabel <- case subj of Just s -> return s otherwise -> newBlankNode ; st <- getState ; props1 <- case rowType st of Nothing -> return props Just rt -> return $ (Res rdf_type,Res rt):props ; createRowStatements subjLabel props1 } } -- Pre-scan column data for a given row, -- isolate the row subject (if present) and discard entries with no data. -- Entry data is converted to RDFLabel values. -- -- val ,property ,type scanColumns :: [(String,Maybe ScopedName,Maybe ScopedName)] -> CSVParser (Maybe RDFLabel,[(RDFLabel,RDFLabel)]) scanColumns [] = return (Nothing,[]) scanColumns (col:cols) = do { (subj,props) <- scanColumns cols ; case col of ("",_,_) -> return (subj,props) (val,Nothing,Just _) -> return (subj,props) (val,Nothing,Nothing) -> -- valuri is row subject do { valuri <- makeQNameUri val ; case valuri of Just v -> return (Just $ Res v,props) otherwise -> return (subj,props) } (val,Just prop,Nothing) -> -- val uri is object do { valuri <- makeQNameUri val ; case valuri of Just v -> return (subj,(Res prop,Res v):props) otherwise -> return (subj,props) } (val,Just prop,Just typ) | typ == xsd_string -> -- val is plain literal return (subj,(Res prop,Lit val Nothing):props) (val,Just prop,typ) -> -- val is typed literal return (subj,(Res prop,Lit val typ):props) } -- Add statement to graph for each (usable) value in a row -- createRowStatements :: RDFLabel -> [(RDFLabel,RDFLabel)] -> CSVParser () createRowStatements _ [] = return () createRowStatements subj ((prop,obj):cols) = do { addStatement subj prop obj ; createRowStatements subj cols } -- Add statement to graph in CSV parser state -- addStatement :: RDFLabel -> RDFLabel -> RDFLabel -> CSVParser () addStatement s p o = updateState (updateGraph (addArc (arc s p o) )) ---------------------------------------------------------------------- -- Auxiliary string parsers, using elements of the N3 parser ---------------------------------------------------------------------- prefName :: N3Parser String prefName = do { n <- prefixname ; symbol ":" ; return n } uriRef :: N3Parser (String,String,String) uriRef = do { ns <- prefName ; local <- localname ; return (ns,"",local) } <|> do { string ":" ; local <- localname ; return ("","",local) } <|> do { u <- absUriRef ; return ("",u,"") } -------------------------------------------------------------------------------- -- -- Copyright (c) 2003, G. KLYNE. All rights reserved. -- -- This file is part of Swish. -- -- Swish is free software; you can redistribute it and/or modify -- it under the terms of the GNU General Public License as published by -- the Free Software Foundation; either version 2 of the License, or -- (at your option) any later version. -- -- Swish is distributed in the hope that it will be useful, -- but WITHOUT ANY WARRANTY; without even the implied warranty of -- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -- GNU General Public License for more details. -- -- You should have received a copy of the GNU General Public License -- along with Swish; if not, write to: -- The Free Software Foundation, Inc., -- 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -- -------------------------------------------------------------------------------- -- $Source: /file/cvsdev/HaskellRDF/CSVParser.hs,v $ -- $Author: graham $ -- $Revision: 1.4 $ -- $Log: CSVParser.hs,v $ -- Revision 1.4 2004/04/21 12:46:05 graham -- Add query function to support RDF-driven URI test suite runner. -- Export Swish graph reading function for test suite runner. -- Fix some problems with CSV parser. -- -- Revision 1.3 2004/04/20 14:50:12 graham -- Fix some bugs in the CSV parser -- -- Revision 1.2 2004/03/26 12:14:03 graham -- Modifications to N3 parser, mostly to support report description compilation. -- -- Revision 1.1 2004/03/10 16:01:04 graham -- Add CSV parser to Swish, for scraping RDF from exported -- spreadsheet and database files. --