#!python
#
# MailToRDF.py
#
"""
Process all Eudora mailbox files (*.mbx) in a directory tree, writing the
message headers and body-part metadata as RDF/XML.

This module targets Python 2: it relies on the standard-library modules
rfc822, mimetools and multifile, plus the project-local ScanFiles module.
"""

import os
import sys
import re
import ScanFiles
from mailbox import UnixMailbox
from rfc822 import AddressList
from mimetools import Message
from multifile import MultiFile
from multifile import Error


class MailToRDF:
    """Convert a tree of Eudora mailbox files to an RDF/XML text stream.

    Typical use is MailToRDF(srcdir, outname).ConvertTree(), followed by
    Close() to release the output file.

    NOTE(review): many PutLine("") calls below emit an empty line.  They
    look like placeholders for element start/end tags whose text is not
    present in this copy of the file -- confirm against the intended
    schema before relying on the output.
    """

    # Patterns used by XMLEscape; kept as class attributes so they are
    # compiled once and remain reachable as MailToRDF.matchXX.
    matchlt = re.compile(r'\<')
    matchgt = re.compile(r'\>')
    matchamp = re.compile(r'\&')

    def __init__(self, srcdir, outname):
        """Prepare to convert the tree rooted at srcdir.

        srcdir  -- root directory to scan; empty, None or all-whitespace
                   values fall back to the current directory.
        outname -- name of the output file, or a false value to write to
                   sys.stdout.
        """
        # "".isspace() is False, so test for emptiness explicitly.
        if not srcdir or srcdir.isspace():
            srcdir = "."
        self.srcdir = srcdir
        if outname:
            self.outfile = open(outname, 'w')
        else:
            self.outfile = sys.stdout
        self.indent = 0
        # Count of each non-address, non-suppressed header name seen.
        # Initialised here so ConvertMailbox can be used without ConvertTree.
        self.headersseen = {}
        # Define some values once, to avoid repeated evaluation:
        self.SimpleAddressHeaders = (
            "return-path", "from", "sender",
            "resent-from", "resent-sender",
        )
        self.GroupAddressHeaders = (
            "to", "cc", "bcc", "reply-to",
            "resent-to", "resent-cc", "resent-bcc", "resent-reply-to",
        )
        self.SuppressHeaders = (
            "message-id", "mime-version", "content-type", "content-id",
            "content-transfer-encoding", "received",
            "list-unsubscribe", "list-subscribe", "delivery-date",
            "organization", "x-mimeole", "x-mime-autoconverted",
            "x-msmail-priority", "x-sender", "x-delivery-time",
            "x-accept-language", "x-attachments", "x-mailer",
            "x-message-flag", "x-eudora-signature", "x-envelope-from",
            "x-persona",
        )

    def Close(self):
        """Flush the output and close it unless it is sys.stdout."""
        if self.outfile is sys.stdout:
            self.outfile.flush()
        else:
            self.outfile.close()

    def ConvertTree(self):
        """Convert every *.mbx file under self.srcdir, then dump header counts.

        ScanFiles.ScanFiles is a project helper; it is handed the root
        directory, a compiled filename pattern and ConvertMailbox as the
        callback (presumably invoked once per matching file -- confirm).
        """
        self.headersseen = {}
        pattern = re.compile(r'^.+\.mbx$')
        ScanFiles.ScanFiles(self.srcdir, pattern, self.ConvertMailbox)
        for hdrname, count in self.headersseen.items():
            self.PutLine("")    # NOTE(review): placeholder, see class docstring
            self.PutIndent(+2)
            self.PutLine("%s" % hdrname)
            self.PutLine("%d" % count)
            self.PutIndent(-2)
            self.PutLine("")

    def ConvertMailbox(self, srcdir, name):
        """Callback: convert the single mailbox file srcdir/name."""
        srcname = os.path.join(srcdir, name)
        srcfile = open(srcname, 'r')
        try:
            # Use mimetools.Message as the factory: ConvertBodyPart needs
            # gettype/getmaintype/getplist/getparam, which the default
            # rfc822.Message objects do not provide.
            srcmail = UnixMailbox(srcfile, Message)
            self.PutLine("")
            self.PutIndent(+2)
            self.PutLine("%s" % self.XMLEscape(srcname))
            msg = srcmail.next()
            # next() returns None at end of mailbox.  Compare with None
            # rather than truth-testing: a message with no headers has
            # length zero and would otherwise end the scan early.
            while msg is not None:
                self.PutLine("")
                self.PutIndent(+2)
                self.ConvertMailmsg(msg)
                self.PutIndent(-2)
                self.PutLine("")
                msg = srcmail.next()
            self.PutIndent(-2)
            self.PutLine("")
        finally:
            # Always release the mailbox file, even if conversion fails.
            srcfile.close()

    def ConvertMailmsg(self, msg):
        """Convert a single RFC822 mail message (headers, then body)."""
        msgid = msg.getheader('message-id')
        if msgid:
            msgid = msgid.strip()
            # Strip the enclosing '<' and '>' only when both are present.
            if msgid.startswith('<') and msgid.endswith('>'):
                msgid = msgid[1:-1]
        self.PutMessageStart(msgid, None)
        for nam in msg.keys():
            for hdr in msg.getheaders(nam):
                self.PutHeader(nam, hdr)
        # Process message body
        self.ConvertBodyPart(msg)
        # End of message
        self.PutMessageEnd()

    def PutMessageStart(self, msgid, conid):
        """Write out the start of a new message.

        msgid -- message identifier (without angle brackets) or None.
        conid -- content identifier or None.

        NOTE(review): the namespace declarations and attribute templates
        below are never written, and the indent is decreased without a
        matching increase in this method, so indentation drifts left on
        every message.  It looks as if the code that emitted the start tag
        is missing here -- confirm and restore; the values are kept so the
        intended attributes are not lost.
        """
        msgnamespaces = (
            "xmlns:msgxml='http://id.mimesweeper.com/iana/namespaces/email-xml/#'",
            "xmlns:rfc822='http://id.mimesweeper.com/iana/namespaces/rfc822/#'",
        )
        msgident = "rdf:about='%s'"
        conident = "msgxml:content='%s'"
        self.PutLine("")
        self.PutIndent(-2)

    def PutHeader(self, name, value):
        """Write one message header, dispatching on the header name.

        Address headers go to the address writers, suppressed headers are
        dropped, and everything else is counted in self.headersseen and
        written as an escaped rfc822:<name> value.
        """
        if name in self.GroupAddressHeaders:
            self.PutHeaderAdrsList(name, value)
        elif name in self.SimpleAddressHeaders:
            self.PutHeaderAdrsSingle(name, value)
        elif name not in self.SuppressHeaders:
            self.headersseen[name] = self.headersseen.get(name, 0) + 1
            self.PutHeaderValue("rfc822", name, self.XMLEscape(value))

    def PutHeaderAdrsList(self, name, value):
        """Write an address-list header, one element per address."""
        # NOTE(review): the original format strings contained no conversion
        # specifier, so '"" % name' raised TypeError.  The element name used
        # here follows the rfc822:<header> convention of PutHeaderValue --
        # confirm against the intended schema.
        for adr in AddressList(value):
            self.PutLine("<rfc822:%s>" % name)
            self.PutIndent(+2)
            self.PutHeaderAddressVal(adr)
            self.PutIndent(-2)
            self.PutLine("</rfc822:%s>" % name)

    def PutHeaderAdrsSingle(self, name, value):
        """Write a single-address header, using the first parsed address."""
        adrval = AddressList(value)
        if len(adrval) == 0:
            # Header did not yield any address; nothing to write.
            return
        # NOTE(review): element name reconstructed as in PutHeaderAdrsList;
        # the original '"" % name' raised TypeError -- confirm.
        self.PutLine("<rfc822:%s>" % name)
        self.PutIndent(+2)
        self.PutHeaderAddressVal(adrval[0])
        self.PutIndent(-2)
        self.PutLine("</rfc822:%s>" % name)

    def PutHeaderAddressVal(self, adrval):
        """Write a single address value.

        adrval is a tuple (name, mailbox) or (group, '#group', adrlist).
        Name and mailbox text is XML-escaped before being written.
        """
        if len(adrval) == 3:
            # group address here
            self.PutLine("")
            self.PutIndent(+2)
            if adrval[0]:
                self.PutLine("%s" % self.XMLEscape(adrval[0]))
            for member in adrval[2]:
                self.PutLine("")
                self.PutIndent(+2)
                self.PutHeaderAddressVal(member)
                self.PutIndent(-2)
                self.PutLine("")
            self.PutIndent(-2)
            self.PutLine("")
        else:
            # simple address
            self.PutLine("")
            self.PutIndent(+2)
            if adrval[0]:
                self.PutLine("%s" % self.XMLEscape(adrval[0]))
            if adrval[1]:
                self.PutLine("mailto:%s" % self.XMLEscape(adrval[1]))
            self.PutIndent(-2)
            self.PutLine("")

    def PutMessageEnd(self):
        """Write the end of a message."""
        self.PutIndent(-2)
        self.PutLine("")

    def ConvertBodyPart(self, msg):
        """Process a body part and generate metadata as required.

        msg is a message object with headers parsed; it must provide
        gettype, getmaintype, getplist and getheader.
        """
        # bodypart start tag
        self.PutLine("")
        self.PutIndent(+2)
        self.PutLine("")
        self.PutIndent(+2)
        # extract content type, content id, ...
        contyp = msg.gettype()
        ctmain = msg.getmaintype()
        # Keep every content-type parameter except boundary= and id=.
        extras = [p for p in msg.getplist()
                  if not p.startswith(("boundary=", "id="))]
        ctstr = ";".join([contyp] + extras)
        self.PutHeaderValue("rfc822", "content-type", self.XMLEscape(ctstr))
        cident = msg.getheader("content-id")
        if cident:
            self.PutHeaderValue("rfc822", "content-id",
                                self.XMLEscape(cident))
        # dispatch body part processing on content type
        if contyp == "text/html":
            self.ProcessHTMLBody(msg)
        elif ctmain == "text":
            self.ProcessTextBody(msg)
        elif ctmain == "multipart":
            self.ProcessMultipart(msg)
        else:
            self.PutLine("")
        # bodypart end tag
        self.PutIndent(-2)
        self.PutLine("")
        self.PutIndent(-2)
        self.PutLine("")

    def ProcessMultipart(self, msg):
        """Process a multipart body, converting each contained part.

        Falls back to text processing when the boundary parameter is
        missing or the multipart structure cannot be read.
        """
        boundary = msg.getparam("boundary")
        if boundary is None:
            # No boundary to split on: treat the body as plain text.
            self.ProcessTextBody(msg)
            return
        parts = MultiFile(msg.fp)
        parts.push(boundary)
        try:
            while parts.next():
                # 'Message' is mimetools.Message (imported by name above);
                # the module name 'mimetools' itself is not in scope.
                submsg = Message(parts)
                self.ConvertBodyPart(submsg)
        except Error:
            # Eudora loses multipart structure and keeps just the text part
            msg.rewindbody()
            self.ProcessTextBody(msg)

    def ProcessHTMLBody(self, msg):
        """Process an HTML body part (not yet implemented)."""
        # !!! TBD
        return

    def ProcessTextBody(self, msg):
        """Process a text body part (not yet implemented)."""
        # !!! TBD
        return

    def PutHeaderValue(self, namspc, name, value):
        """Write a simple header as <namspc:name>value</namspc:name>.

        value must already be XML-escaped by the caller.
        """
        # The original wrote the start tag and value but never closed the
        # element; the end tag is added so the output is well-formed.
        self.PutLine("<%(s)s:%(n)s>%(v)s</%(s)s:%(n)s>"
                     % {'s': namspc, 'n': name, 'v': value})

    def PutLine(self, line):
        """Write one line of data at the current indent."""
        self.outfile.write(" " * self.indent + line + "\n")

    def PutIndent(self, n):
        """Change the data indent by n columns (n may be negative)."""
        self.indent += n

    def XMLEscape(self, value):
        """Return value with '&', '<' and '>' converted to character entities.

        '&' is replaced first so the ampersands introduced by the other
        two replacements are not escaped a second time.
        """
        v1 = MailToRDF.matchamp.sub('&amp;', value)
        v2 = MailToRDF.matchlt.sub('&lt;', v1)
        v3 = MailToRDF.matchgt.sub('&gt;', v2)
        return v3

# End of MailToRDF


if __name__ == '__main__':
    # Optional command-line overrides: [srcdir [outname]].  With no
    # arguments the original hard-coded directory is used and output goes
    # to stdout.
    srcroot = "D:/program files/Eudora/temp/"
    outname = None
    if len(sys.argv) > 1:
        srcroot = sys.argv[1]
    if len(sys.argv) > 2:
        outname = sys.argv[2]
    converter = MailToRDF(srcroot, outname)
    try:
        converter.ConvertTree()
    finally:
        converter.Close()