#!python
#
# MailToRDF.py
#
"""
Class to process all Eudora mailbox files in a directory tree,
outputting the data as RDF/XML conforming to
<draft-klyne-message-rfc822-xml-01.txt>
"""

import os
import sys
import re
import ScanFiles
from mailbox   import UnixMailbox
from rfc822    import AddressList
from mimetools import Message
from multifile import MultiFile
from multifile import Error

class MailToRDF:

    # Construct instance to process a given tree to a named file
    def __init__( self, srcdir, outname ):
        if srcdir.isspace(): srcdir = "."
        self.srcdir = srcdir
        if outname:
            self.outfile = open( outname, 'w' )
        else:
            self.outfile = sys.stdout
        self.indent   = 0
        # Define some values once, to avoid repeated avaluation:
        self.SimpleAddressHeaders = (
            "return-path",
            "from",
            "sender",
            "resent-from",
            "resent-sender" )
        self.GroupAddressHeaders = (
            "to",
            "cc",
            "bcc",
            "reply-to",
            "resent-to",
            "resent-cc",
            "resent-bcc",
            "resent-reply-to" )
        self.SuppressHeaders = (
            "message-id",
            "mime-version",
            "content-type",
            "content-id",
            "content-transfer-encoding",
            "received",
            "list-unsubscribe",
            "list-subscribe",
            "delivery-date",
            "organization",
            "x-mimeole",
            "x-mime-autoconverted",
            "x-msmail-priority",
            "x-sender",
            "x-delivery-time",
            "x-accept-language",
            "x-attachments",
            "x-mailer",
            "x-message-flag",
            "x-eudora-signature",
            "x-envelope-from",
            "x-persona" )

    # Convert all mailbox files in a directory tree
    def ConvertTree( self ):
        """
        Convert the mailbox files found under self.srcdir.

        Resets the per-header counts, hands ScanFiles.ScanFiles the source
        directory, a pattern matching names that end in ".mbx" and the
        ConvertMailbox callback, then writes one <Header> element (name and
        count) for every entry accumulated in self.headersseen.
        """
        self.headersseen = {}
        mbxname = re.compile( r'^.+\.mbx$' )
        # presumably calls ConvertMailbox once per matching file -- the
        # scanner is defined in another module
        ScanFiles.ScanFiles( self.srcdir, mbxname, self.ConvertMailbox )
        for ( hdrname, hdrcount ) in self.headersseen.items():
            self.PutLine( "<Header>" )
            self.PutIndent( +2 )
            self.PutLine( "<name>%s</name>" % hdrname )
            self.PutLine( "<count>%d</count>" % hdrcount )
            self.PutIndent( -2 )
            self.PutLine( "</Header>" )

    # Callback method to convert a single mailbox file
    def ConvertMailbox( self, srcdir, name ):
        """
        Convert a single mailbox file (the callback given to
        ScanFiles.ScanFiles by ConvertTree).

        srcdir -- directory containing the mailbox file
        name   -- file name of the mailbox within srcdir

        Writes a <Mailbox> element containing the (XML-escaped) file name
        and one <member> element per message in the mailbox.  The source
        file is closed again even if conversion of a message fails.
        """
        srcname = os.path.join(srcdir, name)
        srcfile = open( srcname, 'r' )
        try:
            # Ask for mimetools.Message objects: the mailbox default message
            # class has no gettype/getmaintype/getplist/getparam methods,
            # and ConvertBodyPart and ProcessMultipart rely on those.
            srcmail = UnixMailbox( srcfile, Message )
            self.PutLine( "<Mailbox>" )
            self.PutIndent( +2 )
            self.PutLine( "<filename>%s</filename>" % self.XMLEscape( srcname ) )
            msg = srcmail.next()
            # Test against None explicitly: a message's truth value is its
            # header count, so a message with no headers would otherwise
            # end the scan early.
            while msg is not None:
                self.PutLine( "<member>" )
                self.PutIndent( +2 )
                self.ConvertMailmsg( msg )
                self.PutIndent( -2 )
                self.PutLine( "</member>" )
                msg = srcmail.next()
            self.PutIndent( -2 )
            self.PutLine( "</Mailbox>" )
        finally:
            srcfile.close()
            
    # Method to convert a single RFC822 mail message
    def ConvertMailmsg( self, msg ):
        msgid = msg.getheader('message-id')
        if msgid: msgid = msgid[1:-1]           # strip off '<' and '>'
        self.PutMessageStart( msgid, None )
        for nam in msg.keys():
            list = msg.getheaders( nam )
            for hdr in list:
                self.PutHeader( nam, hdr )
        # Process message body
        self.ConvertBodyPart( msg )
        # End of message
        self.PutMessageEnd()

    # Method to write out the start of a new message
    def PutMessageStart( self, msgid, conid ):
        msgnamespaces = (
            "xmlns:msgxml='http://id.mimesweeper.com/iana/namespaces/email-xml/#'",
            "xmlns:rfc822='http://id.mimesweeper.com/iana/namespaces/rfc822/#'" )
        msgident      = "rdf:about='%s'"
        conident      = "msgxml:content='%s'"
        self.PutLine( "<msgxml:Message" )
        self.PutIndent( +4 )
        for n in msgnamespaces: self.PutLine( n )
        if msgid: self.PutLine( msgident%msgid )
        if conid: self.PutLine( conident%conid )
        self.PutLine( ">" )
        self.PutIndent( -2 )

    # Write message header
    def PutHeader( self, name, value ):
        if name in self.GroupAddressHeaders:
            self.PutHeaderAdrsList( name, value )
        elif name in self.SimpleAddressHeaders:
            self.PutHeaderAdrsSingle( name, value )
        elif name not in self.SuppressHeaders:
            if self.headersseen.has_key( name ):
                self.headersseen[name] += 1
            else:
                self.headersseen[name] = 1
            v = self.XMLEscape( value )
            self.PutHeaderValue( "rfc822", name, v )

    # Write address list header
    def PutHeaderAdrsList( self, name, value ):
        adrval = AddressList( value )
        for a in adrval:
            self.PutLine( "<rfc822:%s>" % name )
            self.PutIndent( +2 )
            self.PutHeaderAddressVal( a )
            self.PutIndent( -2 )
            self.PutLine( "</rfc822:%s>" % name )

    # Write single address header
    def PutHeaderAdrsSingle( self, name, value ):
        adrval = AddressList( value )
        self.PutLine( "<rfc822:%s>" % name )
        self.PutIndent( +2 )
        self.PutHeaderAddressVal( adrval[0] )
        self.PutIndent( -2 )
        self.PutLine( "</rfc822:%s>" % name )

    # Write single address value
    # adrval is a tuple (name,mailbox) or (group, '#group', adrlist)
    def PutHeaderAddressVal( self, adrval ):
        if len(adrval) == 3:
            # group address here
            self.PutLine( "<msgxml:Group>" )
            self.PutIndent( +2 )
            if adrval[0]:
                self.PutLine( "<msgxml:name>%s</msgxml:name>" % adrval[0] )
            for a in adrval[2]:
                self.PutLine( "<msgxml:member>" )
                self.PutIndent( +2 )
                self.PutHeaderAddressVal( a )
                self.PutIndent( -2 )
                self.PutLine( "</msgxml:member>" )
            self.PutIndent( -2 )
            self.PutLine( "</msgxml:Group>" )
        else:
            # simple address
            self.PutLine( "<msgxml:Address>" )
            self.PutIndent( +2 )
            if adrval[0]:
                self.PutLine( "<msgxml:name>%s</msgxml:name>" % adrval[0] )
            if adrval[1]:
                self.PutLine( "<msgxml:adrs>mailto:%s</msgxml:adrs>" % adrval[1] )
            self.PutIndent( -2 )
            self.PutLine( "</msgxml:Address>" )

    # Write end of message
    def PutMessageEnd( self ):
        self.PutIndent( -2 )
        self.PutLine( "</msgxml:Message>" )

    # Process body part and generate metadata as required
    # msg is message object with headers parsed
    def ConvertBodyPart( self, msg ):
        # bodypart start tag
        self.PutLine( "<msgxml:part>" )
        self.PutIndent( +2 )
        self.PutLine( "<msgxml:BodyPart>" )
        self.PutIndent( +2 )
        # extract content type, content id, ...
        contyp = msg.gettype()
        ctmain = msg.getmaintype()
        ctstr  = contyp
        for p in msg.getplist():
            if ( not re.match( "^boundary=", p ) and
                 not re.match( "^id=", p ) ):
                ctstr += ";" + p
        self.PutHeaderValue( "rfc822", "content-type", self.XMLEscape(ctstr) )
        cident = msg.getheader("content-id")
        if cident:
            self.PutHeaderValue( "rfc822", "content-id", self.XMLEscape(cident) )
        # dispatch body part processing on content type
        if contyp == "text/html":
            self.ProcessHTMLBody( msg )
        elif ctmain == "text":
            self.ProcessTextBody( msg )
        elif ctmain == "multipart":
            self.ProcessMultipart( msg )
        else:
            self.PutLine( "<msgxml:body-content-not-processed/>" )
        # bodypart end tag
        self.PutIndent( -2 )
        self.PutLine( "</msgxml:BodyPart>" )
        self.PutIndent( -2 )
        self.PutLine( "</msgxml:part>" )

    # Process multipart
    def ProcessMultipart( self, msg ):
        """
        Process a multipart body, converting each contained part.

        msg -- message object for the multipart entity; getparam,
               rewindbody and the fp attribute are used

        Each section delimited by the boundary parameter is parsed as a
        message and passed to ConvertBodyPart.  If there is no boundary
        parameter, or the multipart structure cannot be read, the body is
        rewound and handled as plain text instead.
        """
        boundary = msg.getparam("boundary")
        if not boundary:
            # Nothing to split on: same fallback as for a damaged structure
            msg.rewindbody()
            self.ProcessTextBody( msg )
            return
        parts = MultiFile( msg.fp )
        parts.push( boundary )
        try:
            while parts.next():
                # Message is imported by name from mimetools; the module
                # name itself is not bound in this file
                submsg = Message( parts )
                self.ConvertBodyPart( submsg )
        except Error:
            # Eudora loses multipart structure and keeps just the text part
            msg.rewindbody()
            self.ProcessTextBody( msg )

    # Process HTML body part
    def ProcessHTMLBody( self, msg ):
        # !!! TBD
        return

    # Process text body part
    def ProcessTextBody( self, msg ):
        # !!! TBD
        return

    # Write simple message header
    def PutHeaderValue( self, namspc, name, value ):
        self.PutLine( "<%(s)s:%(n)s>%(v)s</%(s)s:%(n)s>"%
                      {'s':namspc,'n':name,'v':value} )

    # Write line of data
    def PutLine( self, line ):
        self.outfile.write( " "*self.indent + line + "\n" )
        
    # Change data indent
    def PutIndent( self, n ):
        """
        Change the data indent used by PutLine by n spaces; n may be
        negative to reduce the indent.
        """
        self.indent = self.indent + n

    # XML-escape element content string:
    # convert '<', '>' and '&' to character entities.
    # The patterns below are compiled once, as class attributes, and are
    # referenced by XMLEscape as MailToRDF.matchlt, MailToRDF.matchgt and
    # MailToRDF.matchamp.
    matchlt  = re.compile( r'\<' )
    matchgt  = re.compile( r'\>' )
    matchamp = re.compile( r'\&' )
    def XMLEscape( self, value ):
        """
        Return value with '&', '<' and '>' replaced by the character
        entities '&amp;', '&lt;' and '&gt;' respectively.
        """
        # '&' is replaced first so that the ampersands introduced by the
        # other two entities are not themselves escaped
        substitutions = (
            ( MailToRDF.matchamp, '&amp;' ),
            ( MailToRDF.matchlt,  '&lt;'  ),
            ( MailToRDF.matchgt,  '&gt;'  ) )
        escaped = value
        for ( pattern, entity ) in substitutions:
            escaped = pattern.sub( entity, escaped )
        return escaped

# End of MailToRDF

if __name__ == '__main__':
    # Usage: MailToRDF.py [srcdir [outname]]
    #   srcdir  -- root of the directory tree to scan; defaults to the
    #              directory that was previously hard-coded here
    #   outname -- output file name; defaults to standard output
    srcdir  = "D:/program files/Eudora/temp/"
    outname = None
    if len( sys.argv ) > 1: srcdir  = sys.argv[1]
    if len( sys.argv ) > 2: outname = sys.argv[2]
    MailToRDF( srcdir, outname ).ConvertTree()
