"""
Split passed in file(s) into separate mail messages,
then re-order by date into single output file.
NB: the output file can be one of the input files;
input files that are links to ones already processed are ignored.
"""
# Help text written to stderr by usage(); the single %s is filled in with
# the basename of sys.argv[0].
Usage = """Usage: %s [--out <file>] [[--in] <file>] ...
--debug output debugging details at <level>
--in name of unordered mailbox
--links list linked files on <stdout>
--match restrict debugging and/or warning messages to just
those that match <regexp>
--out name of ordered mailbox
--verbose show actions
Default copies <stdin> to <stdout>.
"""
# Option tables handed to getopt.getopt() in args().  A trailing ':' (short)
# or '=' (long) marks an option that takes a value.  '-?' and '--help' have
# no branch of their own in args(); they reach its final `else`, which
# calls usage().
ShortOpts = 'd:i:lm:o:v?'
LongOpts = ['debug=', 'help', 'in=', 'links', 'match=', 'out=', 'verbose']
import getopt, md5, os, re, sys, time
from email.Parser import Parser
from email.Errors import MessageError
from email.Utils import mktime_tz, parsedate_tz
# Debug() prints a message only when DebugLvl >= the message's level
# (set by --debug in args()).
DebugLvl = 0
# Compiled --match regexp, or None; warn() drops messages it does not match.
DebugMatch = None
# A newline followed by whitespace: used by splitunixfrom() to fold a
# multi-line "From " separator onto one line.
ContLine = re.compile(r'\n\s+')
# Files already seen, keyed by (st_dev, st_ino); each value is a dict with
# 'times' (atime, mtime) and 'names' (every path that resolved to that file).
FileCache = {}
# Date given to messages whose own date is missing, unparseable or in the
# future.  process_message() updates it as it goes, sort_messages() sets it
# to the newest date, and main() uses it as the output file's timestamp.
LastDate = mktime_tz(parsedate_tz('Sat, 1 Jan 2000 00:00:00 +1000'))
# True when --links was given; main() then calls print_links().
ListLinks = False
# MD5 digests of message bodies already accepted (values are always None);
# process_message() uses it to discard duplicates.
MessageCache = {}
# Start-up time; process_message() treats any date >= Now as invalid.
Now = time.time()
# A "From " separator line, optional whitespace-led continuation lines, then
# a newline and the first non-space character of the following line.
UnixFrom = re.compile(r'^From \S+ .*(\n\s.*)*\n\S')
# True when --verbose (or --debug) was given.
Verbose = False
# Subject of the mail client's folder-internal pseudo message;
# process_message() discards messages carrying it.
MUA_Subject = "DON'T DELETE THIS MESSAGE -- FOLDER INTERNAL DATA"
def args():
try:
optlist, args = getopt.getopt(sys.argv[1:], ShortOpts, LongOpts)
except getopt.error, val:
usage(val)
global DebugLvl
global DebugMatch
global ListLinks
global Verbose
inf, ouf = [], None
for opt,val in optlist:
if opt in ('-d', '--debug'):
DebugLvl = int(val)
Verbose = True
elif opt in ('-i', '--in'):
inf += [val]
elif opt in ('-l', '--links'):
ListLinks = True
elif opt in ('-m', '--match'):
DebugMatch = re.compile(val)
elif opt in ('-o', '--out'):
ouf = val
elif opt in ('-v', '--verbose'):
Verbose = True
else:
usage()
for arg in args:
inf += [arg]
if not inf: inf += [None]
return inf, ouf
def print_links():
for id,item in FileCache.items():
names = item['names']
if len(names) > 1:
print ' '.join(names)
def process_file(name):
Debug(2, '''"Reading file %s" % (name or '<stdin>')''')
try:
if name is None:
name = '<stdin>'
fd = sys.stdin
else:
fd = open(name)
messages = []
r = os.fstat(fd.fileno())
id = (r.st_dev, r.st_ino)
if id not in FileCache:
FileCache[id] = {'times':(r.st_atime, r.st_mtime), 'names':[name]}
n = 0
for message in read_messages(fd):
mesg = process_message(message, n, name)
n += 1
if mesg is None:
continue
messages.append(mesg)
else:
FileCache[id]['names'] += [name]
Debug(2, '''"File %s is link to %s (already processed)" % (name, FileCache[id]['names'][0])''')
fd.close()
except (IOError, OSError), val:
error('Unable to read "%s" - %s.' % (name, val))
Debug(1, '''"File %s => %s messages" % (name, len(messages))''')
return messages
def process_message(text, msgnum, name):
global LastDate
Debug(3, r'''"Message %s \"%s\"..." % (msgnum, text[:79].replace('\n', '\\n'))''')
unixfrom, text = splitunixfrom(text.rstrip())
body = text[text.find('\n\n')+2:]
tag = md5.new(body).digest()
if tag in MessageCache:
Debug(2, '''"Message %s discarded: MD5 sum in cache" % msgnum''')
return None
MessageCache[tag] = None
Debug(3, '''"Message %s text size %s tag %s" % (msgnum, len(body), `tag`)''')
try:
msg = Parser().parsestr(text, headersonly=True)
except MessageError, val:
warn("File %s message %s parse error: %s" % (name, msgnum, str(val)))
return LastDate, msgnum, unixfrom + text + '\n\n'
subj = msg['Subject']
Debug(3, r'''"Subject: %s" % subj''')
if subj == MUA_Subject:
Debug(2, '''"Message %s discarded: Subject: %s" % (msgnum, MUA_Subject)''')
return None
date = msg['Date']
if not date:
date = LastDate
else:
try:
date = mktime_tz(parsedate_tz(date))
if date >= Now:
date = LastDate
else:
LastDate = date
except:
warn("File %s message %s parse error for {Date: %s}" % (name, msgnum, msg['Date']))
date = LastDate
Debug(2, '''"Found message %s: date=%s" % (msgnum, date)''')
return date, msgnum, unixfrom + text + '\n\n'
def read_messages(fd):
    """Generate the text of each message found in ``fd``.

    ``fd`` is any iterable of lines.  A line starting with 'From ' is held
    back in ``unixfrom`` until the next line shows what it is:

    * a line starting with whitespace (other than a bare newline) is a
      continuation and is added to the held text;
    * otherwise, if some message text has been collected and UnixFrom
      matches the held text plus the new line, the collected message is
      yielded and the held text starts the next message;
    * otherwise the held text was ordinary message text and is kept.

    A 'From ' line still held when the input ends is appended to the
    current message rather than lost.
    """
    data = []
    app = data.append
    unixfrom = ''
    for line in fd:
        if unixfrom:
            if line[0].isspace() and line[0] != '\n':
                unixfrom += line
                continue
            if data and UnixFrom.match(unixfrom+line) is not None:
                # Genuine separator: emit the finished message and start
                # the next one with the separator itself.
                yield ''.join(data)
                data[:] = [unixfrom]
            else:
                app(unixfrom)
            unixfrom = ''
        if line[:5] == 'From ':
            unixfrom = line
            continue
        app(line)
    if unixfrom:
        # Input ended on a held 'From ' line: nothing can follow it, so it
        # cannot be a separator - keep it as text.
        app(unixfrom)
    if data:
        yield ''.join(data)
def sort_messages(files):
    """Collect the messages of every file in ``files`` and return their
    texts as a list ordered by (date, message number, text).

    When at least one message was found the global LastDate is set to the
    date of the last message after sorting.
    """
    global LastDate
    messages = []
    for path in files:
        messages.extend(process_file(path))
    if not messages:
        return []
    # Entries are (date, msgnum, text) tuples, so a plain sort orders by date.
    messages.sort()
    LastDate = messages[-1][0]
    Debug(1, '''"last date = %r" % LastDate''')
    return [entry[2] for entry in messages]
def splitunixfrom(text):
    """Split a leading "From " separator off ``text``.

    Returns ``(unixfrom, rest)``.  ``unixfrom`` is the separator with any
    continuation lines folded onto one line (each newline-plus-whitespace
    run replaced by a single space), or '' when ``text`` does not start
    with a separator, in which case ``rest`` is ``text`` unchanged.
    """
    found = UnixFrom.match(text)
    if found is not None:
        # The pattern also consumes the first character of the line that
        # follows the separator; hand that character back.
        cut = found.end() - 1
        header = text[found.start():cut]
        return ' '.join(ContLine.split(header)), text[cut:]
    return '', text
def Debug(lvl, str):
    """Emit a debugging message through warn() when DebugLvl >= ``lvl``.

    lvl -- level of this message; nothing is done when DebugLvl is lower
    str -- source text of a Python expression, evaluated in the *caller's*
           globals and locals only when the message is going to be shown.
           The strings come from this program itself, never from input.

    The message is prefixed with the caller's "file:function" name, padded
    to a width of 35 + lvl.  If naming the frame or evaluating the
    expression fails, the failure is ignored (a traceback is printed when
    DebugLvl > 9) and whatever is in hand - possibly the unevaluated
    expression text - is shown instead.
    """
    if DebugLvl < lvl:
        return
    pad = ''
    # Frame of whoever called Debug().
    cf = sys._getframe(1)
    try:
        pad = _frame_name(cf)
        if str:
            str = eval(str, cf.f_globals, cf.f_locals)
    except (KeyboardInterrupt, SystemExit):
        raise
    except:
        # Deliberately best effort: a broken debug expression must not
        # take the program down.
        if DebugLvl > 9:
            import traceback
            traceback.print_exc()
    # Drop the frame reference promptly.
    del cf
    warn("%-*s %s" % (35+lvl, pad, str))
def _frame_name(frm, sep=os.sep):
code = frm.f_code
filename = code.co_filename
filename = filename[filename.rfind(sep)+1:]
self = frm.f_locals.get('self')
if self is None:
return '%s:%s' % (filename, code.co_name)
return '%s:%s.%s' % (filename, self.__class__.__name__, code.co_name)
def error(reason):
    """Write ``reason`` and a newline to <stderr>, then exit with status 1."""
    sys.stderr.write('%s\n' % reason)
    raise SystemExit(1)
def report(message):
    """Write ``message`` and a newline to <stdout> and flush it."""
    out = sys.stdout
    out.write('%s\n' % message)
    out.flush()
def usage(reason=''):
    """Print the optional ``reason``, the Usage text and the module
    docstring on <stderr>, then exit with status 1."""
    sys.stdout.flush()
    err = sys.stderr
    if reason:
        err.write('\t%s\n\n' % reason)
    # Only the program's base name goes into the help text.
    err.write(Usage % os.path.basename(sys.argv[0]))
    err.write(__doc__)
    sys.exit(1)
def warn(msg):
    """Write ``msg`` and a newline to <stderr>.

    When a --match pattern is set (DebugMatch), messages it does not match
    are dropped.  <stdout> is flushed first and <stderr> afterwards.
    """
    if DebugMatch is None or DebugMatch.search(msg) is not None:
        sys.stdout.flush()
        err = sys.stderr
        err.write('%s\n' % msg)
        err.flush()
def main():
in_files, out_file = args()
messages = sort_messages(in_files)
if not messages:
if Verbose:
report('No messages found in %r.' % in_files)
sys.exit(0)
try:
Debug(1, '''"Writing %s messages to %s" % (len(messages), out_file or '<stdout>')''')
if not out_file:
fd = sys.stdout
else:
fd = open(out_file, "w")
fd.write('\n\n'.join(messages))
fd.close()
if out_file and LastDate:
os.utime(out_file, (LastDate, LastDate))
except IOError, val:
error('Could not write "%s": %s' % (out_file or '<stdout>', str(val)))
if out_file and ListLinks:
print_links()
# Run main() only when executed as a script.  An interrupt from the keyboard
# (KeyboardInterrupt) ends the program quietly instead of with a traceback.
if __name__ == '__main__':
    try:
        main()
    except KeyboardInterrupt:
        pass