| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167 |
- #!/usr/bin/python
- #
- # email_stats.py
- #
- # Analytics on Email mbox files
- # - Time vs date plots of emails
- # - Recipient domains as a function of time
- #
- # Copyright (C) 2015 George C. Privon
- #
- # This program is free software: you can redistribute it and/or modify
- # it under the terms of the GNU General Public License as published by
- # the Free Software Foundation, either version 3 of the License, or
- # (at your option) any later version.
- #
- # This program is distributed in the hope that it will be useful,
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- # GNU General Public License for more details.
- #
- # You should have received a copy of the GNU General Public License
- # along with this program. If not, see <http://www.gnu.org/licenses/>.
- import mailbox
- import datetime
- import matplotlib.pyplot as plt
- import matplotlib.dates as mdates
- import argparse
- import re
- import numpy as np
- import cubehelix
# Command-line interface: one positional mailbox path plus optional settings
# for the output file root, the plot title, and per-sender colouring.
parser = argparse.ArgumentParser()
parser.add_argument(
    'mbox',
    help='Mailbox to analyze.',
)
parser.add_argument(
    '--plotroot', '-p',
    action='store',
    default='email_stats',
    help='Root name for output plots.',
)
parser.add_argument(
    '--title', '-t',
    action='store',
    default='',
    help='Plot title root.',
)
# Stays False when the option is absent, so the truthiness checks on
# args.sendercolors further down skip all sender-specific handling.
parser.add_argument(
    '--sendercolors', '-s',
    action='store',
    default=False,
    help=('Comma separated list of search strings for the '
          'sender field. Each will be displayed with a '
          'different color.'),
)
args = parser.parse_args()
# Send-time points grouped by sender search string. Each group is a list of
# [date, hour-of-day] pairs; 'unknown' collects messages matching no string.
pldata = dict()
if args.sendercolors:
    slist = args.sendercolors.split(',')
    nsend = len(slist)
    pldata.update((pattern, []) for pattern in slist)
pldata['unknown'] = []

# Recipient domains that get their own counter in the monthly tallies.
domains = ['gmail', 'hotmail', 'aol']
# Monthly tallies keyed by a year-month string; filled while reading the mbox.
senders = dict()
# Walk the mailbox once, filling two accumulators:
#   senders[year-month] -> per-domain recipient counts (index 0 is "other")
#   pldata[label]       -> [date, hour-of-day] points per sender search string
a = mailbox.mbox(args.mbox)
for msg in a:
    # Without a Date header the message cannot be placed on either plot.
    if msg['date'] is None:
        continue

    # Try a numeric UTC offset (%z) first, then a timezone name (%Z).
    for datefmt in ('%a, %d %b %Y %H:%M:%S %z', '%a, %d %b %Y %H:%M:%S %Z'):
        try:
            z = datetime.datetime.strptime(msg['date'], datefmt)
        except ValueError:
            continue
        break
    else:
        # Neither format matched: report and move on to the next message.
        print("Skipping message from " + msg['date'])
        continue

    # Zero-padded month keeps the key readable as a tick label and still
    # sorts chronologically as a plain string.
    dateID = "{0:d}-{1:02d}".format(z.year, z.month)
    if dateID not in senders:
        senders[dateID] = np.zeros(len(domains) + 1)

    # A message is counted once for every listed domain found in its To
    # header, and once under index 0 when none is found (or To is absent).
    dmatch = False
    recipients = msg['To']
    if recipients is not None:
        for i, domain in enumerate(domains):
            if re.search(domain, recipients):
                senders[dateID][i + 1] += 1
                dmatch = True
    if not dmatch:
        senders[dateID][0] += 1

    # File the send time under the first matching sender search string.
    # A missing From header cannot match anything, so it lands in 'unknown'.
    label = 'unknown'
    sender = msg['From']
    if args.sendercolors and sender is not None:
        for search in slist:
            if re.search(search, sender, re.IGNORECASE):
                label = search
                break
    pldata[label].append([z.date(), z.hour + z.minute / 60.])
# Email send times as a function of day
plt.figure()
plt.ylim([0, 24])
plt.yticks(4*np.arange(7))  # one tick every four hours: 0, 4, ..., 24
plt.ylabel('Hour')
plt.xlabel('Date')
plt.minorticks_on()
plt.title(args.title + ' - Email Send Times')
scolor = cubehelix.cmap(startHue=240, endHue=-300,
                        minSat=1, maxSat=2.5,
                        minLight=.3, maxLight=.8,
                        gamma=.9)


def _plot_send_times(points, color, label):
    """Plot one group's [date, hour] points; do nothing for an empty group."""
    # An empty list converts to a 1-D array, and slicing it by column would
    # raise IndexError, so groups with no messages are skipped.
    if len(points) == 0:
        return
    table = np.array(points)
    plt.plot_date(table[:, 0],
                  table[:, 1],
                  color=color,
                  marker='.',
                  label=label,
                  xdate=True)


if args.sendercolors:
    # Colour positions are spread over [0, 1); position 1 is kept for 'unknown'.
    for idx, search in enumerate(slist):
        _plot_send_times(pldata[search], scolor(idx / (nsend + 1)), search)
_plot_send_times(pldata['unknown'], scolor(1), 'unknown')
if args.sendercolors:
    plt.legend(loc='upper left', ncol=nsend + 1)
if args.plotroot:
    plt.savefig(args.plotroot + '-send_times.png')
else:
    plt.savefig('send_times.png')
# Email destination domains as a function of month
plt.figure()
plt.ylabel('Emails sent to Domain')
plt.xlabel('Year-Month')
plt.minorticks_on()
plt.title(args.title)
months = sorted(senders)
positions = np.arange(len(months))
# Index 0 of every monthly tally is the catch-all; the listed domains follow
# in order, so the labels line up with the tally indices.
for j, label in enumerate(['Other'] + domains):
    plt.plot(positions,
             [senders[month][j] for month in months],
             label=label)
plt.legend(frameon=False, loc='best')
plt.xticks(positions, months, rotation=90)
# Thin the month labels to every sixth one. gca() returns the Axes that was
# just drawn on; calling axes() with no arguments would add a new one instead.
ticklabels = plt.gca().get_xticklabels()
if ticklabels:
    plt.setp(ticklabels, visible=False)
    plt.setp(ticklabels[::6], visible=True)
if args.plotroot:
    plt.savefig(args.plotroot + '-destinations.png', bbox_inches='tight')
else:
    plt.savefig('email_destinations.png', bbox_inches='tight')
|