#!/usr/bin/python
#
# email_stats.py
#
# Analytics on Email mbox files
# - Time vs date plots of emails
# - Recipients domains as a function of time
#
# Copyright 2015-2018, 2020 George C. Privon
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import argparse
import datetime
import mailbox
import re
from email.utils import parsedate_to_datetime

import matplotlib.pyplot as plt
import numpy as np
# Command-line interface.  Every list-valued option is accepted as a single
# comma-separated string; the code that consumes it does the splitting.
parser = argparse.ArgumentParser(
    description='Analytics on Email mbox files.')
parser.add_argument('mbox', help='Mailbox to analyze.')
parser.add_argument('--plotroot', '-p', default='email_stats', action='store',
                    help='Root name for output plots.')
parser.add_argument('--title', '-t', default='',
                    action='store', help='Plot title root.')
# Multi-line help texts are written as adjacent string literals.  A backslash
# continuation *inside* a literal would embed the source indentation in the
# help text.
parser.add_argument('--sendercolors', '-s', default=False, action='store',
                    help='Comma separated list of search strings for the '
                         'sender field. Each will be displayed with a '
                         'different color.')
parser.add_argument('--work_hours', action='store',
                    default=None, metavar='START,END',
                    help='If provided, shade work hours. Comma-separated '
                         'list.')
parser.add_argument('--domains', '-d', action='store',
                    default='gmail,hotmail,aol,comcast,yahoo',
                    help='Comma separated list of domains to use in '
                         'categorizing recipient domains.')
args = parser.parse_args()
# pldata maps a sender category to a list of [date, fractional hour-of-day]
# points for the send-time scatter plot.  One category exists per
# --sendercolors search string, plus 'unknown' for messages whose From
# header matches none of them.
pldata = {}
sender_patterns = []
if args.sendercolors:
    slist = args.sendercolors.split(',')
    nsend = len(slist)
    for item in slist:
        pldata[item] = []
        # Search strings are regular expressions, matched case-insensitively.
        sender_patterns.append((item, re.compile(item, re.IGNORECASE)))
pldata['unknown'] = []

domains = args.domains.split(',')
# A domain counts as matched when its name, followed by a literal dot,
# occurs anywhere in the To header.  The name is escaped so characters such
# as '.' in a domain are not interpreted as regex syntax.
domain_patterns = [re.compile(re.escape(domain) + r'\.')
                   for domain in domains]

# senders maps a "$YYYY-MM$" label to an array of per-month message counts:
# index 0 counts messages matching none of the listed domains, index i + 1
# counts messages matching domains[i].
senders = {}

a = mailbox.mbox(args.mbox)
for msg in a:
    raw_date = msg['date']
    if raw_date is None:
        # Without a Date header the message cannot be placed on either plot.
        continue
    try:
        # Handles the RFC 2822 variants strptime rejects (missing weekday,
        # trailing "(TZ)" comment, named zones, ...).
        z = parsedate_to_datetime(str(raw_date))
    except (TypeError, ValueError):
        # Older Pythons raise TypeError, newer ones ValueError.
        print("Skipping message from " + str(raw_date))
        continue

    dateID = r"${0:1d}-{1:02d}$".format(z.year, z.month)
    if dateID not in senders:
        senders[dateID] = np.zeros(len(domains) + 1)

    # A missing To header matches no domain and is counted under index 0.
    to_field = msg['To']
    recipients = '' if to_field is None else str(to_field)
    dmatch = False
    for i, pattern in enumerate(domain_patterns):
        if pattern.search(recipients):
            senders[dateID][i + 1] += 1
            dmatch = True
    if not dmatch:
        senders[dateID][0] += 1

    # The first matching search string wins; a missing From header is
    # treated as an empty string instead of crashing the regex search.
    from_field = msg['From']
    sender = '' if from_field is None else str(from_field)
    category = 'unknown'
    for label, pattern in sender_patterns:
        if pattern.search(sender):
            category = label
            break
    pldata[category].append([z.date(), z.hour + z.minute/60.])

# Release the file handle once the scan is complete.
a.close()
# Email send times as a function of day
#
# Scatter plot of hour-of-day (y) against calendar date (x), one colour per
# --sendercolors category, written to "<plotroot>-send_times.png".
fig, ax = plt.subplots(1, 1)
ax.set_ylim([0, 24])
ax.set_yticks(4*np.arange(7))
ax.set_ylabel('Hour', fontsize='large')
ax.set_xlabel('Date', fontsize='large')
ax.minorticks_on()
ax.set_title(args.title + ' - Email Send Times', fontsize='large')

viridis = plt.get_cmap('viridis')

# (pldata key, colour, legend label) for every series to draw, in draw order.
series = []
if args.sendercolors:
    for idx, name in enumerate(slist):
        series.append((name, viridis(idx / (nsend + 1)), name))
if pldata['unknown']:
    # NOTE(review): an integer argument indexes the colormap's lookup table,
    # so viridis(1) is the second-darkest entry, not the top of the map.
    # Kept as-is to preserve the existing look of the plot.
    series.append(('unknown', viridis(1), None))

for key, color, legend_label in series:
    points = pldata[key]
    # Split the [date, hour] pairs by hand: slicing np.array(points)[:, 0]
    # fails on an empty list, which happens whenever a search string matched
    # no message.  An empty series still gets its legend entry.
    ax.plot([point[0] for point in points],
            [point[1] for point in points],
            linestyle='none',
            marker='.',
            color=color,
            label=legend_label,
            alpha=0.3)

if args.sendercolors:
    ax.legend(loc='upper left', ncol=nsend + 1)
ax.tick_params(axis='x',
               labelrotation=90)

if args.work_hours:
    try:
        # Only the first two values are used: start and end hour.
        work_start, work_end = [float(x)
                                for x in args.work_hours.split(',')][:2]
    except ValueError:
        # Raised both for non-numeric entries and for fewer than two values.
        parser.error('--work_hours expects two comma-separated numbers, '
                     'e.g. 9,17')
    # axhspan covers the full axes width and leaves the x-limits untouched.
    ax.axhspan(work_start,
               work_end,
               color='gray',
               alpha=0.3,
               label='Work Hours')
    # Redraw the legend so it includes the work-hours band.
    ax.legend(loc='best',
              fontsize='small')

if args.plotroot:
    fig.savefig(args.plotroot + '-send_times.png',
                bbox_inches='tight')
else:
    fig.savefig('send_times.png',
                bbox_inches='tight')
# Email destination domains as a function of month
#
# One line per recipient-domain category (plus a dashed total), with months
# on the x axis, written to "<plotroot>-destinations.png".
fig, ax = plt.subplots(1, 1)
ax.set_ylabel(r'Emails sent to Domain', fontsize='large')
ax.set_xlabel(r'Year$-$Month', fontsize='large')
ax.minorticks_on()
ax.set_title(args.title, fontsize='large')

# Month labels are "$YYYY-MM$" strings, so a plain sort is chronological.
months = sorted(senders)
month_positions = np.arange(len(months))
total = np.zeros(len(months))

# Column 0 of every per-month count array is the catch-all bucket; column
# k + 1 belongs to domains[k].
for column, label in enumerate(['Other'] + domains):
    domainlist = [senders[month][column] for month in months]
    if not any(domainlist):
        # skip domains with zero emails sent over the entire duration
        continue
    ax.plot(month_positions, domainlist, label=label)
    total += domainlist

ax.plot(month_positions,
        total,
        color='k',
        ls='--',
        alpha=0.4,
        label='All Emails')
ax.annotate('N={0:1.0f}'.format(np.sum(total)),
            (0.05, 0.95),
            ha='left',
            va='top',
            xycoords='axes fraction')
ax.legend(frameon=False, loc='best')

# Label every sixth month; mark every month with a minor tick.
ax.set_xticks(month_positions[::6])
ax.set_xticks(month_positions, minor=True)
ax.set_xticklabels(months[::6])
ax.tick_params(axis='x', labelrotation=90)
ax.set_ylim(bottom=0)

if args.plotroot:
    destinations_file = args.plotroot + '-destinations.png'
else:
    destinations_file = 'email_destinations.png'
fig.savefig(destinations_file, bbox_inches='tight')