#!/usr/bin/python
#
# email_stats.py
#
# Analytics on Email mbox files
# - Time vs date plots of emails
# - Recipients domains as a function of time
#
# Copyright 2015-2018, 2020 George C. Privon
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

import re
import argparse
import mailbox
import datetime

import numpy as np

# Accepted layouts of the Date header, tried in order: numeric UTC offset
# first ("+0000"), then a named zone ("GMT").
DATE_FORMATS = ('%a, %d %b %Y %H:%M:%S %z',
                '%a, %d %b %Y %H:%M:%S %Z')


def build_parser():
    """Return the command-line argument parser for the script."""
    parser = argparse.ArgumentParser()
    parser.add_argument('mbox', help='Mailbox to analyze.')
    parser.add_argument('--plotroot', '-p', default='email_stats',
                        action='store',
                        help='Root name for output plots.')
    parser.add_argument('--title', '-t', default='', action='store',
                        help='Plot title root.')
    parser.add_argument('--sendercolors', '-s', default=False,
                        action='store',
                        help='Comma separated list of search strings for '
                             'the sender field. Each will be displayed '
                             'with a different color.')
    parser.add_argument('--work_hours', action='store', default=None,
                        help='If provided, shade work hours. '
                             'Comma-separated list.')
    parser.add_argument('--domains', '-d', action='store',
                        default='gmail,hotmail,aol,comcast,yahoo',
                        help='Comma separated list of domains to use in '
                             'categorizing recipient domains.')
    return parser


def parse_date(raw):
    """Parse a Date header value.

    Returns a ``datetime.datetime`` (timezone-aware when the header carries
    a numeric offset), or ``None`` when ``raw`` is ``None`` or matches none
    of ``DATE_FORMATS``.
    """
    if raw is None:
        return None
    # Header values are not always plain strings; strptime() only takes str.
    text = str(raw)
    for fmt in DATE_FORMATS:
        try:
            return datetime.datetime.strptime(text, fmt)
        except ValueError:
            continue
    return None


def parse_work_hours(spec):
    """Convert a ``'start,end'`` string into a ``(start, end)`` float pair.

    Only the first two values are used. Raises ``ValueError`` when fewer
    than two values are given or a value is not a number.
    """
    hours = [float(item) for item in spec.split(',')]
    if len(hours) < 2:
        raise ValueError('--work_hours needs a start and an end hour, '
                         'e.g. 9,17')
    return hours[0], hours[1]


def collect_stats(messages, domains, sender_searches=()):
    """Scan messages and gather the data needed for both plots.

    Parameters
    ----------
    messages : iterable
        Objects supporting ``msg['date']``, ``msg['To']`` and
        ``msg['From']`` lookups (e.g. a ``mailbox.mbox``). Missing headers
        are expected to be ``None``.
    domains : list of str
        Recipient domain search strings; each is matched as a regular
        expression followed by a literal dot.
    sender_searches : sequence of str
        Regular expressions matched case-insensitively against the From
        header. The first one that matches claims the message.

    Returns
    -------
    monthly : dict
        Maps a ``'$YYYY-MM$'`` label to an array of length
        ``len(domains) + 1``. Index 0 counts messages that matched no
        domain; index ``i + 1`` counts messages matching ``domains[i]``.
        A message addressed to several listed domains is counted once for
        each of them.
    send_times : dict
        Maps each sender search string, plus ``'unknown'``, to a list of
        ``(date, fractional_hour)`` tuples.

    Messages without a Date header are skipped silently; messages whose
    Date cannot be parsed are skipped with a printed notice.
    """
    # Compile once; these are loop invariants.
    domain_patterns = [re.compile(domain + r'\.') for domain in domains]
    sender_patterns = [(search, re.compile(search, re.IGNORECASE))
                       for search in sender_searches]

    monthly = {}
    send_times = {search: [] for search in sender_searches}
    send_times['unknown'] = []

    for msg in messages:
        raw_date = msg['date']
        if raw_date is None:
            continue
        when = parse_date(raw_date)
        if when is None:
            print("Skipping message from " + str(raw_date))
            continue

        month = r"${0:1d}-{1:02d}$".format(when.year, when.month)
        if month not in monthly:
            monthly[month] = np.zeros(len(domains) + 1)
        counts = monthly[month]

        recipients = msg['To']
        matched = False
        if recipients is not None:
            recipients = str(recipients)
            for slot, pattern in enumerate(domain_patterns, start=1):
                if pattern.search(recipients):
                    counts[slot] += 1
                    matched = True
        if not matched:
            counts[0] += 1

        # A message with no From header cannot match any search string,
        # so it is filed under 'unknown' instead of crashing re.search().
        sender = msg['From']
        bucket = 'unknown'
        if sender is not None:
            sender = str(sender)
            for search, pattern in sender_patterns:
                if pattern.search(sender):
                    bucket = search
                    break
        send_times[bucket].append((when.date(),
                                   when.hour + when.minute / 60.))

    return monthly, send_times


def plot_send_times(send_times, sender_searches, outfile, title='',
                    work_hours=None):
    """Plot hour-of-day against date for every message and save it.

    ``send_times`` is the second value returned by ``collect_stats``.
    ``work_hours`` is an optional ``(start, end)`` pair to shade.
    """
    # Imported here rather than at module level so the parsing helpers
    # can be imported without a plotting backend installed.
    import matplotlib.pyplot as plt

    fig, ax = plt.subplots(1, 1)
    ax.set_ylim([0, 24])
    ax.set_yticks(4*np.arange(7))
    ax.set_ylabel('Hour', fontsize='large')
    ax.set_xlabel('Date', fontsize='large')
    ax.minorticks_on()
    ax.set_title(title + ' - Email Send Times', fontsize='large')

    viridis = plt.get_cmap('viridis')
    nsend = len(sender_searches)

    for idx, search in enumerate(sender_searches):
        points = send_times[search]
        if not points:
            # nothing matched this search string; there is nothing to draw
            continue
        dates, hours = zip(*points)
        ax.plot(dates, hours,
                linestyle='none',
                marker='.',
                color=viridis(idx / (nsend + 1)),
                label=search,
                alpha=0.3)

    unknown = send_times['unknown']
    if unknown:
        dates, hours = zip(*unknown)
        # Colormaps treat an int argument as a lookup-table index, so a
        # float fraction is used: 'unknown' takes the slot after the last
        # sender colour (and the start of the map when no senders given).
        ax.plot(dates, hours,
                linestyle='none',
                marker='.',
                color=viridis(nsend / (nsend + 1)),
                label=None,
                alpha=0.3)

    if sender_searches:
        ax.legend(loc='upper left', ncol=nsend + 1)
    ax.tick_params(axis='x', labelrotation=90)

    if work_hours:
        # axhspan covers the full axes width whatever the final x-limits.
        ax.axhspan(work_hours[0], work_hours[1],
                   color='gray',
                   alpha=0.3,
                   label='Work Hours')
        ax.legend(loc='best', fontsize='small')

    fig.savefig(outfile, bbox_inches='tight')


def plot_destinations(monthly, domains, outfile, title=''):
    """Plot per-month recipient-domain counts and save the figure.

    ``monthly`` is the first value returned by ``collect_stats``.
    """
    # Imported lazily for the same reason as in plot_send_times().
    import matplotlib.pyplot as plt

    fig, ax = plt.subplots(1, 1)
    ax.set_ylabel(r'Emails sent to Domain', fontsize='large')
    ax.set_xlabel(r'Year$-$Month', fontsize='large')
    ax.minorticks_on()
    ax.set_title(title, fontsize='large')

    # Labels are '$YYYY-MM$', so a plain string sort is chronological.
    months = sorted(monthly)
    xpos = np.arange(len(months))
    total = np.zeros(len(months))

    for slot, label in enumerate(['Other'] + list(domains)):
        series = np.array([monthly[month][slot] for month in months])
        if not series.any():
            # skip domains with zero emails sent over the entire duration
            continue
        ax.plot(xpos, series, label=label)
        total += series

    ax.plot(xpos, total,
            color='k',
            ls='--',
            alpha=0.4,
            label='All Emails')
    ax.annotate('N={0:1.0f}'.format(np.sum(total)),
                (0.05, 0.95),
                ha='left',
                va='top',
                xycoords='axes fraction')
    ax.legend(frameon=False, loc='best')

    # Label every sixth month; the remaining months get minor ticks.
    ax.set_xticks(xpos[::6])
    ax.set_xticks(xpos, minor=True)
    ax.set_xticklabels(months[::6])
    ax.tick_params(axis='x', labelrotation=90)
    ax.set_ylim(bottom=0)

    fig.savefig(outfile, bbox_inches='tight')


def main(argv=None):
    """Parse the command line, scan the mailbox and write both plots."""
    parser = build_parser()
    args = parser.parse_args(argv)

    sender_searches = (args.sendercolors.split(',')
                       if args.sendercolors else [])
    domains = args.domains.split(',')

    work_hours = None
    if args.work_hours:
        try:
            work_hours = parse_work_hours(args.work_hours)
        except ValueError as err:
            parser.error(str(err))

    # create=False: a mistyped path is an error, not a new empty mailbox.
    try:
        box = mailbox.mbox(args.mbox, create=False)
    except mailbox.NoSuchMailboxError:
        parser.error('mailbox not found: ' + args.mbox)
    try:
        monthly, send_times = collect_stats(box, domains, sender_searches)
    finally:
        box.close()

    if args.plotroot:
        times_file = args.plotroot + '-send_times.png'
        dest_file = args.plotroot + '-destinations.png'
    else:
        times_file = 'send_times.png'
        dest_file = 'email_destinations.png'

    # Email send times as a function of day
    plot_send_times(send_times, sender_searches, times_file,
                    title=args.title, work_hours=work_hours)

    # Email destination domains as a function of month
    plot_destinations(monthly, domains, dest_file, title=args.title)


if __name__ == '__main__':
    main()