#!/usr/bin/python
#
# email_stats.py
#
# Analytics on Email mbox files
# - Time vs date plots of emails
# - Recipients domains as a function of time
#
# Copyright 2015-2017 George C. Privon
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see http://www.gnu.org/licenses/.

import argparse
import datetime
import email.utils
import mailbox
import re

import numpy as np

# Recipient domains that get their own curve in the destinations plot.
# Everything else is accumulated under 'Other'.
DOMAINS = ('gmail', 'hotmail', 'aol', 'comcast', 'yahoo', 'privon')

# strptime formats tried (in order) before falling back to the more
# permissive RFC 2822 parser from the standard library.
DATE_FORMATS = ('%a, %d %b %Y %H:%M:%S %z',
                '%a, %d %b %Y %H:%M:%S %Z')


def parse_args(argv=None):
    """Parse the command line.

    argv: list of argument strings; None means use sys.argv[1:].
    Returns the argparse namespace with mbox, plotroot, title and
    sendercolors attributes.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('mbox', help='Mailbox to analyze.')
    parser.add_argument('--plotroot', '-p', default='email_stats',
                        action='store',
                        help='Root name for output plots.')
    parser.add_argument('--title', '-t', default='', action='store',
                        help='Plot title root.')
    parser.add_argument('--sendercolors', '-s', default=False,
                        action='store',
                        help='Comma separated list of search strings for '
                             'the sender field. Each will be displayed '
                             'with a different color.')
    return parser.parse_args(argv)


def parse_message_date(raw):
    """Convert a Date header value into a datetime.

    raw: the header value (string-like) or None.
    Returns a datetime.datetime, or None when the header is missing or
    cannot be interpreted by any of the supported parsers.
    """
    if raw is None:
        return None
    text = str(raw)
    for fmt in DATE_FORMATS:
        try:
            return datetime.datetime.strptime(text, fmt)
        except ValueError:
            continue
    # Headers with a trailing comment such as "(EST)", or without the
    # leading weekday, are rejected by the fixed formats above but are
    # handled by the standard-library RFC 2822 parser.
    try:
        return email.utils.parsedate_to_datetime(text)
    except (TypeError, ValueError):
        return None


def compile_domain_patterns(domains):
    """Return one compiled regex per domain, matching 'domain.'."""
    return [re.compile(domain + r'\.') for domain in domains]


def compile_sender_patterns(searches):
    """Return one case-insensitive compiled regex per sender search."""
    return [re.compile(search, re.IGNORECASE) for search in searches]


def count_domains(recipients, patterns):
    """Tally which known domains appear in a To header.

    recipients: the To header value, or None if the header is absent.
    patterns: compiled regexes from compile_domain_patterns().
    Returns an array of length len(patterns) + 1.  Element i + 1 is 1
    when patterns[i] matched; element 0 ('Other') is 1 when nothing
    matched, including when the header is missing.
    """
    hits = np.zeros(len(patterns) + 1)
    if recipients is not None:
        text = str(recipients)
        for i, pattern in enumerate(patterns):
            if pattern.search(text):
                hits[i + 1] = 1
    if not hits[1:].any():
        hits[0] = 1
    return hits


def match_sender(sender, patterns):
    """Return the index of the first pattern found in the From header.

    sender: the From header value, or None if the header is absent.
    patterns: compiled regexes from compile_sender_patterns().
    Returns None when the header is missing or no pattern matches.
    """
    if sender is None:
        return None
    text = str(sender)
    for idx, pattern in enumerate(patterns):
        if pattern.search(text):
            return idx
    return None


def collect_stats(messages, slist, domains=DOMAINS):
    """Scan messages and gather the data needed by both plots.

    messages: iterable of objects supporting msg['date'], msg['To'] and
        msg['From'] lookups (e.g. a mailbox.mbox).
    slist: sender search strings; may be empty.
    domains: recipient domains to count individually.

    Returns (senders, pldata):
      senders maps a '$YYYY-MM$' label to an array of per-domain counts
        (index 0 is 'Other', index i + 1 is domains[i]).
      pldata maps each entry of slist, plus 'unknown', to a list of
        [date, fractional hour] pairs.

    Messages without a Date header are skipped; messages whose Date
    header cannot be parsed are skipped with a notice on stdout.
    """
    domain_patterns = compile_domain_patterns(domains)
    sender_patterns = compile_sender_patterns(slist)

    senders = {}
    pldata = {item: [] for item in slist}
    pldata['unknown'] = []

    for msg in messages:
        raw_date = msg['date']
        if raw_date is None:
            continue
        z = parse_message_date(raw_date)
        if z is None:
            print("Skipping message from " + str(raw_date))
            continue

        date_id = r"${0:1d}-{1:02d}$".format(z.year, z.month)
        if date_id not in senders:
            senders[date_id] = np.zeros(len(domains) + 1)
        senders[date_id] += count_domains(msg['To'], domain_patterns)

        idx = match_sender(msg['From'], sender_patterns)
        key = 'unknown' if idx is None else slist[idx]
        pldata[key].append([z.date(), z.hour + z.minute / 60.])

    return senders, pldata


def plot_send_times(pldata, slist, title, plotroot):
    """Plot the hour of day each email was sent against its date.

    pldata, slist: as returned by / passed to collect_stats().
    title: plot title root.
    plotroot: output file name root; falsy means no prefix.
    Writes '<plotroot>-send_times.png' (or 'send_times.png').
    """
    # Imported here so the parsing helpers above remain usable on
    # systems without a plotting backend.
    import matplotlib.pyplot as plt

    plt.figure()
    plt.ylim([0, 24])
    plt.yticks(4 * np.arange(7))
    plt.ylabel('Hour', fontsize='large')
    plt.xlabel('Date', fontsize='large')
    plt.minorticks_on()
    plt.title(title + ' - Email Send Times', fontsize='large')

    viridis = plt.get_cmap('viridis')
    nsend = len(slist)
    series = [(key, viridis(idx / (nsend + 1)))
              for idx, key in enumerate(slist)]
    # The float 1.0 selects the far end of the colormap; an integer 1
    # would instead be read as lookup-table index 1, which is nearly the
    # same colour as the first sender.
    series.append(('unknown', viridis(1.0)))

    for key, color in series:
        points = pldata[key]
        if not points:
            # A search string that matched nothing has no points to draw.
            continue
        dates, hours = zip(*points)
        plt.plot(dates, hours,
                 linestyle='none',
                 marker='.',
                 color=color,
                 label=key)

    if slist:
        plt.legend(loc='upper left', ncol=nsend + 1)

    if plotroot:
        plt.savefig(plotroot + '-send_times.png')
    else:
        plt.savefig('send_times.png')


def plot_destinations(senders, title, plotroot, domains=DOMAINS):
    """Plot the number of emails sent to each domain per month.

    senders: mapping produced by collect_stats().
    title: plot title.
    plotroot: output file name root; falsy means no prefix.
    domains: the domain list that was used to build senders.
    Writes '<plotroot>-destinations.png' (or 'email_destinations.png').
    """
    import matplotlib.pyplot as plt

    plt.figure()
    plt.ylabel(r'Emails sent to Domain', fontsize='large')
    plt.xlabel(r'Year$-$Month', fontsize='large')
    plt.minorticks_on()
    plt.title(title, fontsize='large')

    months = sorted(senders)
    positions = np.arange(len(months))
    total = np.zeros(len(months))

    for j, label in enumerate(('Other',) + tuple(domains)):
        counts = np.array([senders[month][j] for month in months])
        if not counts.any():
            # skip domains with zero emails sent over the entire duration
            continue
        plt.plot(positions, counts, label=label)
        total += counts

    plt.plot(positions, total,
             color='k', ls='--', alpha=0.4, label='All Emails')
    plt.legend(frameon=False, loc='best')
    plt.xticks(positions, months, rotation=90)

    # Show only every sixth month label to keep the axis readable.  Use
    # the current axes: plt.axes() without arguments would add a new one.
    for i, ticklabel in enumerate(plt.gca().get_xticklabels()):
        ticklabel.set_visible(i % 6 == 0)
    plt.ylim(bottom=0)

    if plotroot:
        plt.savefig(plotroot + '-destinations.png', bbox_inches='tight')
    else:
        plt.savefig('email_destinations.png', bbox_inches='tight')


def main(argv=None):
    """Run the analysis: read the mbox, then write both plots."""
    args = parse_args(argv)
    slist = args.sendercolors.split(',') if args.sendercolors else []

    senders, pldata = collect_stats(mailbox.mbox(args.mbox), slist)

    plot_send_times(pldata, slist, args.title, args.plotroot)
    plot_destinations(senders, args.title, args.plotroot)


if __name__ == '__main__':
    main()