Prechádzať zdrojové kódy

merge email_destinations into email_stats

George C. Privon pred 10 rokmi
rodič
commit
2721c84eca
3 zmenené súbory, 81 pridaní a 130 odstránení
  1. 5 4
      README.md
  2. 0 104
      email_destinations.py
  3. 76 22
      email_stats.py

+ 5 - 4
README.md

@@ -5,6 +5,7 @@ A short script to do simple analysis of mbox email files.
 Currently the script:
 
 * plots time vs day of all emails, optionally color-coded by user-specified sender information (example below)
+* plots the number of emails delivered to specified domains, as a function of time
 
 In the future, I would like to add capabilities to:
 
@@ -22,7 +23,7 @@ For usage information:
 
 ```
 $ python email_stats.py -h
-usage: email_stats.py [-h] [--plotfile PLOTFILE] [--title TITLE]
+usage: email_stats.py [-h] [--plotroot PLOTROOT] [--title TITLE]
                       [--sendercolors SENDERCOLORS]
                       mbox
 
@@ -31,10 +32,10 @@ positional arguments:
 
 optional arguments:
   -h, --help            show this help message and exit
-  --plotfile PLOTFILE, -p PLOTFILE
-                        Name of output plotting file.
+  --plotroot PLOTROOT, -p PLOTROOT
+                        Root name for output plots.
   --title TITLE, -t TITLE
-                        Plot title.
+                        Plot title root.
   --sendercolors SENDERCOLORS, -s SENDERCOLORS
                         Comma separated list of search strings for the sender
                         field. Each will be displayed with a different color.

+ 0 - 104
email_destinations.py

@@ -1,104 +0,0 @@
-#!/usr/bin/python
-#
-# email_destinations.py
-#
-# Emails sent to specified domains as a function of time.
-#
-# Copyright (C) 2015 George C. Privon
-#
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program.  If not, see <http://www.gnu.org/licenses/>.
-
-
-import mailbox
-import datetime
-import matplotlib.pyplot as plt
-import matplotlib.dates as mdates
-import argparse
-import re
-import numpy as np
-import cubehelix
-
-
-parser = argparse.ArgumentParser()
-parser.add_argument('mbox', help='Mailbox to analyze.')
-parser.add_argument('--plotfile', '-p', default=False, action='store',
-                    help='Name of output plotting file.')
-parser.add_argument('--title', '-t', default='Email Destinations',
-                    action='store', help='Plot title.')
-args = parser.parse_args()
-
-plt.figure()
-plt.ylabel('Emails sent to Domain')
-plt.xlabel('Year-Month')
-plt.minorticks_on()
-plt.title(args.title)
-a = mailbox.mbox(args.mbox)
-
-scolor = cubehelix.cmap(startHue=240, endHue=-300,
-                        minSat=1, maxSat=2.5,
-                        minLight=.3, maxLight=.8,
-                        gamma=.9)
-
-domains = ['gmail',
-           'hotmail',
-           'aol']
-senders = {}
-for msg in a:
-    cid = None
-    label = None
-    if msg['date'] is not None:
-        try:
-            z = datetime.datetime.strptime(msg['date'], '%a, %d %b %Y %H:%M:%S %z')
-        except ValueError:
-            try:
-                z = datetime.datetime.strptime(msg['date'], '%a, %d %b %Y %H:%M:%S %Z')
-            except ValueError:
-                print("Skipping message from " + msg['date'])
-                continue
-        dateID = "{0:1d}-{1:2d}".format(z.year, z.month)
-        if not(dateID in senders.keys()):
-            senders[dateID] = np.zeros(len(domains) + 1)
-        dmatch = False
-        for i, domain in enumerate(domains):
-            if not(msg['To'] is None) and re.search(domain, msg['To']):
-                senders[dateID][i+1] += 1
-                dmatch = True
-        if not(dmatch):
-            senders[dateID][0] += 1
-
-months = list(senders.keys())
-months.sort()
-
-for j in range(len(domains) + 1):
-    domainlist = []
-    if j == 0:
-        label = 'Other'
-    else:
-        label = domains[j - 1]
-
-    for month in months:
-        domainlist.append(senders[month][j])
-    plt.plot(np.arange(len(months)),
-             domainlist,
-             label=label)
-
-plt.legend(frameon=False, loc='best')
-plt.xticks(np.arange(len(months)), months, rotation=90)
-plt.setp(plt.axes().get_xticklabels(), visible=False)
-plt.setp(plt.axes().get_xticklabels()[::6], visible=True)
-
-if args.plotfile:
-    plt.savefig(args.plotfile, bbox_inches='tight')
-else:
-    plt.savefig('email_destinations.png', bbox_inches='tight')

+ 76 - 22
email_stats.py

@@ -2,7 +2,9 @@
 #
 # email_stats.py
 #
-# Time vs date plot for an email mbox file.
+# Analytics on Email mbox files
+# - Time vs date plots of emails
+# - Recipient domains as a function of time
 #
 # Copyright (C) 2015 George C. Privon
 #
@@ -32,30 +34,16 @@ import cubehelix
 
 parser = argparse.ArgumentParser()
 parser.add_argument('mbox', help='Mailbox to analyze.')
-parser.add_argument('--plotfile', '-p', default=False, action='store',
-                    help='Name of output plotting file.')
-parser.add_argument('--title', '-t', default='Email Send Times',
-                    action='store', help='Plot title.')
+parser.add_argument('--plotroot', '-p', default='email_stats', action='store',
+                    help='Root name for output plots.')
+parser.add_argument('--title', '-t', default='',
+                    action='store', help='Plot title root.')
 parser.add_argument('--sendercolors', '-s', default=False, action='store',
                     help='Comma separated list of search strings for the \
                           sender field. Each will be displayed with a \
                           different color.')
 args = parser.parse_args()
 
-plt.figure()
-plt.ylim([0, 24])
-plt.yticks(4*np.arange(7))
-plt.ylabel('Hour')
-plt.xlabel('Date')
-plt.minorticks_on()
-plt.title(args.title)
-a = mailbox.mbox(args.mbox)
-
-scolor = cubehelix.cmap(startHue=240, endHue=-300,
-                        minSat=1, maxSat=2.5,
-                        minLight=.3, maxLight=.8,
-                        gamma=.9)
-
 pldata = {}
 if args.sendercolors:
     slist = args.sendercolors.split(',')
@@ -64,6 +52,13 @@ if args.sendercolors:
         pldata[item] = []
 pldata['unknown'] = []
 
+domains = ['gmail',
+           'hotmail',
+           'aol']
+senders = {}
+
+a = mailbox.mbox(args.mbox)
+
 for msg in a:
     cid = None
     label = None
@@ -76,6 +71,16 @@ for msg in a:
             except ValueError:
                 print("Skipping message from " + msg['date'])
                 continue
+        dateID = "{0:1d}-{1:2d}".format(z.year, z.month)
+        if not(dateID in senders.keys()):
+            senders[dateID] = np.zeros(len(domains) + 1)
+        dmatch = False
+        for i, domain in enumerate(domains):
+            if not(msg['To'] is None) and re.search(domain, msg['To']):
+                senders[dateID][i+1] += 1
+                dmatch = True
+        if not(dmatch):
+            senders[dateID][0] += 1
         if args.sendercolors:
             for search in enumerate(slist):
                 if re.search(search[1], msg['From'], re.IGNORECASE):
@@ -85,6 +90,21 @@ for msg in a:
         if cid is None:
             pldata['unknown'].append([z.date(), z.hour + z.minute/60.])
 
+# Email send times as a function of day
+
+plt.figure()
+plt.ylim([0, 24])
+plt.yticks(4*np.arange(7))
+plt.ylabel('Hour')
+plt.xlabel('Date')
+plt.minorticks_on()
+plt.title(args.title + ' - Email Send Times')
+
+scolor = cubehelix.cmap(startHue=240, endHue=-300,
+                        minSat=1, maxSat=2.5,
+                        minLight=.3, maxLight=.8,
+                        gamma=.9)
+
 if args.sendercolors:
     for plid in enumerate(slist):
         plt.plot_date(np.array(pldata[plid[1]])[:,0],
@@ -107,7 +127,41 @@ if len(pldata['unknown']) > 0:
 if args.sendercolors:
     plt.legend(loc='upper left', ncol=nsend + 1)
 
-if args.plotfile:
-    plt.savefig(args.plotfile)
+if args.plotroot:
+    plt.savefig(args.plotroot + '-send_times.png')
+else:
+    plt.savefig('send_times.png')
+
+
+# Email destination domains as a function of month
+plt.figure()
+plt.ylabel('Emails sent to Domain')
+plt.xlabel('Year-Month')
+plt.minorticks_on()
+plt.title(args.title)
+
+months = list(senders.keys())
+months.sort()
+
+for j in range(len(domains) + 1):
+    domainlist = []
+    if j == 0:
+        label = 'Other'
+    else:
+        label = domains[j - 1]
+
+    for month in months:
+        domainlist.append(senders[month][j])
+    plt.plot(np.arange(len(months)),
+             domainlist,
+             label=label)
+
+plt.legend(frameon=False, loc='best')
+plt.xticks(np.arange(len(months)), months, rotation=90)
+plt.setp(plt.axes().get_xticklabels(), visible=False)
+plt.setp(plt.axes().get_xticklabels()[::6], visible=True)
+
+if args.plotroot:
+    plt.savefig(args.plotroot + '-destinations.png', bbox_inches='tight')
 else:
-    plt.savefig('email_times.png')
+    plt.savefig('email_destinations.png', bbox_inches='tight')