email_stats.py 5.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167
  1. #!/usr/bin/python
  2. #
  3. # email_stats.py
  4. #
  5. # Analytics on Email mbox files
  6. # - Time vs date plots of emails
  7. # - Recipients domains as a function of time
  8. #
  9. # Copyright (C) 2015 George C. Privon
  10. #
  11. # This program is free software: you can redistribute it and/or modify
  12. # it under the terms of the GNU General Public License as published by
  13. # the Free Software Foundation, either version 3 of the License, or
  14. # (at your option) any later version.
  15. #
  16. # This program is distributed in the hope that it will be useful,
  17. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  18. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  19. # GNU General Public License for more details.
  20. #
  21. # You should have received a copy of the GNU General Public License
  22. # along with this program. If not, see <http://www.gnu.org/licenses/>.
  23. import mailbox
  24. import datetime
  25. import matplotlib.pyplot as plt
  26. import matplotlib.dates as mdates
  27. import argparse
  28. import re
  29. import numpy as np
  30. import cubehelix
  31. parser = argparse.ArgumentParser()
  32. parser.add_argument('mbox', help='Mailbox to analyze.')
  33. parser.add_argument('--plotroot', '-p', default='email_stats', action='store',
  34. help='Root name for output plots.')
  35. parser.add_argument('--title', '-t', default='',
  36. action='store', help='Plot title root.')
  37. parser.add_argument('--sendercolors', '-s', default=False, action='store',
  38. help='Comma separated list of search strings for the \
  39. sender field. Each will be displayed with a \
  40. different color.')
  41. args = parser.parse_args()
  42. pldata = {}
  43. if args.sendercolors:
  44. slist = args.sendercolors.split(',')
  45. nsend = len(slist)
  46. for item in slist:
  47. pldata[item] = []
  48. pldata['unknown'] = []
  49. domains = ['gmail',
  50. 'hotmail',
  51. 'aol']
  52. senders = {}
  53. a = mailbox.mbox(args.mbox)
  54. for msg in a:
  55. cid = None
  56. label = None
  57. if msg['date'] is not None:
  58. try:
  59. z = datetime.datetime.strptime(msg['date'], '%a, %d %b %Y %H:%M:%S %z')
  60. except ValueError:
  61. try:
  62. z = datetime.datetime.strptime(msg['date'], '%a, %d %b %Y %H:%M:%S %Z')
  63. except ValueError:
  64. print("Skipping message from " + msg['date'])
  65. continue
  66. dateID = "{0:1d}-{1:2d}".format(z.year, z.month)
  67. if not(dateID in senders.keys()):
  68. senders[dateID] = np.zeros(len(domains) + 1)
  69. dmatch = False
  70. for i, domain in enumerate(domains):
  71. if not(msg['To'] is None) and re.search(domain, msg['To']):
  72. senders[dateID][i+1] += 1
  73. dmatch = True
  74. if not(dmatch):
  75. senders[dateID][0] += 1
  76. if args.sendercolors:
  77. for search in enumerate(slist):
  78. if re.search(search[1], msg['From'], re.IGNORECASE):
  79. cid, label = search
  80. pldata[label].append([z.date(), z.hour + z.minute/60.])
  81. break
  82. if cid is None:
  83. pldata['unknown'].append([z.date(), z.hour + z.minute/60.])
  84. # Email send times as a function of day
  85. plt.figure()
  86. plt.ylim([0, 24])
  87. plt.yticks(4*np.arange(7))
  88. plt.ylabel('Hour')
  89. plt.xlabel('Date')
  90. plt.minorticks_on()
  91. plt.title(args.title + ' - Email Send Times')
  92. scolor = cubehelix.cmap(startHue=240, endHue=-300,
  93. minSat=1, maxSat=2.5,
  94. minLight=.3, maxLight=.8,
  95. gamma=.9)
  96. if args.sendercolors:
  97. for plid in enumerate(slist):
  98. plt.plot_date(np.array(pldata[plid[1]])[:,0],
  99. np.array(pldata[plid[1]])[:,1],
  100. color=scolor(plid[0] / (nsend + 1)),
  101. marker='.',
  102. #tz=z.tzname(),
  103. label=plid[1],
  104. xdate=True)
  105. if len(pldata['unknown']) > 0:
  106. plt.plot_date(np.array(pldata['unknown'])[:,0],
  107. np.array(pldata['unknown'])[:,1],
  108. color=scolor(1),
  109. marker='.',
  110. #tz=z.tzname(),
  111. label='unknown',
  112. xdate=True)
  113. if args.sendercolors:
  114. plt.legend(loc='upper left', ncol=nsend + 1)
  115. if args.plotroot:
  116. plt.savefig(args.plotroot + '-send_times.png')
  117. else:
  118. plt.savefig('send_times.png')
  119. # Email destination domains as a function of month
  120. plt.figure()
  121. plt.ylabel('Emails sent to Domain')
  122. plt.xlabel('Year-Month')
  123. plt.minorticks_on()
  124. plt.title(args.title)
  125. months = list(senders.keys())
  126. months.sort()
  127. for j in range(len(domains) + 1):
  128. domainlist = []
  129. if j == 0:
  130. label = 'Other'
  131. else:
  132. label = domains[j - 1]
  133. for month in months:
  134. domainlist.append(senders[month][j])
  135. plt.plot(np.arange(len(months)),
  136. domainlist,
  137. label=label)
  138. plt.legend(frameon=False, loc='best')
  139. plt.xticks(np.arange(len(months)), months, rotation=90)
  140. plt.setp(plt.axes().get_xticklabels(), visible=False)
  141. plt.setp(plt.axes().get_xticklabels()[::6], visible=True)
  142. if args.plotroot:
  143. plt.savefig(args.plotroot + '-destinations.png', bbox_inches='tight')
  144. else:
  145. plt.savefig('email_destinations.png', bbox_inches='tight')