email_stats.py 5.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185
  1. #!/usr/bin/python
  2. #
  3. # email_stats.py
  4. #
  5. # Analytics on Email mbox files
  6. # - Time vs date plots of emails
  7. # - Recipients domains as a function of time
  8. #
  9. # Copyright 2015-2017 George C. Privon
  10. #
  11. # This program is free software: you can redistribute it and/or modify
  12. # it under the terms of the GNU General Public License as published by
  13. # the Free Software Foundation, either version 3 of the License, or
  14. # (at your option) any later version.
  15. #
  16. # This program is distributed in the hope that it will be useful,
  17. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  18. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  19. # GNU General Public License for more details.
  20. #
  21. # You should have received a copy of the GNU General Public License
  22. # along with this program. If not, see <http://www.gnu.org/licenses/>.
  23. import mailbox
  24. import datetime
  25. import matplotlib.pyplot as plt
  26. import argparse
  27. import re
  28. import numpy as np
  29. parser = argparse.ArgumentParser()
  30. parser.add_argument('mbox', help='Mailbox to analyze.')
  31. parser.add_argument('--plotroot', '-p', default='email_stats', action='store',
  32. help='Root name for output plots.')
  33. parser.add_argument('--title', '-t', default='',
  34. action='store', help='Plot title root.')
  35. parser.add_argument('--sendercolors', '-s', default=False, action='store',
  36. help='Comma separated list of search strings for the \
  37. sender field. Each will be displayed with a \
  38. different color.')
  39. args = parser.parse_args()
  40. pldata = {}
  41. if args.sendercolors:
  42. slist = args.sendercolors.split(',')
  43. nsend = len(slist)
  44. for item in slist:
  45. pldata[item] = []
  46. pldata['unknown'] = []
  47. domains = ['gmail',
  48. 'hotmail',
  49. 'aol',
  50. 'comcast',
  51. 'yahoo',
  52. 'privon']
  53. senders = {}
  54. a = mailbox.mbox(args.mbox)
  55. for msg in a:
  56. cid = None
  57. label = None
  58. if msg['date'] is not None:
  59. try:
  60. z = datetime.datetime.strptime(msg['date'], '%a, %d %b %Y %H:%M:%S %z')
  61. except ValueError:
  62. try:
  63. z = datetime.datetime.strptime(msg['date'], '%a, %d %b %Y %H:%M:%S %Z')
  64. except ValueError:
  65. print("Skipping message from " + msg['date'])
  66. continue
  67. dateID = r"${0:1d}-{1:02d}$".format(z.year, z.month)
  68. if dateID not in senders.keys():
  69. senders[dateID] = np.zeros(len(domains) + 1)
  70. dmatch = False
  71. for i, domain in enumerate(domains):
  72. if not(msg['To'] is None) and re.search(domain + '\.', msg['To']):
  73. senders[dateID][i+1] += 1
  74. dmatch = True
  75. if not dmatch:
  76. senders[dateID][0] += 1
  77. if args.sendercolors:
  78. for search in enumerate(slist):
  79. if re.search(search[1], msg['From'], re.IGNORECASE):
  80. cid, label = search
  81. pldata[label].append([z.date(), z.hour + z.minute/60.])
  82. break
  83. if cid is None:
  84. pldata['unknown'].append([z.date(), z.hour + z.minute/60.])
  85. # Email send times as a function of day
  86. plt.figure()
  87. plt.ylim([0, 24])
  88. plt.yticks(4*np.arange(7))
  89. plt.ylabel('Hour', fontsize='large')
  90. plt.xlabel('Date', fontsize='large')
  91. plt.minorticks_on()
  92. plt.title(args.title + ' - Email Send Times', fontsize='large')
  93. viridis = plt.get_cmap('viridis')
  94. if args.sendercolors:
  95. for plid in enumerate(slist):
  96. plt.plot_date(np.array(pldata[plid[1]])[:, 0],
  97. np.array(pldata[plid[1]])[:, 1],
  98. color=viridis(plid[0] / (nsend + 1)),
  99. marker='.',
  100. #tz=z.tzname(),
  101. label=plid[1],
  102. xdate=True)
  103. if len(pldata['unknown']) > 0:
  104. plt.plot_date(np.array(pldata['unknown'])[:, 0],
  105. np.array(pldata['unknown'])[:, 1],
  106. color=viridis(1),
  107. marker='.',
  108. #tz=z.tzname(),
  109. label='unknown',
  110. xdate=True)
  111. if args.sendercolors:
  112. plt.legend(loc='upper left', ncol=nsend + 1)
  113. if args.plotroot:
  114. plt.savefig(args.plotroot + '-send_times.png')
  115. else:
  116. plt.savefig('send_times.png')
  117. # Email destination domains as a function of month
  118. plt.figure()
  119. plt.ylabel(r'Emails sent to Domain', fontsize='large')
  120. plt.xlabel(r'Year$-$Month', fontsize='large')
  121. plt.minorticks_on()
  122. plt.title(args.title, fontsize='large')
  123. months = list(senders.keys())
  124. months.sort()
  125. total = np.zeros(len(months))
  126. for j in range(len(domains) + 1):
  127. domainlist = []
  128. if j == 0:
  129. label = 'Other'
  130. else:
  131. label = domains[j - 1]
  132. for month in months:
  133. domainlist.append(senders[month][j])
  134. if np.all(np.array(domainlist) == 0):
  135. # skip domains with zero emails sent over the entire duration
  136. continue
  137. plt.plot(np.arange(len(months)),
  138. domainlist,
  139. label=label)
  140. total += domainlist
  141. plt.plot(np.arange(len(months)),
  142. total,
  143. color='k',
  144. ls='--',
  145. alpha=0.4,
  146. label='All Emails')
  147. plt.annotate('N={0:1.0f}'.format(np.sum(total)),
  148. (0.99, 0.99),
  149. ha='right',
  150. va='top',
  151. xycoords='axes fraction')
  152. plt.legend(frameon=False, loc='best')
  153. plt.xticks(np.arange(len(months)), months, rotation=90)
  154. plt.setp(plt.axes().get_xticklabels(), visible=False)
  155. plt.setp(plt.axes().get_xticklabels()[::6], visible=True)
  156. plt.ylim(bottom=0)
  157. if args.plotroot:
  158. plt.savefig(args.plotroot + '-destinations.png', bbox_inches='tight')
  159. else:
  160. plt.savefig('email_destinations.png', bbox_inches='tight')