#!/usr/bin/python
#
# email_stats.py
#
# Analytics on Email mbox files
# - Time vs date plots of emails
# - Recipients domains as a function of time
#
# Copyright 2015-2018, 2020 George C. Privon
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

import argparse
import datetime
import mailbox
import re
from email.utils import parsedate_to_datetime

import matplotlib.pyplot as plt
import numpy as np

  29. parser = argparse.ArgumentParser()
  30. parser.add_argument('mbox', help='Mailbox to analyze.')
  31. parser.add_argument('--plotroot', '-p', default='email_stats', action='store',
  32. help='Root name for output plots.')
  33. parser.add_argument('--title', '-t', default='',
  34. action='store', help='Plot title root.')
  35. parser.add_argument('--sendercolors', '-s', default=False, action='store',
  36. help='Comma separated list of search strings for the \
  37. sender field. Each will be displayed with a different color.')
  38. parser.add_argument('--work_hours', action='store',
  39. default=None,
  40. help='If provided, shade work hours. Comma-separated \
  41. list.')
  42. parser.add_argument('--domains', '-d', action='store',
  43. default='gmail,hotmail,aol,comcast,yahoo',
  44. help='Comma separated list of domains to use in \
  45. categorizing recipient domains.')
  46. args = parser.parse_args()
  47. pldata = {}
  48. if args.sendercolors:
  49. slist = args.sendercolors.split(',')
  50. nsend = len(slist)
  51. for item in slist:
  52. pldata[item] = []
  53. pldata['unknown'] = []
  54. domains = args.domains.split(',')
  55. senders = {}
  56. a = mailbox.mbox(args.mbox)
  57. for msg in a:
  58. cid = None
  59. label = None
  60. if msg['date'] is not None:
  61. try:
  62. z = datetime.datetime.strptime(msg['date'], '%a, %d %b %Y %H:%M:%S %z')
  63. except ValueError:
  64. try:
  65. z = datetime.datetime.strptime(msg['date'], '%a, %d %b %Y %H:%M:%S %Z')
  66. except ValueError:
  67. print("Skipping message from " + msg['date'])
  68. continue
  69. dateID = r"${0:1d}-{1:02d}$".format(z.year, z.month)
  70. if dateID not in senders.keys():
  71. senders[dateID] = np.zeros(len(domains) + 1)
  72. dmatch = False
  73. for i, domain in enumerate(domains):
  74. if not(msg['To'] is None) and re.search(domain + '\.', msg['To']):
  75. senders[dateID][i+1] += 1
  76. dmatch = True
  77. if not dmatch:
  78. senders[dateID][0] += 1
  79. if args.sendercolors:
  80. for search in enumerate(slist):
  81. if re.search(search[1], msg['From'], re.IGNORECASE):
  82. cid, label = search
  83. pldata[label].append([z.date(), z.hour + z.minute/60.])
  84. break
  85. if cid is None:
  86. pldata['unknown'].append([z.date(), z.hour + z.minute/60.])
  87. # Email send times as a function of day
  88. fig, ax = plt.subplots(1, 1)
  89. ax.set_ylim([0, 24])
  90. ax.set_yticks(4*np.arange(7))
  91. ax.set_ylabel('Hour', fontsize='large')
  92. ax.set_xlabel('Date', fontsize='large')
  93. ax.minorticks_on()
  94. ax.set_title(args.title + ' - Email Send Times', fontsize='large')
  95. viridis = plt.get_cmap('viridis')
  96. fig2, ax2 = plt.subplots(1, 1)
  97. if args.sendercolors:
  98. for plid in enumerate(slist):
  99. ax.plot_date(np.array(pldata[plid[1]])[:, 0],
  100. np.array(pldata[plid[1]])[:, 1],
  101. color=viridis(plid[0] / (nsend + 1)),
  102. marker='.',
  103. #tz=z.tzname(),
  104. label=plid[1],
  105. alpha=0.3,
  106. xdate=True)
  107. if len(pldata['unknown']) > 0:
  108. ax.plot_date(np.array(pldata['unknown'])[:, 0],
  109. np.array(pldata['unknown'])[:, 1],
  110. color=viridis(1),
  111. marker='.',
  112. #tz=z.tzname(),
  113. label=None,
  114. alpha=0.3,
  115. xdate=True)
  116. if args.sendercolors:
  117. ax.legend(loc='upper left', ncol=nsend + 1)
  118. ax.tick_params(axis='x',
  119. labelrotation=90)
  120. if args.work_hours:
  121. work = [float(x) for x in args.work_hours.split(',')]
  122. ax.fill_between(ax.get_xlim(),
  123. y1=work[0],
  124. y2=work[1],
  125. color='gray',
  126. alpha=0.3,
  127. label='Work Hours')
  128. ax.legend(loc='best',
  129. fontsize='small')
  130. if args.plotroot:
  131. fig.savefig(args.plotroot + '-send_times.png',
  132. bbox_inches='tight')
  133. else:
  134. fig.savefig('send_times.png',
  135. bbox_inches='tight')
  136. # Email destination domains as a function of month
  137. fig, ax = plt.subplots(1, 1)
  138. ax.set_ylabel(r'Emails sent to Domain', fontsize='large')
  139. ax.set_xlabel(r'Year$-$Month', fontsize='large')
  140. ax.minorticks_on()
  141. ax.set_title(args.title, fontsize='large')
  142. months = list(senders.keys())
  143. months.sort()
  144. total = np.zeros(len(months))
  145. for j in range(len(domains) + 1):
  146. domainlist = []
  147. if j == 0:
  148. label = 'Other'
  149. else:
  150. label = domains[j - 1]
  151. for month in months:
  152. domainlist.append(senders[month][j])
  153. if np.all(np.array(domainlist) == 0):
  154. # skip domains with zero emails sent over the entire duration
  155. continue
  156. ax.plot(np.arange(len(months)),
  157. domainlist,
  158. label=label)
  159. total += domainlist
  160. ax.plot(np.arange(len(months)),
  161. total,
  162. color='k',
  163. ls='--',
  164. alpha=0.4,
  165. label='All Emails')
  166. ax.annotate('N={0:1.0f}'.format(np.sum(total)),
  167. (0.05, 0.95),
  168. ha='left',
  169. va='top',
  170. xycoords='axes fraction')
  171. ax.legend(frameon=False, loc='best')
  172. ax.set_xticks(np.arange(len(months))[::6])
  173. ax.set_xticks(np.arange(len(months)), minor=True)
  174. ax.set_xticklabels(months[::6])
  175. ax.tick_params(axis='x',
  176. labelrotation=90)
  177. ax.set_ylim(bottom=0)
  178. if args.plotroot:
  179. fig.savefig(args.plotroot + '-destinations.png', bbox_inches='tight')
  180. else:
  181. fig.savefig('email_destinations.png', bbox_inches='tight')