email_stats.py 5.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195
  1. #!/usr/bin/python
  2. #
  3. # email_stats.py
  4. #
  5. # Analytics on Email mbox files
  6. # - Time vs date plots of emails
  7. # - Recipients domains as a function of time
  8. #
  9. # Copyright 2015-2018, 2020 George C. Privon
  10. #
  11. # This program is free software: you can redistribute it and/or modify
  12. # it under the terms of the GNU General Public License as published by
  13. # the Free Software Foundation, either version 3 of the License, or
  14. # (at your option) any later version.
  15. #
  16. # This program is distributed in the hope that it will be useful,
  17. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  18. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  19. # GNU General Public License for more details.
  20. #
  21. # You should have received a copy of the GNU General Public License
  22. # along with this program. If not, see <http://www.gnu.org/licenses/>.
import argparse
import datetime
import mailbox
import re
from email.utils import parsedate_to_datetime

import matplotlib.pyplot as plt
import numpy as np
  29. parser = argparse.ArgumentParser()
  30. parser.add_argument('mbox', help='Mailbox to analyze.')
  31. parser.add_argument('--plotroot', '-p', default='email_stats', action='store',
  32. help='Root name for output plots.')
  33. parser.add_argument('--title', '-t', default='',
  34. action='store', help='Plot title root.')
  35. parser.add_argument('--sendercolors', '-s', default=False, action='store',
  36. help='Comma separated list of search strings for the \
  37. sender field. Each will be displayed with a different color.')
  38. parser.add_argument('--domains', '-d', action='store',
  39. default='gmail,hotmail,aol,comcast,yahoo',
  40. help='Comma separated list of domains to use in \
  41. categorizing recipient domains.')
  42. args = parser.parse_args()
  43. pldata = {}
  44. if args.sendercolors:
  45. slist = args.sendercolors.split(',')
  46. nsend = len(slist)
  47. for item in slist:
  48. pldata[item] = []
  49. pldata['unknown'] = []
  50. domains = args.domains.split(',')
  51. senders = {}
  52. a = mailbox.mbox(args.mbox)
  53. for msg in a:
  54. cid = None
  55. label = None
  56. if msg['date'] is not None:
  57. try:
  58. z = datetime.datetime.strptime(msg['date'], '%a, %d %b %Y %H:%M:%S %z')
  59. except ValueError:
  60. try:
  61. z = datetime.datetime.strptime(msg['date'], '%a, %d %b %Y %H:%M:%S %Z')
  62. except ValueError:
  63. print("Skipping message from " + msg['date'])
  64. continue
  65. dateID = r"${0:1d}-{1:02d}$".format(z.year, z.month)
  66. if dateID not in senders.keys():
  67. senders[dateID] = np.zeros(len(domains) + 1)
  68. dmatch = False
  69. for i, domain in enumerate(domains):
  70. if not(msg['To'] is None) and re.search(domain + '\.', msg['To']):
  71. senders[dateID][i+1] += 1
  72. dmatch = True
  73. if not dmatch:
  74. senders[dateID][0] += 1
  75. if args.sendercolors:
  76. for search in enumerate(slist):
  77. if re.search(search[1], msg['From'], re.IGNORECASE):
  78. cid, label = search
  79. pldata[label].append([z.date(), z.hour + z.minute/60.])
  80. break
  81. if cid is None:
  82. pldata['unknown'].append([z.date(), z.hour + z.minute/60.])
  83. # Email send times as a function of day
  84. fig, ax = plt.subplots(1, 1)
  85. ax.set_ylim([0, 24])
  86. ax.set_yticks(4*np.arange(7))
  87. ax.set_ylabel('Hour', fontsize='large')
  88. ax.set_xlabel('Date', fontsize='large')
  89. ax.minorticks_on()
  90. ax.set_title(args.title + ' - Email Send Times', fontsize='large')
  91. viridis = plt.get_cmap('viridis')
  92. fig2, ax2 = plt.subplots(1, 1)
  93. if args.sendercolors:
  94. for plid in enumerate(slist):
  95. ax.plot_date(np.array(pldata[plid[1]])[:, 0],
  96. np.array(pldata[plid[1]])[:, 1],
  97. color=viridis(plid[0] / (nsend + 1)),
  98. marker='.',
  99. #tz=z.tzname(),
  100. label=plid[1],
  101. alpha=0.3,
  102. xdate=True)
  103. if len(pldata['unknown']) > 0:
  104. ax.plot_date(np.array(pldata['unknown'])[:, 0],
  105. np.array(pldata['unknown'])[:, 1],
  106. color=viridis(1),
  107. marker='.',
  108. #tz=z.tzname(),
  109. label='unknown',
  110. alpha=0.3,
  111. xdate=True)
  112. if args.sendercolors:
  113. ax.legend(loc='upper left', ncol=nsend + 1)
  114. ax.tick_params(axis='x',
  115. labelrotation=90)
  116. if args.plotroot:
  117. fig.savefig(args.plotroot + '-send_times.png',
  118. bbox_inches='tight')
  119. else:
  120. fig.savefig('send_times.png',
  121. bbox_inches='tight')
  122. # Email destination domains as a function of month
  123. fig, ax = plt.subplots(1, 1)
  124. ax.set_ylabel(r'Emails sent to Domain', fontsize='large')
  125. ax.set_xlabel(r'Year$-$Month', fontsize='large')
  126. ax.minorticks_on()
  127. ax.set_title(args.title, fontsize='large')
  128. months = list(senders.keys())
  129. months.sort()
  130. total = np.zeros(len(months))
  131. for j in range(len(domains) + 1):
  132. domainlist = []
  133. if j == 0:
  134. label = 'Other'
  135. else:
  136. label = domains[j - 1]
  137. for month in months:
  138. domainlist.append(senders[month][j])
  139. if np.all(np.array(domainlist) == 0):
  140. # skip domains with zero emails sent over the entire duration
  141. continue
  142. ax.plot(np.arange(len(months)),
  143. domainlist,
  144. label=label)
  145. total += domainlist
  146. ax.plot(np.arange(len(months)),
  147. total,
  148. color='k',
  149. ls='--',
  150. alpha=0.4,
  151. label='All Emails')
  152. ax.annotate('N={0:1.0f}'.format(np.sum(total)),
  153. (0.99, 0.99),
  154. ha='right',
  155. va='top',
  156. xycoords='axes fraction')
  157. ax.legend(frameon=False, loc='best')
  158. ax.set_xticks(np.arange(len(months))[::6])
  159. ax.set_xticks(np.arange(len(months)), minor=True)
  160. ax.set_xticklabels(months[::6])
  161. ax.tick_params(axis='x',
  162. labelrotation=90)
  163. ax.set_ylim(bottom=0)
  164. if args.plotroot:
  165. fig.savefig(args.plotroot + '-destinations.png', bbox_inches='tight')
  166. else:
  167. fig.savefig('email_destinations.png', bbox_inches='tight')