email_stats.py 5.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166
  1. #!/usr/bin/python
  2. #
  3. # email_stats.py
  4. #
  5. # Analytics on Email mbox files
  6. # - Time vs date plots of emails
  7. # - Recipients domains as a function of time
  8. #
  9. # Copyright 2015-2017 George C. Privon
  10. #
  11. # This program is free software: you can redistribute it and/or modify
  12. # it under the terms of the GNU General Public License as published by
  13. # the Free Software Foundation, either version 3 of the License, or
  14. # (at your option) any later version.
  15. #
  16. # This program is distributed in the hope that it will be useful,
  17. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  18. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  19. # GNU General Public License for more details.
  20. #
  21. # You should have received a copy of the GNU General Public License
  22. # along with this program. If not, see <http://www.gnu.org/licenses/>.
  23. import mailbox
  24. import datetime
  25. import matplotlib.pyplot as plt
  26. import argparse
  27. import re
  28. import numpy as np
  29. import cubehelix
  30. parser = argparse.ArgumentParser()
  31. parser.add_argument('mbox', help='Mailbox to analyze.')
  32. parser.add_argument('--plotroot', '-p', default='email_stats', action='store',
  33. help='Root name for output plots.')
  34. parser.add_argument('--title', '-t', default='',
  35. action='store', help='Plot title root.')
  36. parser.add_argument('--sendercolors', '-s', default=False, action='store',
  37. help='Comma separated list of search strings for the \
  38. sender field. Each will be displayed with a \
  39. different color.')
  40. args = parser.parse_args()
  41. pldata = {}
  42. if args.sendercolors:
  43. slist = args.sendercolors.split(',')
  44. nsend = len(slist)
  45. for item in slist:
  46. pldata[item] = []
  47. pldata['unknown'] = []
  48. domains = ['gmail',
  49. 'hotmail',
  50. 'aol']
  51. senders = {}
  52. a = mailbox.mbox(args.mbox)
  53. for msg in a:
  54. cid = None
  55. label = None
  56. if msg['date'] is not None:
  57. try:
  58. z = datetime.datetime.strptime(msg['date'], '%a, %d %b %Y %H:%M:%S %z')
  59. except ValueError:
  60. try:
  61. z = datetime.datetime.strptime(msg['date'], '%a, %d %b %Y %H:%M:%S %Z')
  62. except ValueError:
  63. print("Skipping message from " + msg['date'])
  64. continue
  65. dateID = r"${0:1d}-{1:02d}$".format(z.year, z.month)
  66. if dateID not in senders.keys():
  67. senders[dateID] = np.zeros(len(domains) + 1)
  68. dmatch = False
  69. for i, domain in enumerate(domains):
  70. if not(msg['To'] is None) and re.search(domain, msg['To']):
  71. senders[dateID][i+1] += 1
  72. dmatch = True
  73. if not dmatch:
  74. senders[dateID][0] += 1
  75. if args.sendercolors:
  76. for search in enumerate(slist):
  77. if re.search(search[1], msg['From'], re.IGNORECASE):
  78. cid, label = search
  79. pldata[label].append([z.date(), z.hour + z.minute/60.])
  80. break
  81. if cid is None:
  82. pldata['unknown'].append([z.date(), z.hour + z.minute/60.])
  83. # Email send times as a function of day
  84. plt.figure()
  85. plt.ylim([0, 24])
  86. plt.yticks(4*np.arange(7))
  87. plt.ylabel('Hour', fontsize='large')
  88. plt.xlabel('Date', fontsize='large')
  89. plt.minorticks_on()
  90. plt.title(args.title + ' - Email Send Times', fontsize='large')
  91. scolor = cubehelix.cmap(startHue=240, endHue=-300,
  92. minSat=1, maxSat=2.5,
  93. minLight=.3, maxLight=.8,
  94. gamma=.9)
  95. if args.sendercolors:
  96. for plid in enumerate(slist):
  97. plt.plot_date(np.array(pldata[plid[1]])[:, 0],
  98. np.array(pldata[plid[1]])[:, 1],
  99. color=scolor(plid[0] / (nsend + 1)),
  100. marker='.',
  101. #tz=z.tzname(),
  102. label=plid[1],
  103. xdate=True)
  104. if len(pldata['unknown']) > 0:
  105. plt.plot_date(np.array(pldata['unknown'])[:, 0],
  106. np.array(pldata['unknown'])[:, 1],
  107. color=scolor(1),
  108. marker='.',
  109. #tz=z.tzname(),
  110. label='unknown',
  111. xdate=True)
  112. if args.sendercolors:
  113. plt.legend(loc='upper left', ncol=nsend + 1)
  114. if args.plotroot:
  115. plt.savefig(args.plotroot + '-send_times.png')
  116. else:
  117. plt.savefig('send_times.png')
  118. # Email destination domains as a function of month
  119. plt.figure()
  120. plt.ylabel(r'Emails sent to Domain', fontsize='large')
  121. plt.xlabel(r'Year$-$Month', fontsize='large')
  122. plt.minorticks_on()
  123. plt.title(args.title, fontsize='large')
  124. months = list(senders.keys())
  125. months.sort()
  126. for j in range(len(domains) + 1):
  127. domainlist = []
  128. if j == 0:
  129. label = 'Other'
  130. else:
  131. label = domains[j - 1]
  132. for month in months:
  133. domainlist.append(senders[month][j])
  134. plt.plot(np.arange(len(months)),
  135. domainlist,
  136. label=label)
  137. plt.legend(frameon=False, loc='best')
  138. plt.xticks(np.arange(len(months)), months, rotation=90)
  139. plt.setp(plt.axes().get_xticklabels(), visible=False)
  140. plt.setp(plt.axes().get_xticklabels()[::6], visible=True)
  141. if args.plotroot:
  142. plt.savefig(args.plotroot + '-destinations.png', bbox_inches='tight')
  143. else:
  144. plt.savefig('email_destinations.png', bbox_inches='tight')