email_stats.py 5.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170
  1. #!/usr/bin/python
  2. #
  3. # email_stats.py
  4. #
  5. # Analytics on Email mbox files
  6. # - Time vs date plots of emails
  7. # - Recipients domains as a function of time
  8. #
  9. # Copyright 2015-2017 George C. Privon
  10. #
  11. # This program is free software: you can redistribute it and/or modify
  12. # it under the terms of the GNU General Public License as published by
  13. # the Free Software Foundation, either version 3 of the License, or
  14. # (at your option) any later version.
  15. #
  16. # This program is distributed in the hope that it will be useful,
  17. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  18. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  19. # GNU General Public License for more details.
  20. #
  21. # You should have received a copy of the GNU General Public License
  22. # along with this program. If not, see <http://www.gnu.org/licenses/>.
  import argparse
  import datetime
  import email.utils
  import mailbox
  import re

  import cubehelix
  import matplotlib.pyplot as plt
  import numpy as np
  # Command-line interface: one positional mbox path, plus optional settings
  # for the output file root, the plot title and the sender search strings.
  parser = argparse.ArgumentParser()
  parser.add_argument('mbox', help='Mailbox to analyze.')
  parser.add_argument('--plotroot', '-p', default='email_stats', action='store',
                      help='Root name for output plots.')
  parser.add_argument('--title', '-t', default='', action='store',
                      help='Plot title root.')
  # Adjacent string literals are concatenated at compile time, so the help
  # text no longer picks up source indentation the way a backslash
  # continuation inside a single literal does.
  parser.add_argument('--sendercolors', '-s', default=False, action='store',
                      help='Comma separated list of search strings for the '
                           'sender field. Each will be displayed with a '
                           'different color.')
  # Parsed once at import; the rest of the script reads args.mbox,
  # args.plotroot, args.title and args.sendercolors.
  args = parser.parse_args()
  def _parse_date(raw):
      """Return the datetime encoded in a ``Date`` header value, or None.

      The standard-library RFC 2822 parser is used instead of fixed
      ``strptime`` formats so that common variants (missing weekday,
      trailing ``(PDT)`` style comments, named zones) are accepted too.
      None is returned when the value cannot be parsed.
      """
      try:
          return email.utils.parsedate_to_datetime(str(raw))
      except (TypeError, ValueError):
          # Depending on the Python version an unparsable date surfaces as
          # either TypeError or ValueError.
          return None


  # pldata maps a series label (one per sender search string, plus
  # 'unknown') to a list of [date, fractional hour] points.
  pldata = {}
  if args.sendercolors:
      slist = args.sendercolors.split(',')
      nsend = len(slist)
      for item in slist:
          pldata[item] = []
  # Always present: collects messages whose sender matched no search string.
  pldata['unknown'] = []

  # Recipient domains that are counted individually.
  domains = ['gmail',
             'hotmail',
             'aol',
             'comcast',
             'yahoo',
             'privon']
  # senders maps a "$YYYY-MM$" label to an array of message counts:
  # index 0 is "no listed domain matched", index i + 1 is domains[i].
  senders = {}

  # create=False: a mistyped path should fail loudly instead of silently
  # creating (and then analyzing) a new empty mailbox file.
  a = mailbox.mbox(args.mbox, create=False)

  for msg in a:
      cid = None
      label = None

      # Messages without a Date header cannot be placed on either plot.
      raw_date = msg['date']
      if raw_date is None:
          continue
      z = _parse_date(raw_date)
      if z is None:
          print("Skipping message from " + str(raw_date))
          continue

      # Monthly tally of recipient domains.
      dateID = r"${0:1d}-{1:02d}$".format(z.year, z.month)
      if dateID not in senders:
          senders[dateID] = np.zeros(len(domains) + 1)
      recipients = msg['To']
      dmatch = False
      if recipients is not None:
          for i, domain in enumerate(domains):
              # A message addressed to several listed domains is counted
              # once for each of them.
              if re.search(domain, str(recipients)):
                  senders[dateID][i+1] += 1
                  dmatch = True
      if not dmatch:
          senders[dateID][0] += 1

      # Send-time point, filed under the first matching sender search
      # string (case-insensitive regular-expression search).
      point = [z.date(), z.hour + z.minute/60.]
      sender = msg['From']
      # A missing From header cannot match anything; without this guard
      # re.search would raise TypeError on None.
      if args.sendercolors and sender is not None:
          for search in enumerate(slist):
              if re.search(search[1], str(sender), re.IGNORECASE):
                  cid, label = search
                  pldata[label].append(point)
                  break
      if cid is None:
          pldata['unknown'].append(point)
  # Email send times as a function of day
  def _plot_send_times(points, color, label):
      """Draw one series of [date, fractional hour] points as dot markers.

      Nothing is drawn for an empty series; slicing an empty array with
      ``[:, 0]`` would otherwise raise IndexError when a sender search
      string matched no message.
      """
      if not points:
          return
      dates, hours = zip(*points)
      # plot() handles date objects directly; linestyle='none' reproduces
      # the markers-only appearance of the deprecated plot_date().
      plt.plot(list(dates), list(hours),
               linestyle='none',
               marker='.',
               color=color,
               label=label)


  plt.figure()
  plt.ylim([0, 24])
  plt.yticks(4*np.arange(7))
  plt.ylabel('Hour', fontsize='large')
  plt.xlabel('Date', fontsize='large')
  plt.minorticks_on()
  plt.title(args.title + ' - Email Send Times', fontsize='large')

  scolor = cubehelix.cmap(startHue=240, endHue=-300,
                          minSat=1, maxSat=2.5,
                          minLight=.3, maxLight=.8,
                          gamma=.9)

  if args.sendercolors:
      # One color per search string, spaced along the colormap; the end of
      # the colormap (1) is left for the 'unknown' series.
      for plid in enumerate(slist):
          _plot_send_times(pldata[plid[1]],
                           scolor(plid[0] / (nsend + 1)),
                           plid[1])
  _plot_send_times(pldata['unknown'], scolor(1), 'unknown')

  if args.sendercolors:
      plt.legend(loc='upper left', ncol=nsend + 1)

  if args.plotroot:
      plt.savefig(args.plotroot + '-send_times.png')
  else:
      plt.savefig('send_times.png')
  # Email destination domains as a function of month
  plt.figure()
  plt.ylabel(r'Emails sent to Domain', fontsize='large')
  plt.xlabel(r'Year$-$Month', fontsize='large')
  plt.minorticks_on()
  plt.title(args.title, fontsize='large')

  # The "$YYYY-MM$" labels sort chronologically as plain strings.
  months = sorted(senders)
  positions = np.arange(len(months))

  # Column 0 of each monthly count array is the catch-all bucket; column
  # j (j >= 1) belongs to domains[j - 1].
  for j, label in enumerate(['Other'] + domains):
      domainlist = [senders[month][j] for month in months]
      plt.plot(positions,
               domainlist,
               label=label)

  plt.legend(frameon=False, loc='best')
  plt.xticks(positions, months, rotation=90)
  # Show only every sixth month label to keep the axis readable. gca() is
  # used because a bare plt.axes() call adds a new Axes to the figure
  # rather than returning the one that was just plotted on.
  ticklabels = plt.gca().get_xticklabels()
  plt.setp(ticklabels, visible=False)
  plt.setp(ticklabels[::6], visible=True)
  plt.ylim(bottom=0)

  if args.plotroot:
      plt.savefig(args.plotroot + '-destinations.png', bbox_inches='tight')
  else:
      plt.savefig('email_destinations.png', bbox_inches='tight')
  147. else:
  148. plt.savefig('email_destinations.png', bbox_inches='tight')