diff --git a/.idea/workspace.xml b/.idea/workspace.xml
index 0d2a6c2..7780bd8 100644
--- a/.idea/workspace.xml
+++ b/.idea/workspace.xml
@@ -2,8 +2,14 @@
+
+
+
+
-
+
+
+
@@ -24,13 +30,13 @@
-
+
-
+
@@ -38,8 +44,8 @@
-
-
+
+
@@ -49,7 +55,7 @@
-
+
@@ -57,56 +63,14 @@
-
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
-
-
@@ -178,7 +143,7 @@
-
+
@@ -235,7 +200,7 @@
-
+
@@ -247,9 +212,6 @@
-
-
-
@@ -265,13 +227,6 @@
-
-
-
-
-
-
-
@@ -338,43 +293,72 @@
-
-
-
-
-
+
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/HtmlReader.py b/HtmlReader.py
index 52b98c9..e687599 100644
--- a/HtmlReader.py
+++ b/HtmlReader.py
@@ -62,8 +62,8 @@ class HtmlReader:
tbody = workout_summary_table.contents[0]
- v1_id = tbody.contents[0].contents[0].text
+ v1_id = tbody.contents[0].contents[0].text
# the OT_REPORT template has changed slightly over the past few months
# the summary is in different places for each type of template
@@ -100,6 +100,13 @@ class HtmlReader:
+ # reads all html files in a given directory and scrapes them
+ # we return the aggregated data as a dictionary called 'event'
+ # an event has the following keys: calories,splat_pts,steps,date,time,coach,template_version,avg_heart_rate,peak_heart_rate
+ # these keys are currently found in 3 different sections
+ # 1. this_class: the THIS CLASS row in the summary table
+ # 2. summary: data about date, time and coach
+ # 3. cardio: max and average heart rate data found in the tiles of the email template
def read_all(self, directory):
files = glob.glob(directory)
diff --git a/Untitled.ipynb b/Untitled.ipynb
index 5878ac9..04a5a35 100644
--- a/Untitled.ipynb
+++ b/Untitled.ipynb
@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
- "execution_count": 89,
+ "execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
@@ -16,7 +16,7 @@
},
{
"cell_type": "code",
- "execution_count": 257,
+ "execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
@@ -37,7 +37,7 @@
},
{
"cell_type": "code",
- "execution_count": 258,
+ "execution_count": 4,
"metadata": {},
"outputs": [
{
@@ -153,7 +153,7 @@
"4 159 191 "
]
},
- "execution_count": 258,
+ "execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
@@ -204,7 +204,7 @@
},
{
"cell_type": "code",
- "execution_count": 259,
+ "execution_count": 5,
"metadata": {},
"outputs": [
{
@@ -213,7 +213,7 @@
"array(['Shelby', 'Kevin', 'Stephen', 'Kate'], dtype=object)"
]
},
- "execution_count": 259,
+ "execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
@@ -224,7 +224,7 @@
},
{
"cell_type": "code",
- "execution_count": 260,
+ "execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
@@ -284,64 +284,19 @@
},
{
"cell_type": "code",
- "execution_count": 262,
+ "execution_count": 7,
"metadata": {},
"outputs": [
{
- "data": {
- "text/html": [
- "
\n",
- "\n",
- "
\n",
- " \n",
- "
\n",
- "
\n",
- "
calories
\n",
- "
coach
\n",
- "
\n",
- " \n",
- " \n",
- "
\n",
- "
Kevin
\n",
- "
777.400000
\n",
- "
Kevin
\n",
- "
\n",
- "
\n",
- "
Shelby
\n",
- "
804.900000
\n",
- "
Shelby
\n",
- "
\n",
- "
\n",
- "
Stephen
\n",
- "
810.666667
\n",
- "
Stephen
\n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " calories coach\n",
- "Kevin 777.400000 Kevin\n",
- "Shelby 804.900000 Shelby\n",
- "Stephen 810.666667 Stephen"
- ]
- },
- "execution_count": 262,
- "metadata": {},
- "output_type": "execute_result"
+ "ename": "NameError",
+ "evalue": "name 'cal_df' is not defined",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
+ "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mcal_df\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msort_values\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mascending\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mby\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'calories'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
+ "\u001b[0;31mNameError\u001b[0m: name 'cal_df' is not defined"
+ ]
}
],
"source": [
@@ -387,20 +342,19 @@
},
{
"cell_type": "code",
- "execution_count": 264,
+ "execution_count": 1,
"metadata": {},
"outputs": [
{
- "data": {
- "image/png": "\n",
- "text/plain": [
- ""
- ]
- },
- "metadata": {
- "needs_background": "light"
- },
- "output_type": "display_data"
+ "ename": "NameError",
+ "evalue": "name 'ot_df_cpy' is not defined",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
+ "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mline\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mot_df_cpy\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mplot\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'date'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'avg_heart_rate'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkind\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'line'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfigsize\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m20\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m5\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
+ "\u001b[0;31mNameError\u001b[0m: name 'ot_df_cpy' is not defined"
+ ]
}
],
"source": [
diff --git a/config.json b/config.json
index 14ced0d..0affc76 100644
--- a/config.json
+++ b/config.json
@@ -1 +1 @@
-{"last_run": "2018-11-26"}
\ No newline at end of file
+{"last_run": "2018-11-27"}
\ No newline at end of file
diff --git a/google/__pycache__/gmailWorker.cpython-37.pyc b/google/__pycache__/gmailWorker.cpython-37.pyc
new file mode 100644
index 0000000..0e17220
Binary files /dev/null and b/google/__pycache__/gmailWorker.cpython-37.pyc differ
diff --git a/google/gmailApi.py b/google/gmailApi.py
index 8053afb..a3fdaa4 100644
--- a/google/gmailApi.py
+++ b/google/gmailApi.py
@@ -38,44 +38,46 @@ class GmailApi:
self.service = build('gmail', 'v1', http=creds.authorize(Http()))
+ # get all labels associated with the authenticated user
def get_labels(self):
results = self.service.users().labels().list(userId="me").execute()
return results.get('labels', [])
+
+ # query gmail for emails.
+ # query format is standard gmail search queries
+ # e.g. 'after:11/27/2018' -> returns emails that were received after the given date
def get_ot_messages(self, query=''):
no_new_messages = True
- #gets the ids of all messages that match the OT LabelId and provided query
+ # gets the ids of all messages that match the OT LabelId and provided query
results = self.service.users().messages().list(userId="me", labelIds=[OT_LABEL_ID], q=query).execute()
saved_templates = load_already_parsed_message_ids()
- #if no query is provided we default to pull all data
- if(query == ''):
+ # if no query is provided we default to pull all data
+ if query == '':
no_new_messages = False
-
- #find out the ids of messages that are saved locally
+ # find out the ids of messages that are saved locally
for result in results['messages']:
- if(result['id'] not in saved_templates):
+ if result['id'] not in saved_templates:
no_new_messages = False
-
- #if no new messages are found in any case raise error to catch accordingly
- if(results['resultSizeEstimate'] == 0 or no_new_messages):
+ # if no new messages are found in any case raise error to catch accordingly
+ if results['resultSizeEstimate'] == 0 or no_new_messages:
raise Errors.NoMessagesFoundException(userId='me', labelIds=[OT_LABEL_ID], q=query)
-
log_msg = "Found {} new OT Email(s).".format(len(results["messages"]))
print(log_msg)
-
return results["messages"]
+ # get a single message by message_id
def get_message(self, message_id):
m_res = self.service.users().messages().get(id=message_id, userId='me').execute()
diff --git a/google/gmailWorker.py b/google/gmailWorker.py
new file mode 100644
index 0000000..32dfdaf
--- /dev/null
+++ b/google/gmailWorker.py
@@ -0,0 +1,36 @@
+from threading import Thread
+from google.gmailApi import GmailApi
+
+
+class GmailWorker(Thread):
+    """Worker thread that downloads queued Gmail messages and parses them.
+
+    Each queue item is a ``(message_id, parse)`` tuple: the message is
+    fetched with ``GmailApi.get_message`` and the result is passed to
+    ``parse(content, message_id)``.
+    """
+
+    def __init__(self, queue):
+        """Store the shared task queue and create this worker's Gmail client.
+
+        :param queue: queue whose items are ``(message_id, parse)`` tuples.
+        """
+        Thread.__init__(self)
+        self.queue = queue
+        self.gmail = GmailApi()
+
+    def run(self):
+        """Process tasks from the queue forever; never returns."""
+        while True:
+            message_id, parse = self.queue.get()
+            try:
+                content = self.gmail.get_message(message_id)
+                parse(content, message_id)
+            except Exception as error:
+                # Report the failure and keep the worker alive so one bad
+                # message does not stop the remaining tasks from being drained.
+                print('Failed to process message {}: {}'.format(message_id, error))
+            finally:
+                # Always acknowledge the task; otherwise a failed download or
+                # parse would leave queue.join() blocked forever.
+                self.queue.task_done()
diff --git a/htmlParser.py b/htmlParser.py
deleted file mode 100644
index 696035e..0000000
--- a/htmlParser.py
+++ /dev/null
@@ -1,20 +0,0 @@
-from html.parser import HTMLParser
-from bs4 import BeautifulSoup
-
-# class MyHTMLParser(HTMLParser):
-# def handle_starttag(self, tag, attrs):
-# print("Encountered a start tag:", tag)
-
-# def handle_endtag(self, tag):
-# print("Encountered an end tag :", tag)
-
-# def handle_data(self, data):
-# print("Encountered some data :", data)
-
-with open('test.html') as file:
- soup = BeautifulSoup(file)
- print(soup.prettify())
-
-
-
-
diff --git a/main.py b/main.py
index b4fcf62..0149753 100644
--- a/main.py
+++ b/main.py
@@ -2,7 +2,9 @@
from google.gmailApi import GmailApi
from google.Errors import NoMessagesFoundException
+from google.gmailWorker import GmailWorker
from pytime import pytime
+from queue import Queue
import HtmlReader
import base64
import json
@@ -10,6 +12,9 @@ import sys
import csv
+#get the config
+#currently only returns a json object with a single key
+#the key will tell us when we last ran the program
def get_config():
config_file = open('config.json')
config = json.load(config_file)
@@ -24,7 +29,8 @@ def save_config(config):
config_file.close()
-
+#write the file to disk.
+#name is generally of the format 'OTReport_{msg_id}.html', where msg_id is the Gmail message id
def write_to_html_file(html, name):
path = "./htmlFilesv2/{}".format(name)
file = open(path, "w")
@@ -33,6 +39,7 @@ def write_to_html_file(html, name):
file.close()
+#parses the raw byte content of an OT Email and writes it to ./htmlFilesv2 as an html file
def parse_message(msg_json, msg_id):
message_parts = msg_json["payload"]["parts"]
@@ -44,24 +51,40 @@ def parse_message(msg_json, msg_id):
file_name = 'OTReport_{}.html'.format(msg_id)
write_to_html_file(result, file_name)
+
def get_last_run_time(timestamp):
yesterday = str(pytime.before(timestamp, '1d')).split(' ')[0]
return yesterday
+
+#pulls the gmail data
+#creates 4 worker threads to speed up the download and parsing of emails
def pull_gmail_data(query=''):
gmail = GmailApi()
+ queue = Queue()
messages = gmail.get_ot_messages(query)
+ #start 4 worker threads to speed up the download and parsing of emails
+ for x in range(4):
+ print('starting worker')
+ worker = GmailWorker(queue)
+ worker.daemon = True
+ worker.start()
+
+
+ #push task into the queue as a tuple
+ #second item in the task tuple is always the parse_message function defined above
for message in messages:
id = message['id']
- m_res = gmail.get_message(id)
+ queue.put((id,parse_message))
- parse_message(m_res, id)
+
+ queue.join() #wait
def do_latest(config):
last_run = get_last_run_time(config['last_run'])
@@ -69,6 +92,8 @@ def do_latest(config):
pull_gmail_data(query)
+
+
def write_to_csv(events):
with open('events.csv', 'w', newline='') as csvFile:
field_names = ['calories', 'splat_pts', 'steps', 'date', 'time', 'coach', 'template_version', 'avg_heart_rate', 'peak_heart_rate']
@@ -110,6 +135,7 @@ def main():
last_run = get_last_run_time(config['last_run'])
query = 'after:{}'.format(last_run)
+
#try to find data
try:
pull_gmail_data(query)