worker thread and cleanup

This commit is contained in:
Alexander
2018-11-28 08:50:35 -06:00
parent e0a34f7ab7
commit 3dc5cc3d91
9 changed files with 168 additions and 195 deletions

156
.idea/workspace.xml generated
View File

@@ -2,8 +2,14 @@
<project version="4"> <project version="4">
<component name="ChangeListManager"> <component name="ChangeListManager">
<list default="true" id="7bbe5005-15ef-4d5e-b36b-58084d0f70eb" name="Default Changelist" comment=""> <list default="true" id="7bbe5005-15ef-4d5e-b36b-58084d0f70eb" name="Default Changelist" comment="">
<change afterPath="$PROJECT_DIR$/google/gmailWorker.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
<change beforePath="$PROJECT_DIR$/HtmlReader.py" beforeDir="false" afterPath="$PROJECT_DIR$/HtmlReader.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/Untitled.ipynb" beforeDir="false" afterPath="$PROJECT_DIR$/Untitled.ipynb" afterDir="false" />
<change beforePath="$PROJECT_DIR$/config.json" beforeDir="false" afterPath="$PROJECT_DIR$/config.json" afterDir="false" /> <change beforePath="$PROJECT_DIR$/config.json" beforeDir="false" afterPath="$PROJECT_DIR$/config.json" afterDir="false" />
<change beforePath="$PROJECT_DIR$/events.csv" beforeDir="false" afterPath="$PROJECT_DIR$/events.csv" afterDir="false" /> <change beforePath="$PROJECT_DIR$/google/gmailApi.py" beforeDir="false" afterPath="$PROJECT_DIR$/google/gmailApi.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/htmlParser.py" beforeDir="false" />
<change beforePath="$PROJECT_DIR$/main.py" beforeDir="false" afterPath="$PROJECT_DIR$/main.py" afterDir="false" />
</list> </list>
<option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" /> <option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
<option name="SHOW_DIALOG" value="false" /> <option name="SHOW_DIALOG" value="false" />
@@ -24,13 +30,13 @@
</usages-collector> </usages-collector>
<usages-collector id="statistics.file.extensions.open"> <usages-collector id="statistics.file.extensions.open">
<counts> <counts>
<entry key="csv" value="1" /> <entry key="csv" value="2" />
<entry key="gitattributes" value="2" /> <entry key="gitattributes" value="2" />
<entry key="gitignore" value="1" /> <entry key="gitignore" value="1" />
<entry key="html" value="13" /> <entry key="html" value="13" />
<entry key="ipynb" value="3" /> <entry key="ipynb" value="3" />
<entry key="json" value="7" /> <entry key="json" value="7" />
<entry key="py" value="12" /> <entry key="py" value="15" />
</counts> </counts>
</usages-collector> </usages-collector>
<usages-collector id="statistics.file.types.open"> <usages-collector id="statistics.file.types.open">
@@ -38,8 +44,8 @@
<entry key="HTML" value="13" /> <entry key="HTML" value="13" />
<entry key="IPNB" value="3" /> <entry key="IPNB" value="3" />
<entry key="JSON" value="7" /> <entry key="JSON" value="7" />
<entry key="PLAIN_TEXT" value="4" /> <entry key="PLAIN_TEXT" value="5" />
<entry key="Python" value="12" /> <entry key="Python" value="15" />
</counts> </counts>
</usages-collector> </usages-collector>
<usages-collector id="statistics.file.extensions.edit"> <usages-collector id="statistics.file.extensions.edit">
@@ -49,7 +55,7 @@
<entry key="gitignore" value="45" /> <entry key="gitignore" value="45" />
<entry key="html" value="35" /> <entry key="html" value="35" />
<entry key="json" value="28" /> <entry key="json" value="28" />
<entry key="py" value="7376" /> <entry key="py" value="9380" />
</counts> </counts>
</usages-collector> </usages-collector>
<usages-collector id="statistics.file.types.edit"> <usages-collector id="statistics.file.types.edit">
@@ -57,56 +63,14 @@
<entry key="HTML" value="35" /> <entry key="HTML" value="35" />
<entry key="JSON" value="28" /> <entry key="JSON" value="28" />
<entry key="PLAIN_TEXT" value="172" /> <entry key="PLAIN_TEXT" value="172" />
<entry key="Python" value="7376" /> <entry key="Python" value="9380" />
</counts> </counts>
</usages-collector> </usages-collector>
<usages-collector id="statistics.vcs.git.usages" /> <usages-collector id="statistics.vcs.git.usages" />
</session> </session>
</component> </component>
<component name="FileEditorManager"> <component name="FileEditorManager">
<leaf SIDE_TABS_SIZE_LIMIT_KEY="300"> <leaf SIDE_TABS_SIZE_LIMIT_KEY="300" />
<file pinned="false" current-in-tab="true">
<entry file="file://$PROJECT_DIR$/main.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="1690">
<caret line="130" column="23" selection-start-line="130" selection-start-column="23" selection-end-line="130" selection-end-column="23" />
<folding>
<element signature="e#2#38#0" expanded="true" />
<marker date="1542684898017" expanded="true" signature="345:474" ph="..." />
<marker date="1542684898017" expanded="true" signature="1506:2313" ph="..." />
</folding>
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/HtmlReader.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="1703">
<caret line="132" column="13" selection-start-line="132" selection-start-column="13" selection-end-line="132" selection-end-column="13" />
<folding>
<element signature="e#2#31#0" expanded="true" />
<marker date="1542684886113" expanded="true" signature="792:941" ph="..." />
<marker date="1542684886113" expanded="true" signature="2066:2075" ph="..." />
<marker date="1542684886113" expanded="true" signature="2066:2635" ph="..." />
</folding>
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/google/gmailApi.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="533">
<caret line="49" column="82" selection-start-line="49" selection-start-column="82" selection-end-line="49" selection-end-column="82" />
<folding>
<element signature="e#1#38#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
</file>
</leaf>
</component> </component>
<component name="FileTemplateManagerImpl"> <component name="FileTemplateManagerImpl">
<option name="RECENT_TEMPLATES"> <option name="RECENT_TEMPLATES">
@@ -151,10 +115,11 @@
<option value="$PROJECT_DIR$/.gitignore" /> <option value="$PROJECT_DIR$/.gitignore" />
<option value="$PROJECT_DIR$/htmlFiles/OTReport_166a63095fc16625.html" /> <option value="$PROJECT_DIR$/htmlFiles/OTReport_166a63095fc16625.html" />
<option value="$PROJECT_DIR$/.gitattributes" /> <option value="$PROJECT_DIR$/.gitattributes" />
<option value="$PROJECT_DIR$/Untitled.ipynb" />
<option value="$PROJECT_DIR$/google/gmailWorker.py" />
<option value="$PROJECT_DIR$/main.py" />
<option value="$PROJECT_DIR$/google/gmailApi.py" /> <option value="$PROJECT_DIR$/google/gmailApi.py" />
<option value="$PROJECT_DIR$/HtmlReader.py" /> <option value="$PROJECT_DIR$/HtmlReader.py" />
<option value="$PROJECT_DIR$/main.py" />
<option value="$PROJECT_DIR$/Untitled.ipynb" />
</list> </list>
</option> </option>
</component> </component>
@@ -178,7 +143,7 @@
<path> <path>
<item name="ot_report_v2" type="b2602c69:ProjectViewProjectNode" /> <item name="ot_report_v2" type="b2602c69:ProjectViewProjectNode" />
<item name="ot_report_v2" type="462c0819:PsiDirectoryNode" /> <item name="ot_report_v2" type="462c0819:PsiDirectoryNode" />
<item name="htmlFilesv2" type="462c0819:PsiDirectoryNode" /> <item name="google" type="462c0819:PsiDirectoryNode" />
</path> </path>
</expand> </expand>
<select /> <select />
@@ -235,7 +200,7 @@
<window_info anchor="bottom" id="Inspection" order="5" weight="0.4" /> <window_info anchor="bottom" id="Inspection" order="5" weight="0.4" />
<window_info anchor="bottom" id="TODO" order="6" /> <window_info anchor="bottom" id="TODO" order="6" />
<window_info anchor="bottom" id="Version Control" order="7" show_stripe_button="false" /> <window_info anchor="bottom" id="Version Control" order="7" show_stripe_button="false" />
<window_info active="true" anchor="bottom" id="Terminal" order="8" visible="true" weight="0.39638555" /> <window_info active="true" anchor="bottom" id="Terminal" order="8" visible="true" weight="0.19518073" />
<window_info anchor="bottom" id="Event Log" order="9" side_tool="true" /> <window_info anchor="bottom" id="Event Log" order="9" side_tool="true" />
<window_info anchor="bottom" id="Python Console" order="10" /> <window_info anchor="bottom" id="Python Console" order="10" />
<window_info anchor="right" id="Commander" internal_type="SLIDING" order="0" type="SLIDING" weight="0.4" /> <window_info anchor="right" id="Commander" internal_type="SLIDING" order="0" type="SLIDING" weight="0.4" />
@@ -247,9 +212,6 @@
<option name="myLimit" value="2678400000" /> <option name="myLimit" value="2678400000" />
</component> </component>
<component name="editorHistoryManager"> <component name="editorHistoryManager">
<entry file="file://$PROJECT_DIR$/google/secrets.py">
<provider selected="true" editor-type-id="text-editor" />
</entry>
<entry file="file://$PROJECT_DIR$/google/config.py"> <entry file="file://$PROJECT_DIR$/google/config.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="26"> <state relative-caret-position="26">
@@ -265,13 +227,6 @@
<entry file="file://$PROJECT_DIR$/token.json"> <entry file="file://$PROJECT_DIR$/token.json">
<provider selected="true" editor-type-id="text-editor" /> <provider selected="true" editor-type-id="text-editor" />
</entry> </entry>
<entry file="file://$PROJECT_DIR$/htmlParser.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="169">
<caret line="13" column="11" selection-start-line="13" selection-start-column="11" selection-end-line="13" selection-end-column="11" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/test.html" /> <entry file="file://$PROJECT_DIR$/test.html" />
<entry file="file://$PROJECT_DIR$/google/Errors.py"> <entry file="file://$PROJECT_DIR$/google/Errors.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
@@ -338,43 +293,72 @@
</state> </state>
</provider> </provider>
</entry> </entry>
<entry file="file://$PROJECT_DIR$/events.csv">
<provider selected="true" editor-type-id="text-editor" />
</entry>
<entry file="file://$PROJECT_DIR$/main.py"> <entry file="file://$PROJECT_DIR$/main.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="1690"> <state relative-caret-position="-297">
<caret line="130" column="23" selection-start-line="130" selection-start-column="23" selection-end-line="130" selection-end-column="23" /> <caret line="95" lean-forward="true" selection-start-line="95" selection-end-line="95" />
<folding> <folding>
<element signature="e#2#38#0" expanded="true" /> <element signature="e#2#38#0" expanded="true" />
<marker date="1542684898017" expanded="true" signature="345:474" ph="..." /> <marker date="1543380484795" expanded="true" signature="535:749" ph="..." />
<marker date="1542684898017" expanded="true" signature="1506:2313" ph="..." /> <marker date="1543380484795" expanded="true" signature="2330:3137" ph="..." />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/HtmlReader.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="1703">
<caret line="132" column="13" selection-start-line="132" selection-start-column="13" selection-end-line="132" selection-end-column="13" />
<folding>
<element signature="e#2#31#0" expanded="true" />
<marker date="1542684886113" expanded="true" signature="792:941" ph="..." />
<marker date="1542684886113" expanded="true" signature="2066:2075" ph="..." />
<marker date="1542684886113" expanded="true" signature="2066:2635" ph="..." />
</folding> </folding>
</state> </state>
</provider> </provider>
</entry> </entry>
<entry file="file://$PROJECT_DIR$/google/gmailApi.py"> <entry file="file://$PROJECT_DIR$/google/gmailApi.py">
<provider selected="true" editor-type-id="text-editor"> <provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="533"> <state relative-caret-position="520">
<caret line="49" column="82" selection-start-line="49" selection-start-column="82" selection-end-line="49" selection-end-column="82" /> <caret line="40" column="59" selection-start-line="40" selection-start-column="59" selection-end-line="40" selection-end-column="59" />
<folding> <folding>
<element signature="e#1#38#0" expanded="true" /> <element signature="e#1#38#0" expanded="true" />
</folding> </folding>
</state> </state>
</provider> </provider>
</entry> </entry>
<entry file="file://$PROJECT_DIR$/google/gmailWorker.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="78">
<caret line="6" column="28" selection-start-line="6" selection-start-column="28" selection-end-line="6" selection-end-column="28" />
<folding>
<element signature="e#0#28#0" expanded="true" />
<marker date="1543379726813" expanded="true" signature="95:100" ph="..." />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/google/secrets.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="13">
<caret line="1" column="20" selection-start-line="1" selection-start-column="20" selection-end-line="1" selection-end-column="20" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/HtmlReader.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="202">
<caret line="108" column="91" selection-start-line="108" selection-start-column="91" selection-end-line="108" selection-end-column="91" />
<folding>
<element signature="e#2#31#0" expanded="true" />
<marker date="1543381166754" expanded="true" signature="792:941" ph="..." />
<marker date="1543381166754" expanded="true" signature="2066:2075" ph="..." />
<marker date="1543381166754" expanded="true" signature="2066:3163" ph="..." />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/htmlParser.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="156">
<caret line="13" column="11" selection-start-line="13" selection-start-column="11" selection-end-line="13" selection-end-column="11" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/events.csv">
<provider selected="true" editor-type-id="text-editor">
<state>
<caret selection-end-column="88" />
</state>
</provider>
</entry>
</component> </component>
</project> </project>

View File

@@ -62,8 +62,8 @@ class HtmlReader:
tbody = workout_summary_table.contents[0] tbody = workout_summary_table.contents[0]
v1_id = tbody.contents[0].contents[0].text
v1_id = tbody.contents[0].contents[0].text
# the OT_REPORT template has changed slightly over the past few months # the OT_REPORT template has changed slightly over the past few months
# the summary is in different places for each type of template # the summary is in different places for each type of template
@@ -100,6 +100,13 @@ class HtmlReader:
# reads all html files in a given directory and scrapes them
# we return the aggregated data as a dictionary called 'event'
# an event has the following keys: calories,splat_pts,steps,date,time,coach,template_version,avg_heart_rate,peak_heart_rate
# these keys are currently found in 3 different sections
# 1. this_class: the THIS CLASS row in the summary table
# 2. summary: data about date, time and coach
# 3. cardio: max and average heart rate data found in the tiles of the email template
def read_all(self, directory): def read_all(self, directory):
files = glob.glob(directory) files = glob.glob(directory)

96
Untitled.ipynb vendored

File diff suppressed because one or more lines are too long

View File

@@ -1 +1 @@
{"last_run": "2018-11-26"} {"last_run": "2018-11-27"}

Binary file not shown.

View File

@@ -38,44 +38,46 @@ class GmailApi:
self.service = build('gmail', 'v1', http=creds.authorize(Http())) self.service = build('gmail', 'v1', http=creds.authorize(Http()))
# get all labels associated with the authenticated user
def get_labels(self): def get_labels(self):
results = self.service.users().labels().list(userId="me").execute() results = self.service.users().labels().list(userId="me").execute()
return results.get('labels', []) return results.get('labels', [])
# query gmail for emails.
# the query uses standard gmail search syntax
# e.g. 'after:2018/11/27' -> returns emails that were received after the given date
def get_ot_messages(self, query=''): def get_ot_messages(self, query=''):
no_new_messages = True no_new_messages = True
#gets the ids of all messages that match the OT LabelId and provided query # gets the ids of all messages that match the OT LabelId and provided query
results = self.service.users().messages().list(userId="me", labelIds=[OT_LABEL_ID], q=query).execute() results = self.service.users().messages().list(userId="me", labelIds=[OT_LABEL_ID], q=query).execute()
saved_templates = load_already_parsed_message_ids() saved_templates = load_already_parsed_message_ids()
#if no query is provided we default to pull all data # if no query is provided we default to pull all data
if(query == ''): if query == '':
no_new_messages = False no_new_messages = False
# find out the ids of messages that are saved locally
#find out the ids of messages that are saved locally
for result in results['messages']: for result in results['messages']:
if(result['id'] not in saved_templates): if result['id'] not in saved_templates:
no_new_messages = False no_new_messages = False
# if no new messages are found in any case raise error to catch accordingly
#if no new messages are found in any case raise error to catch accordingly if results['resultSizeEstimate'] == 0 or no_new_messages:
if(results['resultSizeEstimate'] == 0 or no_new_messages):
raise Errors.NoMessagesFoundException(userId='me', labelIds=[OT_LABEL_ID], q=query) raise Errors.NoMessagesFoundException(userId='me', labelIds=[OT_LABEL_ID], q=query)
log_msg = "Found {} new OT Email(s).".format(len(results["messages"])) log_msg = "Found {} new OT Email(s).".format(len(results["messages"]))
print(log_msg) print(log_msg)
return results["messages"] return results["messages"]
# get a single message by message_id
def get_message(self, message_id): def get_message(self, message_id):
m_res = self.service.users().messages().get(id=message_id, userId='me').execute() m_res = self.service.users().messages().get(id=message_id, userId='me').execute()

20
google/gmailWorker.py Normal file
View File

@@ -0,0 +1,20 @@
from threading import Thread
from google.gmailApi import GmailApi
class GmailWorker(Thread):
    """Worker thread that fetches Gmail messages and hands them to a parser.

    Every item taken from ``queue`` is expected to be a ``(message_id, parse)``
    tuple: ``message_id`` is passed to ``GmailApi.get_message`` and the result
    is given to ``parse(content, message_id)``.

    The run loop never returns on its own, so callers should mark the thread
    as a daemon and use ``queue.join()`` to wait for the work to finish.
    """

    def __init__(self, queue):
        """Store the shared task queue and build this worker's GmailApi client.

        :param queue: queue-like object providing ``get()`` and ``task_done()``
        """
        Thread.__init__(self)
        self.queue = queue
        # each worker builds its own GmailApi instance
        self.gmail = GmailApi()

    def run(self):
        """Process tasks from the queue forever.

        A failure while fetching or parsing one message is reported and the
        worker moves on to the next task. ``task_done()`` is always called for
        every task taken from the queue, otherwise a single failing message
        would leave ``queue.join()`` blocked forever.
        """
        while True:
            task = self.queue.get()
            try:
                message_id, parse = task
                content = self.gmail.get_message(message_id)
                parse(content, message_id)
            except Exception as exc:
                # keep the worker alive; one bad message must not stop the rest
                print('failed to process task {!r}: {!r}'.format(task, exc))
            finally:
                # always balance the get() above so queue.join() can return
                self.queue.task_done()

View File

@@ -1,20 +0,0 @@
from html.parser import HTMLParser
from bs4 import BeautifulSoup
# class MyHTMLParser(HTMLParser):
# def handle_starttag(self, tag, attrs):
# print("Encountered a start tag:", tag)
# def handle_endtag(self, tag):
# print("Encountered an end tag :", tag)
# def handle_data(self, data):
# print("Encountered some data :", data)
with open('test.html') as file:
soup = BeautifulSoup(file)
print(soup.prettify())

32
main.py
View File

@@ -2,7 +2,9 @@
from google.gmailApi import GmailApi from google.gmailApi import GmailApi
from google.Errors import NoMessagesFoundException from google.Errors import NoMessagesFoundException
from google.gmailWorker import GmailWorker
from pytime import pytime from pytime import pytime
from queue import Queue
import HtmlReader import HtmlReader
import base64 import base64
import json import json
@@ -10,6 +12,9 @@ import sys
import csv import csv
#get the config
#currently only returns a json object with a single key
#the key will tell us when we last ran the program
def get_config(): def get_config():
config_file = open('config.json') config_file = open('config.json')
config = json.load(config_file) config = json.load(config_file)
@@ -24,7 +29,8 @@ def save_config(config):
config_file.close() config_file.close()
#write the file to disk.
#name is generally of the format 'OTReport_<messageid>.html'
def write_to_html_file(html, name): def write_to_html_file(html, name):
path = "./htmlFilesv2/{}".format(name) path = "./htmlFilesv2/{}".format(name)
file = open(path, "w") file = open(path, "w")
@@ -33,6 +39,7 @@ def write_to_html_file(html, name):
file.close() file.close()
#parses the raw byte content of an OT Email and writes it to ./htmlFilesv2 as an html file
def parse_message(msg_json, msg_id): def parse_message(msg_json, msg_id):
message_parts = msg_json["payload"]["parts"] message_parts = msg_json["payload"]["parts"]
@@ -44,24 +51,40 @@ def parse_message(msg_json, msg_id):
file_name = 'OTReport_{}.html'.format(msg_id) file_name = 'OTReport_{}.html'.format(msg_id)
write_to_html_file(result, file_name) write_to_html_file(result, file_name)
def get_last_run_time(timestamp): def get_last_run_time(timestamp):
yesterday = str(pytime.before(timestamp, '1d')).split(' ')[0] yesterday = str(pytime.before(timestamp, '1d')).split(' ')[0]
return yesterday return yesterday
#pulls the gmail data
#creates 4 worker threads to speed up the download and parsing of emails
def pull_gmail_data(query=''): def pull_gmail_data(query=''):
gmail = GmailApi() gmail = GmailApi()
queue = Queue()
messages = gmail.get_ot_messages(query) messages = gmail.get_ot_messages(query)
#start 4 worker threads to speed up the download and parsing of emails
for x in range(4):
print('starting worker')
worker = GmailWorker(queue)
worker.daemon = True
worker.start()
#push task into the queue as a tuple
#second item in the task tuple is always the parse_message function defined above
for message in messages: for message in messages:
id = message['id'] id = message['id']
m_res = gmail.get_message(id) queue.put((id,parse_message))
parse_message(m_res, id)
queue.join() #wait
def do_latest(config): def do_latest(config):
last_run = get_last_run_time(config['last_run']) last_run = get_last_run_time(config['last_run'])
@@ -69,6 +92,8 @@ def do_latest(config):
pull_gmail_data(query) pull_gmail_data(query)
def write_to_csv(events): def write_to_csv(events):
with open('events.csv', 'w', newline='') as csvFile: with open('events.csv', 'w', newline='') as csvFile:
field_names = ['calories', 'splat_pts', 'steps', 'date', 'time', 'coach', 'template_version', 'avg_heart_rate', 'peak_heart_rate'] field_names = ['calories', 'splat_pts', 'steps', 'date', 'time', 'coach', 'template_version', 'avg_heart_rate', 'peak_heart_rate']
@@ -110,6 +135,7 @@ def main():
last_run = get_last_run_time(config['last_run']) last_run = get_last_run_time(config['last_run'])
query = 'after:{}'.format(last_run) query = 'after:{}'.format(last_run)
#try to find data #try to find data
try: try:
pull_gmail_data(query) pull_gmail_data(query)