worker thread and cleanup

This commit is contained in:
Alexander
2018-11-28 08:50:35 -06:00
parent e0a34f7ab7
commit 3dc5cc3d91
9 changed files with 168 additions and 195 deletions

156
.idea/workspace.xml generated
View File

@@ -2,8 +2,14 @@
<project version="4">
<component name="ChangeListManager">
<list default="true" id="7bbe5005-15ef-4d5e-b36b-58084d0f70eb" name="Default Changelist" comment="">
<change afterPath="$PROJECT_DIR$/google/gmailWorker.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
<change beforePath="$PROJECT_DIR$/HtmlReader.py" beforeDir="false" afterPath="$PROJECT_DIR$/HtmlReader.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/Untitled.ipynb" beforeDir="false" afterPath="$PROJECT_DIR$/Untitled.ipynb" afterDir="false" />
<change beforePath="$PROJECT_DIR$/config.json" beforeDir="false" afterPath="$PROJECT_DIR$/config.json" afterDir="false" />
<change beforePath="$PROJECT_DIR$/events.csv" beforeDir="false" afterPath="$PROJECT_DIR$/events.csv" afterDir="false" />
<change beforePath="$PROJECT_DIR$/google/gmailApi.py" beforeDir="false" afterPath="$PROJECT_DIR$/google/gmailApi.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/htmlParser.py" beforeDir="false" />
<change beforePath="$PROJECT_DIR$/main.py" beforeDir="false" afterPath="$PROJECT_DIR$/main.py" afterDir="false" />
</list>
<option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
<option name="SHOW_DIALOG" value="false" />
@@ -24,13 +30,13 @@
</usages-collector>
<usages-collector id="statistics.file.extensions.open">
<counts>
<entry key="csv" value="1" />
<entry key="csv" value="2" />
<entry key="gitattributes" value="2" />
<entry key="gitignore" value="1" />
<entry key="html" value="13" />
<entry key="ipynb" value="3" />
<entry key="json" value="7" />
<entry key="py" value="12" />
<entry key="py" value="15" />
</counts>
</usages-collector>
<usages-collector id="statistics.file.types.open">
@@ -38,8 +44,8 @@
<entry key="HTML" value="13" />
<entry key="IPNB" value="3" />
<entry key="JSON" value="7" />
<entry key="PLAIN_TEXT" value="4" />
<entry key="Python" value="12" />
<entry key="PLAIN_TEXT" value="5" />
<entry key="Python" value="15" />
</counts>
</usages-collector>
<usages-collector id="statistics.file.extensions.edit">
@@ -49,7 +55,7 @@
<entry key="gitignore" value="45" />
<entry key="html" value="35" />
<entry key="json" value="28" />
<entry key="py" value="7376" />
<entry key="py" value="9380" />
</counts>
</usages-collector>
<usages-collector id="statistics.file.types.edit">
@@ -57,56 +63,14 @@
<entry key="HTML" value="35" />
<entry key="JSON" value="28" />
<entry key="PLAIN_TEXT" value="172" />
<entry key="Python" value="7376" />
<entry key="Python" value="9380" />
</counts>
</usages-collector>
<usages-collector id="statistics.vcs.git.usages" />
</session>
</component>
<component name="FileEditorManager">
<leaf SIDE_TABS_SIZE_LIMIT_KEY="300">
<file pinned="false" current-in-tab="true">
<entry file="file://$PROJECT_DIR$/main.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="1690">
<caret line="130" column="23" selection-start-line="130" selection-start-column="23" selection-end-line="130" selection-end-column="23" />
<folding>
<element signature="e#2#38#0" expanded="true" />
<marker date="1542684898017" expanded="true" signature="345:474" ph="..." />
<marker date="1542684898017" expanded="true" signature="1506:2313" ph="..." />
</folding>
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/HtmlReader.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="1703">
<caret line="132" column="13" selection-start-line="132" selection-start-column="13" selection-end-line="132" selection-end-column="13" />
<folding>
<element signature="e#2#31#0" expanded="true" />
<marker date="1542684886113" expanded="true" signature="792:941" ph="..." />
<marker date="1542684886113" expanded="true" signature="2066:2075" ph="..." />
<marker date="1542684886113" expanded="true" signature="2066:2635" ph="..." />
</folding>
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/google/gmailApi.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="533">
<caret line="49" column="82" selection-start-line="49" selection-start-column="82" selection-end-line="49" selection-end-column="82" />
<folding>
<element signature="e#1#38#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
</file>
</leaf>
<leaf SIDE_TABS_SIZE_LIMIT_KEY="300" />
</component>
<component name="FileTemplateManagerImpl">
<option name="RECENT_TEMPLATES">
@@ -151,10 +115,11 @@
<option value="$PROJECT_DIR$/.gitignore" />
<option value="$PROJECT_DIR$/htmlFiles/OTReport_166a63095fc16625.html" />
<option value="$PROJECT_DIR$/.gitattributes" />
<option value="$PROJECT_DIR$/Untitled.ipynb" />
<option value="$PROJECT_DIR$/google/gmailWorker.py" />
<option value="$PROJECT_DIR$/main.py" />
<option value="$PROJECT_DIR$/google/gmailApi.py" />
<option value="$PROJECT_DIR$/HtmlReader.py" />
<option value="$PROJECT_DIR$/main.py" />
<option value="$PROJECT_DIR$/Untitled.ipynb" />
</list>
</option>
</component>
@@ -178,7 +143,7 @@
<path>
<item name="ot_report_v2" type="b2602c69:ProjectViewProjectNode" />
<item name="ot_report_v2" type="462c0819:PsiDirectoryNode" />
<item name="htmlFilesv2" type="462c0819:PsiDirectoryNode" />
<item name="google" type="462c0819:PsiDirectoryNode" />
</path>
</expand>
<select />
@@ -235,7 +200,7 @@
<window_info anchor="bottom" id="Inspection" order="5" weight="0.4" />
<window_info anchor="bottom" id="TODO" order="6" />
<window_info anchor="bottom" id="Version Control" order="7" show_stripe_button="false" />
<window_info active="true" anchor="bottom" id="Terminal" order="8" visible="true" weight="0.39638555" />
<window_info active="true" anchor="bottom" id="Terminal" order="8" visible="true" weight="0.19518073" />
<window_info anchor="bottom" id="Event Log" order="9" side_tool="true" />
<window_info anchor="bottom" id="Python Console" order="10" />
<window_info anchor="right" id="Commander" internal_type="SLIDING" order="0" type="SLIDING" weight="0.4" />
@@ -247,9 +212,6 @@
<option name="myLimit" value="2678400000" />
</component>
<component name="editorHistoryManager">
<entry file="file://$PROJECT_DIR$/google/secrets.py">
<provider selected="true" editor-type-id="text-editor" />
</entry>
<entry file="file://$PROJECT_DIR$/google/config.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="26">
@@ -265,13 +227,6 @@
<entry file="file://$PROJECT_DIR$/token.json">
<provider selected="true" editor-type-id="text-editor" />
</entry>
<entry file="file://$PROJECT_DIR$/htmlParser.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="169">
<caret line="13" column="11" selection-start-line="13" selection-start-column="11" selection-end-line="13" selection-end-column="11" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/test.html" />
<entry file="file://$PROJECT_DIR$/google/Errors.py">
<provider selected="true" editor-type-id="text-editor">
@@ -338,43 +293,72 @@
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/events.csv">
<provider selected="true" editor-type-id="text-editor" />
</entry>
<entry file="file://$PROJECT_DIR$/main.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="1690">
<caret line="130" column="23" selection-start-line="130" selection-start-column="23" selection-end-line="130" selection-end-column="23" />
<state relative-caret-position="-297">
<caret line="95" lean-forward="true" selection-start-line="95" selection-end-line="95" />
<folding>
<element signature="e#2#38#0" expanded="true" />
<marker date="1542684898017" expanded="true" signature="345:474" ph="..." />
<marker date="1542684898017" expanded="true" signature="1506:2313" ph="..." />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/HtmlReader.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="1703">
<caret line="132" column="13" selection-start-line="132" selection-start-column="13" selection-end-line="132" selection-end-column="13" />
<folding>
<element signature="e#2#31#0" expanded="true" />
<marker date="1542684886113" expanded="true" signature="792:941" ph="..." />
<marker date="1542684886113" expanded="true" signature="2066:2075" ph="..." />
<marker date="1542684886113" expanded="true" signature="2066:2635" ph="..." />
<marker date="1543380484795" expanded="true" signature="535:749" ph="..." />
<marker date="1543380484795" expanded="true" signature="2330:3137" ph="..." />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/google/gmailApi.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="533">
<caret line="49" column="82" selection-start-line="49" selection-start-column="82" selection-end-line="49" selection-end-column="82" />
<state relative-caret-position="520">
<caret line="40" column="59" selection-start-line="40" selection-start-column="59" selection-end-line="40" selection-end-column="59" />
<folding>
<element signature="e#1#38#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/google/gmailWorker.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="78">
<caret line="6" column="28" selection-start-line="6" selection-start-column="28" selection-end-line="6" selection-end-column="28" />
<folding>
<element signature="e#0#28#0" expanded="true" />
<marker date="1543379726813" expanded="true" signature="95:100" ph="..." />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/google/secrets.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="13">
<caret line="1" column="20" selection-start-line="1" selection-start-column="20" selection-end-line="1" selection-end-column="20" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/HtmlReader.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="202">
<caret line="108" column="91" selection-start-line="108" selection-start-column="91" selection-end-line="108" selection-end-column="91" />
<folding>
<element signature="e#2#31#0" expanded="true" />
<marker date="1543381166754" expanded="true" signature="792:941" ph="..." />
<marker date="1543381166754" expanded="true" signature="2066:2075" ph="..." />
<marker date="1543381166754" expanded="true" signature="2066:3163" ph="..." />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/htmlParser.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="156">
<caret line="13" column="11" selection-start-line="13" selection-start-column="11" selection-end-line="13" selection-end-column="11" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/events.csv">
<provider selected="true" editor-type-id="text-editor">
<state>
<caret selection-end-column="88" />
</state>
</provider>
</entry>
</component>
</project>

View File

@@ -62,8 +62,8 @@ class HtmlReader:
tbody = workout_summary_table.contents[0]
v1_id = tbody.contents[0].contents[0].text
v1_id = tbody.contents[0].contents[0].text
# the OT_REPORT template has changed slightly over the past few months
# the summary is in different places for each type of template
@@ -100,6 +100,13 @@ class HtmlReader:
# reads all html files in a given directory and scrapes them
# we return the aggregated data as a dictionary called 'event'
# an event has the following keys: calories,splat_pts,steps,date,time,coach,template_version,avg_heart_rate,peak_heart_rate
# these keys are currently found in 3 different sections
# 1. this_class: the THIS CLASS row in the summary table
# 2. summary: data about date, time and coach
# 3. cardio: max and average heart rate data found in the tiles of the email template
def read_all(self, directory):
files = glob.glob(directory)

100
Untitled.ipynb vendored

File diff suppressed because one or more lines are too long

View File

@@ -1 +1 @@
{"last_run": "2018-11-26"}
{"last_run": "2018-11-27"}

Binary file not shown.

View File

@@ -38,44 +38,46 @@ class GmailApi:
self.service = build('gmail', 'v1', http=creds.authorize(Http()))
# get all labels associated with the authenticated user
def get_labels(self):
    """Return the labels listed for the authenticated user ("me").

    Falls back to an empty list when the response has no 'labels' entry.
    """
    request = self.service.users().labels().list(userId="me")
    response = request.execute()
    return response.get('labels', [])
# query gmail for emails.
# query format is standard gmail search queries
# e.g: 'after: 11/27/2018' -> returns emails that were received after the date given
def get_ot_messages(self, query=''):
    """Return the messages with the OT label that match ``query``.

    ``query`` uses the standard Gmail search syntax, e.g. 'after: 11/27/2018'.
    With an empty query every matching message is returned. With a query,
    at least one match must be missing from the locally saved message ids,
    and then all matches are returned.

    Raises Errors.NoMessagesFoundException when nothing matches or when
    every match has already been saved locally.
    """
    # gets the ids of all messages that match the OT LabelId and provided query
    results = self.service.users().messages().list(userId="me", labelIds=[OT_LABEL_ID], q=query).execute()
    # the list response can omit 'messages' when nothing matches, so never
    # index it directly (that would raise KeyError instead of the error below)
    messages = results.get('messages', [])
    saved_templates = load_already_parsed_message_ids()

    # if no query is provided we default to pull all data;
    # otherwise at least one id must not be saved locally yet
    has_new_messages = query == '' or any(
        message['id'] not in saved_templates for message in messages
    )

    # if no new messages are found in any case raise error to catch accordingly
    if not messages or results.get('resultSizeEstimate') == 0 or not has_new_messages:
        raise Errors.NoMessagesFoundException(userId='me', labelIds=[OT_LABEL_ID], q=query)

    log_msg = "Found {} new OT Email(s).".format(len(messages))
    print(log_msg)
    return messages
# get a single message by message_id
def get_message(self, message_id):
m_res = self.service.users().messages().get(id=message_id, userId='me').execute()

20
google/gmailWorker.py Normal file
View File

@@ -0,0 +1,20 @@
from threading import Thread
from google.gmailApi import GmailApi
class GmailWorker(Thread):
    """Worker thread that downloads Gmail messages and hands them to a parser.

    Every task taken from the queue is a ``(message_id, parse)`` tuple: the
    message is fetched with ``self.gmail.get_message(message_id)`` and the
    result is passed to ``parse(content, message_id)``.
    """

    def __init__(self, queue):
        Thread.__init__(self)
        # queue of (message_id, parse) tasks filled by the producer
        self.queue = queue
        # each worker builds its own GmailApi client
        self.gmail = GmailApi()

    def run(self):
        """Process tasks from the queue in an endless loop.

        A failing task is reported on stderr and skipped. ``task_done`` is
        always called for a fetched task, so a producer blocked in
        ``queue.join()`` is released even when a download or parse fails.
        """
        import traceback

        while True:
            # fetched outside the try block: task_done must only be called
            # for a task that was actually taken from the queue
            message_id, parse = self.queue.get()
            try:
                content = self.gmail.get_message(message_id)
                parse(content, message_id)
            except Exception:
                # keep the worker alive so the remaining tasks still get handled
                traceback.print_exc()
            finally:
                self.queue.task_done()

View File

@@ -1,20 +0,0 @@
from html.parser import HTMLParser
from bs4 import BeautifulSoup
# class MyHTMLParser(HTMLParser):
# def handle_starttag(self, tag, attrs):
# print("Encountered a start tag:", tag)
# def handle_endtag(self, tag):
# print("Encountered an end tag :", tag)
# def handle_data(self, data):
# print("Encountered some data :", data)
# Pretty-print test.html. The parser is named explicitly so the resulting tree
# does not depend on which optional parser libraries happen to be installed.
with open('test.html') as file:
    soup = BeautifulSoup(file, 'html.parser')
print(soup.prettify())

32
main.py
View File

@@ -2,7 +2,9 @@
from google.gmailApi import GmailApi
from google.Errors import NoMessagesFoundException
from google.gmailWorker import GmailWorker
from pytime import pytime
from queue import Queue
import HtmlReader
import base64
import json
@@ -10,6 +12,9 @@ import sys
import csv
#get the config
#currently only returns a json object with a single key
#the key will tell us when we last ran the program
def get_config():
config_file = open('config.json')
config = json.load(config_file)
@@ -24,7 +29,8 @@ def save_config(config):
config_file.close()
#write the file to disk.
#name is generally of the format 'OTReport_<messageid>.html'
def write_to_html_file(html, name):
path = "./htmlFilesv2/{}".format(name)
file = open(path, "w")
@@ -33,6 +39,7 @@ def write_to_html_file(html, name):
file.close()
#parses the raw byte content of an OT Email and writes it to ./htmlFilesv2 as an html file
def parse_message(msg_json, msg_id):
message_parts = msg_json["payload"]["parts"]
@@ -44,24 +51,40 @@ def parse_message(msg_json, msg_id):
file_name = 'OTReport_{}.html'.format(msg_id)
write_to_html_file(result, file_name)
def get_last_run_time(timestamp):
    """Return the text before the first space of ``pytime.before(timestamp, '1d')``.

    Presumably this is the date portion of the day before ``timestamp``;
    confirm against the pytime documentation.
    """
    day_before = pytime.before(timestamp, '1d')
    date_part, _, _ = str(day_before).partition(' ')
    return date_part
#pulls the gmail data
#creates 4 worker threads to speed up the download and parsing of emails
def pull_gmail_data(query=''):
    """Fetch the OT emails matching ``query`` and run parse_message on each.

    The message list comes from ``GmailApi.get_ot_messages(query)``; anything
    it raises propagates to the caller. Each message is then downloaded and
    parsed exactly once by one of 4 GmailWorker threads, and the call
    returns after every queued task has been marked done.
    """
    gmail = GmailApi()
    queue = Queue()
    messages = gmail.get_ot_messages(query)

    # start 4 worker threads to speed up the download and parsing of emails
    for _ in range(4):
        print('starting worker')
        worker = GmailWorker(queue)
        worker.daemon = True
        worker.start()

    # push task into the queue as a tuple
    # second item in the task tuple is always the parse_message function
    # (the workers do the download, so nothing is fetched here)
    for message in messages:
        queue.put((message['id'], parse_message))

    queue.join()  # wait until every queued task has been marked done
def do_latest(config):
last_run = get_last_run_time(config['last_run'])
@@ -69,6 +92,8 @@ def do_latest(config):
pull_gmail_data(query)
def write_to_csv(events):
with open('events.csv', 'w', newline='') as csvFile:
field_names = ['calories', 'splat_pts', 'steps', 'date', 'time', 'coach', 'template_version', 'avg_heart_rate', 'peak_heart_rate']
@@ -110,6 +135,7 @@ def main():
last_run = get_last_run_time(config['last_run'])
query = 'after:{}'.format(last_run)
#try to find data
try:
pull_gmail_data(query)