fix: worker listener retires (#181)

* hotfix: add repository for npm publish

* release(py-sdk): bump version

* chore: ignore venv

* fix: add timeout to retry

* chore: rm change

* fix: remove duplicate logic and favor retries in dispatcher

* release: bump version

* fix: retry on general failures

* chore: rm unused code

* fix: retries reset if greater than interval
This commit is contained in:
Gabe Ruttner
2024-02-21 18:54:42 -08:00
committed by GitHub
parent e44c3a1290
commit a7cff7f218
4 changed files with 39 additions and 30 deletions
+34 -19
View File
@@ -9,6 +9,7 @@ import json
import grpc
from typing import Callable, List, Union
from ..metadata import get_metadata
import time
def new_dispatcher(conn, config: ClientConfig):
@@ -24,11 +25,13 @@ class DispatcherClient:
def send_step_action_event(self, ctx, in_):
raise NotImplementedError
DEFAULT_ACTION_LISTENER_RETRY_INTERVAL = 1 # seconds
DEFAULT_ACTION_LISTENER_RETRY_COUNT = 5
DEFAULT_ACTION_LISTENER_RETRY_INTERVAL = 5 # seconds
DEFAULT_ACTION_LISTENER_RETRY_COUNT = 15
DEFAULT_ACTION_TIMEOUT = 60 # seconds
DEFAULT_REGISTER_TIMEOUT = 5
class GetActionListenerRequest:
def __init__(self, worker_name: str, services: List[str], actions: List[str]):
self.worker_name = worker_name
@@ -67,17 +70,19 @@ class ActionListenerImpl(WorkerActionListener):
self.token = token
self.worker_id = worker_id
self.retries = 0
self.last_connection_attempt = 0
# self.logger = logger
# self.validator = validator
def actions(self):
while True:
logger.info("Listening for actions...")
logger.info(
"Connecting to Hatchet to establish listener for actions...")
try:
for assigned_action in self.get_listen_client():
assigned_action : AssignedAction
self.retries = 0
assigned_action: AssignedAction
# Process the received action
action_type = self.map_action_type(assigned_action.actionType)
@@ -110,10 +115,6 @@ class ActionListenerImpl(WorkerActionListener):
# Context cancelled, unsubscribe and close
# self.logger.debug("Context cancelled, closing listener")
break
elif e.code() == grpc.StatusCode.UNAVAILABLE:
# Retry logic
logger.info("Could not connect to Hatchet, retrying...")
self.retries = self.retries + 1
elif e.code() == grpc.StatusCode.DEADLINE_EXCEEDED:
logger.info("Deadline exceeded, retrying subscription")
continue
@@ -122,7 +123,8 @@ class ActionListenerImpl(WorkerActionListener):
# self.logger.error(f"Failed to receive message: {e}")
# err_ch(e)
logger.error(f"Failed to receive message: {e}")
break
self.retries = self.retries + 1
def parse_action_payload(self, payload : str):
try:
@@ -143,19 +145,32 @@ class ActionListenerImpl(WorkerActionListener):
return None
def get_listen_client(self):
current_time = int(time.time())
if current_time-self.last_connection_attempt > DEFAULT_ACTION_LISTENER_RETRY_INTERVAL:
self.retries = 0
if self.retries > DEFAULT_ACTION_LISTENER_RETRY_COUNT:
raise Exception(f"Could not subscribe to the worker after {DEFAULT_ACTION_LISTENER_RETRY_COUNT} retries")
elif self.retries > 1:
raise Exception(
f"Could not subscribe to the worker after {DEFAULT_ACTION_LISTENER_RETRY_COUNT} retries")
elif self.retries >= 1:
# logger.info
# if we are retrying, we wait for a bit. this should eventually be replaced with exp backoff + jitter
time.sleep(DEFAULT_ACTION_LISTENER_RETRY_INTERVAL)
return self.client.Listen(WorkerListenRequest(
workerId=self.worker_id
),
timeout=DEFAULT_ACTION_TIMEOUT,
metadata=get_metadata(self.token),
)
logger.info(
f"Could not connect to Hatchet, retrying... {self.retries}/{DEFAULT_ACTION_LISTENER_RETRY_COUNT}")
listener = self.client.Listen(WorkerListenRequest(
workerId=self.worker_id
),
timeout=DEFAULT_ACTION_TIMEOUT,
metadata=get_metadata(self.token),
)
self.last_connection_attempt = current_time
logger.info('Listener established.')
return listener
def unregister(self):
try:
+3 -9
View File
@@ -11,13 +11,13 @@ from typing import Any, Callable, Dict
from .workflow import WorkflowMeta
from .clients.dispatcher import GetActionListenerRequest, ActionListenerImpl, Action
from .dispatcher_pb2 import ActionType, StepActionEvent, StepActionEventType, GroupKeyActionEvent, GroupKeyActionEventType, STEP_EVENT_TYPE_COMPLETED, STEP_EVENT_TYPE_STARTED, STEP_EVENT_TYPE_FAILED, GROUP_KEY_EVENT_TYPE_STARTED, GROUP_KEY_EVENT_TYPE_COMPLETED, GROUP_KEY_EVENT_TYPE_FAILED
from .client import new_client
from .client import new_client
from concurrent.futures import ThreadPoolExecutor, Future
from google.protobuf.timestamp_pb2 import Timestamp
from .context import Context
from .logger import logger
# Worker class
class Worker:
def __init__(self, name: str, max_threads: int = 200, debug=False, handle_kill=True):
self.name = name
@@ -347,11 +347,5 @@ class Worker:
except grpc.RpcError as rpc_error:
logger.error(f"Could not start worker: {rpc_error}")
# if we are here, but not killing, then we should retry start
if not self.killing:
if retry_count > 5:
raise Exception("Could not start worker after 5 retries")
logger.info("Could not start worker, retrying...")
self.start(retry_count + 1)
logger.info("Could not start worker")
+1 -1
View File
@@ -1,6 +1,6 @@
[tool.poetry]
name = "hatchet-sdk"
version = "0.10.3"
version = "0.10.4"
description = ""
authors = ["Alexander Belanger <alexander@hatchet.run>"]
readme = "README.md"