Feature/unlock stale locks (#360)

* set locked_at_in_seconds so we can find stale locks w/ burnettk

* added background job to clean up old locks w/ burnettk

---------

Co-authored-by: jasquat <jasquat@users.noreply.github.com>
This commit is contained in:
jasquat 2023-07-12 10:15:27 -04:00 committed by GitHub
parent c936c02db0
commit c2706c5bde
7 changed files with 57 additions and 1 deletions

View File

@ -92,6 +92,11 @@ def start_scheduler(app: flask.app.Flask, scheduler_class: BaseScheduler = Backg
"interval",
seconds=user_input_required_polling_interval_in_seconds,
)
scheduler.add_job(
BackgroundProcessingService(app).remove_stale_locks,
"interval",
seconds=app.config["MAX_INSTANCE_LOCK_DURATION_IN_SECONDS"],
)
scheduler.start()

View File

@ -185,6 +185,10 @@ def setup_config(app: Flask) -> None:
if app.config["SPIFFWORKFLOW_BACKEND_DEFAULT_USER_GROUP"] == "":
app.config["SPIFFWORKFLOW_BACKEND_DEFAULT_USER_GROUP"] = None
app.config["MAX_INSTANCE_LOCK_DURATION_IN_SECONDS"] = int(
app.config["SPIFFWORKFLOW_BACKEND_MAX_INSTANCE_LOCK_DURATION_IN_SECONDS"]
)
thread_local_data = threading.local()
app.config["THREAD_LOCAL_DATA"] = thread_local_data
_set_up_tenant_specific_fields_as_list_of_strings(app)

View File

@ -188,3 +188,7 @@ SPIFFWORKFLOW_BACKEND_ELEMENT_UNITS_CACHE_DIR = environ.get(
# Boolean flag: True only when the environment variable is exactly the string "true".
SPIFFWORKFLOW_BACKEND_USE_WERKZEUG_MIDDLEWARE_PROXY_FIX = (
    environ.get("SPIFFWORKFLOW_BACKEND_USE_WERKZEUG_MIDDLEWARE_PROXY_FIX", "false") == "true"
)

# Maximum age, in seconds, of a process instance lock before it is treated as stale.
# Kept as a string here (default "300"); it is converted to an int when the app config is set up.
SPIFFWORKFLOW_BACKEND_MAX_INSTANCE_LOCK_DURATION_IN_SECONDS = environ.get(
    "SPIFFWORKFLOW_BACKEND_MAX_INSTANCE_LOCK_DURATION_IN_SECONDS", "300"
)

View File

@ -15,10 +15,11 @@ class ProcessInstanceQueueModel(SpiffworkflowBaseDBModel):
process_instance_id: int = db.Column(
ForeignKey(ProcessInstanceModel.id), unique=True, nullable=False # type: ignore
)
run_at_in_seconds: int = db.Column(db.Integer)
priority: int = db.Column(db.Integer)
locked_by: str | None = db.Column(db.String(80), index=True, nullable=True)
locked_at_in_seconds: int | None = db.Column(db.Integer, index=True, nullable=True)
status: str = db.Column(db.String(50), index=True)
run_at_in_seconds: int = db.Column(db.Integer)
updated_at_in_seconds: int = db.Column(db.Integer)
created_at_in_seconds: int = db.Column(db.Integer)

View File

@ -34,3 +34,8 @@ class BackgroundProcessingService:
with self.app.app_context():
ProcessInstanceLockService.set_thread_local_locking_context("bg:messages")
MessageService.correlate_all_message_instances()
def remove_stale_locks(self) -> None:
    """Unlock anything that has been locked long enough that the lock is probably stale.

    Delegates to ProcessInstanceLockService.remove_stale_locks, run inside this
    service's Flask application context.
    """
    app_context = self.app.app_context()
    with app_context:
        ProcessInstanceLockService.remove_stale_locks()

View File

@ -1,8 +1,12 @@
import threading
import time
from typing import Any
from flask import current_app
from spiffworkflow_backend.models.db import db
from spiffworkflow_backend.models.process_instance_queue import ProcessInstanceQueueModel
from sqlalchemy import and_
from sqlalchemy import or_
class ExpectedLockNotFoundError(Exception):
@ -62,3 +66,34 @@ class ProcessInstanceLockService:
def has_lock(cls, process_instance_id: int) -> bool:
ctx = cls.get_thread_local_locking_context()
return process_instance_id in ctx["locks"]
@classmethod
def remove_stale_locks(cls) -> None:
"""Clear locks on queue entries that have been held longer than the configured maximum.

The maximum is read from current_app.config["MAX_INSTANCE_LOCK_DURATION_IN_SECONDS"], so this
relies on a Flask application context being active. Every matching entry is logged and then
has both locked_by and locked_at_in_seconds reset to None.
"""
max_duration = current_app.config["MAX_INSTANCE_LOCK_DURATION_IN_SECONDS"]
current_time = round(time.time())
# Cutoff timestamp: a lock taken at or before this moment counts as stale.
# NOTE: despite the name, the window is max_duration seconds (from config), not a fixed five minutes.
five_min_ago = current_time - max_duration
# TODO: remove check for NULL locked_at_in_seconds and fallback to updated_at_in_seconds
# once we can confirm that old entries have been taken care of on current envs.
# New code should not allow rows where locked_by has a value but locked_at_in_seconds is null.
# Select rows that are currently locked and whose lock time is at or before the cutoff;
# rows with no recorded lock time are judged by updated_at_in_seconds instead.
# The `!= None` / `== None` comparisons are deliberate (hence the noqa): they are SQLAlchemy
# column expressions that become SQL NULL checks, not Python identity tests.
entries_with_stale_locks = ProcessInstanceQueueModel.query.filter(
ProcessInstanceQueueModel.locked_by != None, # noqa: E711
or_(
ProcessInstanceQueueModel.locked_at_in_seconds <= five_min_ago,
and_(
ProcessInstanceQueueModel.updated_at_in_seconds <= five_min_ago,
ProcessInstanceQueueModel.locked_at_in_seconds == None, # noqa: E711
),
),
).all()
for entry in entries_with_stale_locks:
# Same fallback as the query: use updated_at_in_seconds when no lock time was recorded.
locked_duration = current_time - (entry.locked_at_in_seconds or entry.updated_at_in_seconds)
current_app.logger.info(
f"Removing stale lock for process instance: {entry.process_instance_id} with locked_by:"
f" '{entry.locked_by}' because it has been locked for seconds: {locked_duration}"
)
# Release the lock by clearing both the owner and the lock timestamp.
entry.locked_by = None
entry.locked_at_in_seconds = None
db.session.add(entry)
# Persist the cleared lock.
db.session.commit()

View File

@ -51,6 +51,7 @@ class ProcessInstanceQueueService:
@classmethod
def _dequeue(cls, process_instance: ProcessInstanceModel) -> None:
locked_by = ProcessInstanceLockService.locked_by()
current_time = round(time.time())
db.session.query(ProcessInstanceQueueModel).filter(
ProcessInstanceQueueModel.process_instance_id == process_instance.id,
@ -58,6 +59,7 @@ class ProcessInstanceQueueService:
).update(
{
"locked_by": locked_by,
"locked_at_in_seconds": current_time,
}
)