# spiff-arena/spiffworkflow-backend/bin/data_migrations/run_all.py

import functools
import time

from flask import current_app
from spiffworkflow_backend import create_app
from spiffworkflow_backend.data_migrations.version_1_3 import VersionOneThree
from spiffworkflow_backend.data_migrations.version_2 import Version2
from spiffworkflow_backend.models.db import db
from spiffworkflow_backend.models.human_task import HumanTaskModel
from spiffworkflow_backend.models.process_instance import ProcessInstanceModel
from sqlalchemy import update


# simple decorator to time the func
# https://stackoverflow.com/a/11151365/6090676, thank you
def benchmark_log_func(func):
    """
    Decorator that logs how long each call to ``func`` takes.

    The elapsed time in seconds is written at debug level to the current Flask app's logger,
    so the decorated function has to be called inside an application context. The return
    value of ``func`` is passed through unchanged, and its metadata (``__name__``,
    ``__qualname__``, ``__doc__``, ...) is kept on the wrapper via ``functools.wraps``.
    The time is only logged when ``func`` returns normally.
    """

    @functools.wraps(func)
    def st_func(*args, **kwargs):
        # perf_counter is monotonic, so system clock adjustments cannot skew the measurement
        t1 = time.perf_counter()
        r = func(*args, **kwargs)
        t2 = time.perf_counter()
        # __qualname__ is the function name prefixed with any qualifying class names
        current_app.logger.debug(f"Function={func.__qualname__}, Time={t2 - t1}")
        return r

    return st_func


@benchmark_log_func
def put_serializer_version_onto_numeric_track() -> None:
    """Rewrite the legacy "1.0-spiffworkflow-backend" serializer version on process instances to "1"."""
    legacy_version = "1.0-spiffworkflow-backend"
    has_legacy_version = ProcessInstanceModel.spiff_serializer_version == legacy_version
    statement = update(ProcessInstanceModel).where(has_legacy_version).values(spiff_serializer_version="1")
    db.session.execute(statement)
    db.session.commit()


@benchmark_log_func
def remove_duplicate_human_task_rows() -> None:
    """
    Delete duplicate human task rows, keeping the newest row of each duplicate group.

    Rows count as duplicates when they share both process_instance_id and task_guid. Within
    each such group the row with the greatest created_at_in_seconds is kept and the others
    are deleted; all deletions are committed together at the end.
    """
    duplicate_groups = (
        db.session.query(HumanTaskModel.process_instance_id, HumanTaskModel.task_guid, db.func.count().label("ct"))
        .group_by(HumanTaskModel.task_guid, HumanTaskModel.process_instance_id)
        .having(db.func.count() > 1)
        .all()
    )

    rows_to_delete = []
    for group in duplicate_groups:
        # GROUP BY collects all NULL task_guids of a process instance into one group, but rows
        # without a guid cannot be identified as duplicates of each other, so leave them alone
        # rather than deleting distinct tasks.
        if group.task_guid is None:
            continue
        # look the rows up by the full grouping key so rows of other process instances are never touched
        human_tasks = (
            HumanTaskModel.query.filter_by(process_instance_id=group.process_instance_id, task_guid=group.task_guid)
            .order_by(HumanTaskModel.created_at_in_seconds.desc())
            .all()
        )
        # newest row comes first; everything after it is a duplicate
        rows_to_delete.extend(human_tasks[1:])
    for row in rows_to_delete:
        db.session.delete(row)
    db.session.commit()


@benchmark_log_func
def backfill_task_guid_for_human_tasks() -> None:
    """Copy task_id into task_guid for every human task row whose task_guid is NULL."""
    guid_is_missing = HumanTaskModel.task_guid.is_(None)
    statement = update(HumanTaskModel).where(guid_is_missing).values(task_guid=HumanTaskModel.task_id)
    db.session.execute(statement)
    db.session.commit()


def all_potentially_relevant_process_instances() -> list[ProcessInstanceModel]:
    """Return the process instances whose serializer version is below Version2's and whose status is non-terminal."""
    below_version_2 = ProcessInstanceModel.spiff_serializer_version < Version2.version()
    has_non_terminal_status = ProcessInstanceModel.status.in_(ProcessInstanceModel.non_terminal_statuses())
    candidates = ProcessInstanceModel.query.filter(below_version_2, has_non_terminal_status)
    return candidates.all()


@benchmark_log_func
def run_version_1() -> None:
    """Instantiate VersionOneThree and run it."""
    # TODO: make run a class method so no instance is needed
    migration = VersionOneThree()
    migration.run()


@benchmark_log_func
def run_version_2(process_instances: list[ProcessInstanceModel]) -> None:
    """Hand the given process instances to Version2.run."""
    Version2.run(process_instances)


def main() -> None:
    """Create the app, then run the data migrations inside its app context, logging timings at debug level."""
    create_app_started_at = time.time()
    app = create_app()
    create_app_seconds = time.time() - create_app_started_at

    with app.app_context():
        logger = current_app.logger
        logger.debug(f"data_migrations/run_all::create_app took {create_app_seconds} seconds")

        migrations_started_at = time.time()
        put_serializer_version_onto_numeric_track()
        remove_duplicate_human_task_rows()
        backfill_task_guid_for_human_tasks()

        process_instances = all_potentially_relevant_process_instances()
        logger.debug(f"Found potentially relevant process_instances: {len(process_instances)}")
        # version 1 is only run when at least one candidate instance was found
        if process_instances:
            run_version_1()
            # this will run while using the new per instance on demand data migration framework
            # run_version_2(process_instances)

        migrations_seconds = time.time() - migrations_started_at
        logger.debug(
            f"done running data migrations in ./bin/data_migrations/run_all.py. took {migrations_seconds} seconds"
        )


# run the migrations only when this file is executed as a script, not when it is imported
if __name__ == "__main__":
    main()