Improve version handling of files. Consolidate more of this logic in FileService. Place the version on the actual data model, not the file model, so the file model stays unchanged and only the data associated with it is versioned.

This commit is contained in:
Dan Funk 2020-03-04 13:40:25 -05:00
parent 94f828dfd6
commit c5cee4761e
9 changed files with 140 additions and 103 deletions
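As orientation for the diff below: versioning moves off of FileModel and onto FileDataModel (which carries version, md5_hash and last_updated), the file row only keeps a latest_version pointer, and FileService.update_file skips creating a new data row when the uploaded bytes hash to the same MD5. The following is a minimal, self-contained sketch of that shape, for illustration only; the declarative setup, the String md5 column and the standalone update_file helper are simplifications assumed here, not the project's actual crc.models / crc.services code.

import hashlib
from datetime import datetime

from sqlalchemy import Column, DateTime, ForeignKey, Integer, LargeBinary, String
from sqlalchemy.orm import declarative_base, relationship

Base = declarative_base()


class FileModel(Base):
    __tablename__ = 'file'
    id = Column(Integer, primary_key=True)
    name = Column(String)
    # The file row itself is stable; it only remembers which data version is current.
    latest_version = Column(Integer, default=0)


class FileDataModel(Base):
    __tablename__ = 'file_data'
    id = Column(Integer, primary_key=True)
    version = Column(Integer, default=0)        # the version travels with the data
    md5_hash = Column(String, nullable=False)   # used to detect no-op updates
    data = Column(LargeBinary)
    last_updated = Column(DateTime, default=datetime.utcnow)
    file_model_id = Column(Integer, ForeignKey('file.id'))
    file_model = relationship("FileModel")


def update_file(session, file_model, binary_data):
    """Version the data, not the file: identical bytes keep the current version."""
    md5 = hashlib.md5(binary_data).hexdigest()
    current = (session.query(FileDataModel)
               .filter_by(file_model_id=file_model.id, version=file_model.latest_version)
               .first())
    if current is not None and current.md5_hash == md5:
        return file_model                       # same content, nothing to do
    new_version = 1 if current is None else current.version + 1
    file_model.latest_version = new_version
    session.add(FileDataModel(data=binary_data, md5_hash=md5,
                              version=new_version, file_model=file_model))
    session.commit()
    return file_model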

View File

@@ -8,6 +8,7 @@ from flask import send_file
from crc import session
from crc.api.common import ApiErrorSchema, ApiError
from crc.models.file import FileModelSchema, FileModel, FileDataModel, FileType
from crc.models.workflow import WorkflowSpecModel
from crc.services.file_service import FileService
@@ -34,7 +35,8 @@ def add_file(workflow_spec_id=None, study_id=None, workflow_id=None, task_id=Non
file = connexion.request.files['file']
if workflow_spec_id:
file_model = FileService.add_workflow_spec_file(workflow_spec_id, file.filename, file.content_type, file.stream.read())
workflow_spec = session.query(WorkflowSpecModel).filter_by(id=workflow_spec_id).first()
file_model = FileService.add_workflow_spec_file(workflow_spec, file.filename, file.content_type, file.stream.read())
else:
file_model = FileService.add_form_field_file(study_id, workflow_id, task_id, form_field_key, file.filename, file.content_type, file.stream.read())

View File

@@ -76,7 +76,7 @@ def __get_workflow_api_model(processor: WorkflowProcessor):
user_tasks=user_tasks,
workflow_spec_id=processor.workflow_spec_id
)
if(processor.next_task()):
if processor.next_task():
workflow_api.next_task = Task.from_spiff(processor.next_task())
return workflow_api

View File

@@ -3,6 +3,7 @@ import enum
from marshmallow_enum import EnumField
from marshmallow_sqlalchemy import ModelSchema
from sqlalchemy import func
from sqlalchemy.dialects.postgresql import UUID
from crc import db
@@ -30,10 +31,36 @@ class FileType(enum.Enum):
zip = 'zip'
CONTENT_TYPES = {
"bpmn": "text/xml",
"csv": "text/csv",
"dmn": "text/xml",
"doc": "application/msword",
"docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"gif": "image/gif",
"jpg": "image/jpeg",
"md" : "text/plain",
"pdf": "application/pdf",
"png": "image/png",
"ppt": "application/vnd.ms-powerpoint",
"pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
"rtf": "application/rtf",
"svg": "image/svg+xml",
"svg_xml": "image/svg+xml",
"txt": "text/plain",
"xls": "application/vnd.ms-excel",
"xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"xml": "application/xml",
"zip": "application/zip"
}
class FileDataModel(db.Model):
__tablename__ = 'file_data'
id = db.Column(db.Integer, primary_key=True)
md5_hash = db.Column(UUID(as_uuid=True), unique=False, nullable=False)
data = db.Column(db.LargeBinary)
version = db.Column(db.Integer, default=0)
last_updated = db.Column(db.DateTime(timezone=True), default=func.now())
file_model_id = db.Column(db.Integer, db.ForeignKey('file.id'))
file_model = db.relationship("FileModel")
@@ -42,8 +69,6 @@ class FileModel(db.Model):
__tablename__ = 'file'
id = db.Column(db.Integer, primary_key=True)
name = db.Column(db.String)
version = db.Column(db.Integer, default=0)
last_updated = db.Column(db.DateTime(timezone=True), default=func.now())
type = db.Column(db.Enum(FileType))
primary = db.Column(db.Boolean)
content_type = db.Column(db.String)
@@ -52,6 +77,7 @@ class FileModel(db.Model):
study_id = db.Column(db.Integer, db.ForeignKey('study.id'), nullable=True)
task_id = db.Column(db.String, nullable=True)
form_field_key = db.Column(db.String, nullable=True)
latest_version = db.Column(db.Integer, default=0)
class FileModelSchema(ModelSchema):

View File

@@ -1,12 +1,14 @@
import enum
import jinja2
import marshmallow
from jinja2 import Environment, BaseLoader, Undefined, Template
from jinja2 import Template
from marshmallow import INCLUDE
from marshmallow_enum import EnumField
from marshmallow_sqlalchemy import ModelSchema
from crc import db, ma
from crc.api.common import ApiError
class WorkflowSpecModel(db.Model):
@@ -38,6 +40,7 @@ class WorkflowModel(db.Model):
study_id = db.Column(db.Integer, db.ForeignKey('study.id'))
workflow_spec_id = db.Column(db.String, db.ForeignKey('workflow_spec.id'))
class Task(object):
def __init__(self, id, name, title, type, state, form, documentation, data):
self.id = id
@@ -72,8 +75,11 @@ class Task(object):
create loops, etc...'''
template = Template(documentation)
self.documentation = template.render(**self.data)
try:
self.documentation = template.render(**self.data)
except jinja2.exceptions.UndefinedError as ue:
raise ApiError(code="template_error", message="Error processing template for task %s: %s" %
(self.name, str(ue)), status_code=500)
class OptionSchema(ma.Schema):
class Meta:

View File

@@ -4,7 +4,7 @@ from jinja2 import UndefinedError
from crc import session
from crc.api.common import ApiError
from crc.models.file import FileModel, FileDataModel
from crc.models.file import FileModel, FileDataModel, CONTENT_TYPES
from crc.models.workflow import WorkflowSpecModel
from docxtpl import DocxTemplate
import jinja2
@@ -48,7 +48,7 @@ class CompleteTemplate(Script):
workflow_id = task.workflow.data[WorkflowProcessor.WORKFLOW_ID_KEY]
FileService.add_task_file(study_id=study_id, workflow_id=workflow_id, task_id=task.id,
name=file_name,
content_type=FileService.DOCX_MIME,
content_type=CONTENT_TYPES['docx'],
binary_data=final_document_stream.read())
print("Complete Task was called with %s" % str(args))

View File

@@ -1,24 +1,33 @@
import os
from datetime import datetime
from uuid import UUID
from xml.etree import ElementTree
from crc import session
from crc.api.common import ApiErrorSchema, ApiError
from crc.models.file import FileType, FileDataModel, FileModelSchema, FileModel
from crc.models.file import FileType, FileDataModel, FileModelSchema, FileModel, CONTENT_TYPES
from crc.models.workflow import WorkflowSpecModel
from crc.services.workflow_processor import WorkflowProcessor
import hashlib
class FileService(object):
"""Provides consistent management and rules for storing, retrieving and processing files."""
DOCX_MIME = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
@staticmethod
def add_workflow_spec_file(workflow_spec_id, name, content_type, binary_data):
def add_workflow_spec_file(workflow_spec: WorkflowSpecModel,
name, content_type, binary_data, primary=False):
"""Create a new file and associate it with a workflow spec."""
file_model = FileModel(
version=0,
workflow_spec_id=workflow_spec_id,
workflow_spec_id=workflow_spec.id,
name=name,
primary=primary
)
if primary:
bpmn: ElementTree.Element = ElementTree.fromstring(binary_data)
workflow_spec.primary_process_id = WorkflowProcessor.get_process_id(bpmn)
print("Locating Process Id for " + name + " " + workflow_spec.primary_process_id)
return FileService.update_file(file_model, binary_data, content_type)
@staticmethod
@@ -38,7 +47,6 @@ class FileService(object):
def add_task_file(study_id, workflow_id, task_id, name, content_type, binary_data):
"""Create a new file and associate it with an executing task within a workflow."""
file_model = FileModel(
version=0,
study_id=study_id,
workflow_id=workflow_id,
task_id=task_id,
@@ -49,9 +57,14 @@ class FileService(object):
@staticmethod
def update_file(file_model, binary_data, content_type):
file_model.version = file_model.version + 1
file_model.last_updated = datetime.now()
file_model.content_type = content_type
file_data_model = session.query(FileDataModel).\
filter_by(file_model_id=file_model.id,
version=file_model.latest_version
).with_for_update().first()
md5_checksum = UUID(hashlib.md5(binary_data).hexdigest())
if(file_data_model is not None and md5_checksum == file_data_model.md5_hash):
# This file does not need to be updated, it's the same file.
return file_model
# Verify the extension
basename, file_extension = os.path.splitext(file_model.name)
@@ -62,12 +75,16 @@ class FileService(object):
file_extension)), 404
else:
file_model.type = FileType[file_extension]
file_model.content_type = content_type
file_data_model = session.query(FileDataModel).filter_by(file_model_id=file_model.id).with_for_update().first()
if file_data_model is None:
file_data_model = FileDataModel(data=binary_data, file_model=file_model)
version = 1
else:
file_data_model.data = binary_data
version = file_data_model.version + 1
file_model.latest_version = version
file_data_model = FileDataModel(data=binary_data, file_model=file_model, version=version,
md5_hash=md5_checksum)
session.add_all([file_model, file_data_model])
session.commit()
@@ -94,4 +111,8 @@ class FileService(object):
@staticmethod
def get_file_data(file_id):
"""Returns the file_data that is associated with the file model id"""
return session.query(FileDataModel).filter(FileDataModel.file_model_id == file_id).first()
file_model = session.query(FileModel).filter(FileModel.id == file_id).first()
return session.query(FileDataModel)\
.filter(FileDataModel.file_model_id == file_id)\
.filter(FileDataModel.version == file_model.latest_version)\
.first()

View File

@@ -4,15 +4,23 @@ import os
import xml.etree.ElementTree as ElementTree
from crc import app, db, session
from crc.models.file import FileType, FileModel, FileDataModel
from crc.models.file import FileType, FileModel, FileDataModel, CONTENT_TYPES
from crc.models.study import StudyModel
from crc.models.user import UserModel
from crc.models.workflow import WorkflowSpecModel
from crc.services.file_service import FileService
from crc.services.workflow_processor import WorkflowProcessor
class ExampleDataLoader:
def make_data(self):
@staticmethod
def clean_db():
session.flush() # Clear out any transactions before deleting it all to avoid spurious errors.
for table in reversed(db.metadata.sorted_tables):
session.execute(table.delete())
session.flush()
def load_all(self):
users = [
UserModel(
uid='dhf8r',
@@ -25,6 +33,8 @@ class ExampleDataLoader:
title='SOFTWARE ENGINEER V'
)
]
db.session.add_all(users)
db.session.commit()
studies = [
StudyModel(
@@ -48,89 +58,47 @@ class ExampleDataLoader:
user_uid='dhf8r'
),
]
db.session.add_all(studies)
db.session.commit()
workflow_specifications = \
self.create_spec(id="crc2_training_session_enter_core_info",
name="crc2_training_session_enter_core_info",
display_name="CR Connect2 - Training Session - Core Info",
description='Part of Milestone 3 Deliverable')
workflow_specifications += \
self.create_spec(id="crc2_training_session_data_security_plan",
name="crc2_training_session_data_security_plan",
display_name="CR Connect2 - Training Session - Data Security Plan",
description='Part of Milestone 3 Deliverable')
workflow_specifications += \
self.create_spec(id="sponsor_funding_source",
name="sponsor_funding_source",
display_name="Sponsor and/or Funding Source ",
description='TBD')
# workflow_specifications += \
# self.create_spec(id="m2_demo",
# name="m2_demo",
# display_name="Milestone 2 Demo",
# description='A simplified CR Connect workflow for demonstration purposes.')
# workflow_specifications += \
# self.create_spec(id="crc_study_workflow",
# name="crc_study_workflow",
# display_name="CR Connect Study Workflow",
# description='Draft workflow for CR Connect studies.')
all_data = users + studies + workflow_specifications
return all_data
self.create_spec(id="crc2_training_session_enter_core_info",
name="crc2_training_session_enter_core_info",
display_name="CR Connect2 - Training Session - Core Info",
description='Part of Milestone 3 Deliverable')
self.create_spec(id="crc2_training_session_data_security_plan",
name="crc2_training_session_data_security_plan",
display_name="CR Connect2 - Training Session - Data Security Plan",
description='Part of Milestone 3 Deliverable')
self.create_spec(id="sponsor_funding_source",
name="sponsor_funding_source",
display_name="Sponsor and/or Funding Source ",
description='TBD')
def create_spec(self, id, name, display_name="", description="", filepath=None):
"""Assumes that a directory exists in static/bpmn with the same name as the given id.
further assumes that the [id].bpmn is the primary file for the workflow.
returns an array of data models to be added to the database."""
models = []
file_service = FileService()
spec = WorkflowSpecModel(id=id,
name=name,
display_name=display_name,
description=description)
models.append(spec)
db.session.add(spec)
db.session.commit()
if not filepath:
filepath = os.path.join(app.root_path, 'static', 'bpmn', id, "*")
files = glob.glob(filepath)
for file_path in files:
noise, file_extension = os.path.splitext(file_path)
filename = os.path.basename(file_path)
if file_extension.lower() == '.bpmn':
type = FileType.bpmn
elif file_extension.lower() == '.dmn':
type = FileType.dmn
elif file_extension.lower() == '.svg':
type = FileType.svg
elif file_extension.lower() == '.docx':
type = FileType.docx
else:
raise Exception("Unsupported file type:" + file_path)
continue
is_primary = filename.lower() == id + ".bpmn"
file_model = FileModel(name=filename, type=type, content_type='text/xml', version="1",
last_updated=datetime.datetime.now(), primary=is_primary,
workflow_spec_id=id)
models.append(file_model)
try:
file = open(file_path, "rb")
data = file.read()
if (is_primary):
bpmn: ElementTree.Element = ElementTree.fromstring(data)
spec.primary_process_id = WorkflowProcessor.get_process_id(bpmn)
print("Locating Process Id for " + filename + " " + spec.primary_process_id)
models.append(FileDataModel(data=data, file_model=file_model))
content_type = CONTENT_TYPES[file_extension[1:]]
file_service.add_workflow_spec_file(workflow_spec=spec, name=filename, content_type=content_type,
binary_data=data, primary=is_primary)
finally:
file.close()
return models
@staticmethod
def clean_db():
session.flush() # Clear out any transactions before deleting it all to avoid spurious errors.
for table in reversed(db.metadata.sorted_tables):
session.execute(table.delete())
session.flush()
def load_all(self):
for data in self.make_data():
session.add(data)
session.commit()
session.flush()
return spec

View File

@@ -92,15 +92,7 @@ class BaseTest(unittest.TestCase):
if session.query(WorkflowSpecModel).filter_by(id=dir_name).count() > 0:
return
filepath = os.path.join(app.root_path, '..', 'tests', 'data', dir_name, "*")
models = ExampleDataLoader().create_spec(id=dir_name, name=dir_name, filepath=filepath)
spec = None
for model in models:
if isinstance(model, WorkflowSpecModel):
spec = model
session.add(model)
session.commit()
session.flush()
return spec
return ExampleDataLoader().create_spec(id=dir_name, name=dir_name, filepath=filepath)
@staticmethod
def protocol_builder_response(file_name):

View File

@@ -25,7 +25,7 @@ class TestFilesApi(BaseTest):
def test_list_multiple_files_for_workflow_spec(self):
self.load_example_data()
spec = session.query(WorkflowSpecModel).first()
svgFile = FileModel(name="test.svg", type=FileType.svg, version=1, last_updated=datetime.now(),
svgFile = FileModel(name="test.svg", type=FileType.svg,
primary=False, workflow_spec_id=spec.id)
session.add(svgFile)
session.flush()
@@ -47,7 +47,6 @@ class TestFilesApi(BaseTest):
self.assertIsNotNone(rv.get_data())
json_data = json.loads(rv.get_data(as_text=True))
file = FileModelSchema().load(json_data, session=session)
self.assertEqual(1, file.version)
self.assertEqual(FileType.svg, file.type)
self.assertFalse(file.primary)
self.assertEqual("image/svg+xml", file.content_type)
@@ -89,13 +88,36 @@ class TestFilesApi(BaseTest):
self.assertIsNotNone(rv.get_data())
json_data = json.loads(rv.get_data(as_text=True))
file = FileModelSchema().load(json_data, session=session)
self.assertEqual(2, file.version)
self.assertEqual(2, file.latest_version)
self.assertEqual(FileType.bpmn, file.type)
self.assertEqual("application/octet-stream", file.content_type)
self.assertEqual(spec.id, file.workflow_spec_id)
data_model = session.query(FileDataModel).filter_by(file_model_id=file.id).first()
self.assertEqual(b"hijklim", data_model.data)
rv = self.app.get('/v1.0/file/%i/data' % file.id)
self.assert_success(rv)
data = rv.get_data()
self.assertIsNotNone(data)
self.assertEqual(b"hijklim", data)
def test_update_with_same_exact_data_does_not_increment_version(self):
self.load_example_data()
spec = session.query(WorkflowSpecModel).first()
data = {}
data['file'] = io.BytesIO(b"abcdef"), 'my_new_file.bpmn'
rv = self.app.post('/v1.0/file?workflow_spec_id=%s' % spec.id, data=data, follow_redirects=True,
content_type='multipart/form-data')
self.assertIsNotNone(rv.get_data())
json_data = json.loads(rv.get_data(as_text=True))
file = FileModelSchema().load(json_data, session=session)
self.assertEqual(1, file.latest_version)
data['file'] = io.BytesIO(b"abcdef"), 'my_new_file.bpmn'
rv = self.app.put('/v1.0/file/%i/data' % file.id, data=data, follow_redirects=True,
content_type='multipart/form-data')
self.assertIsNotNone(rv.get_data())
json_data = json.loads(rv.get_data(as_text=True))
file = FileModelSchema().load(json_data, session=session)
self.assertEqual(1, file.latest_version)
def test_get_file(self):
self.load_example_data()