# cr-connect-workflow/crc/models/file.py

import enum
from typing import cast

from marshmallow_enum import EnumField
from marshmallow_sqlalchemy import SQLAlchemyAutoSchema
from sqlalchemy import func, Index
from sqlalchemy.dialects.postgresql import UUID

from crc import db, ma


class FileType(enum.Enum):
    """File extensions known to the system.

    Each member's value is the extension text. It equals the member name for
    every member except ``svg_xml``, whose value is ``"svg+xml"``.

    NOTE(review): in this file the enum is handed to ``db.Enum`` and
    ``EnumField`` with default options, which presumably key on member
    *names* rather than values -- confirm that no stored data holds the old
    misspelled ``"bpmm"`` value before relying on the corrected one.
    """
    bpmn = "bpmn"  # previously misspelled as "bpmm"
    csv = 'csv'
    dmn = "dmn"
    doc = "doc"
    docx = "docx"
    gif = 'gif'
    jpg = 'jpg'
    md = 'md'
    pdf = 'pdf'
    png = 'png'
    ppt = 'ppt'
    pptx = 'pptx'
    rtf = 'rtf'
    svg = "svg"
    svg_xml = "svg+xml"
    txt = 'txt'
    xls = 'xls'
    xlsx = 'xlsx'
    xml = 'xml'
    zip = 'zip'

    @classmethod
    def _missing_(cls, value):
        """Resolve the historical misspelling ``"bpmm"`` to ``FileType.bpmn``.

        Called by ``enum`` only when ``FileType(value)`` finds no member with
        that value. Returning ``None`` lets the normal ``ValueError`` be
        raised for anything else.
        """
        if value == "bpmm":
            return cls.bpmn
        return None


# MIME content type for each supported file extension.
# Keys mirror the member names of FileType (note "svg_xml", not "svg+xml").
CONTENT_TYPES = {
    "bpmn": "text/xml",
    "csv": "text/csv",
    "dmn": "text/xml",
    "doc": "application/msword",
    "docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    "gif": "image/gif",
    "jpg": "image/jpeg",
    "md": "text/plain",
    "pdf": "application/pdf",
    "png": "image/png",
    "ppt": "application/vnd.ms-powerpoint",
    "pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
    "rtf": "application/rtf",
    "svg": "image/svg+xml",
    "svg_xml": "image/svg+xml",
    "txt": "text/plain",
    "xls": "application/vnd.ms-excel",
    "xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    "xml": "application/xml",
    "zip": "application/zip",
}

class FileDataModel(db.Model):
    """A stored version of a file's content (table ``file_data``).

    Each row carries its own ``version`` number and links back to the owning
    ``FileModel`` through ``file_model_id``.
    """
    __tablename__ = 'file_data'

    id = db.Column(db.Integer, primary_key=True)
    # NOTE(review): presumably a digest of ``data`` kept in a UUID column --
    # confirm against the code that writes it.
    md5_hash = db.Column(
        UUID(as_uuid=True),
        unique=False,
        nullable=False,
    )
    # Binary payload of the file.
    data = db.Column(db.LargeBinary)
    # Version of this payload; 0 when no value is supplied.
    version = db.Column(db.Integer, default=0)
    # Timezone-aware timestamp, defaulting to the SQL ``now()`` function.
    last_updated = db.Column(
        db.DateTime(timezone=True),
        default=func.now(),
    )
    file_model_id = db.Column(db.Integer, db.ForeignKey('file.id'))
    file_model = db.relationship("FileModel")


class FileModel(db.Model):
    """Metadata describing a file (table ``file``).

    A file may optionally be tied to a workflow specification, a workflow,
    and/or a study through the nullable foreign keys below.
    """
    __tablename__ = 'file'

    id = db.Column(db.Integer, primary_key=True)
    name = db.Column(db.String)
    type = db.Column(db.Enum(FileType))
    is_status = db.Column(db.Boolean)
    content_type = db.Column(db.String)

    # A global reference file.
    is_reference = db.Column(db.Boolean, nullable=False, default=False)
    # Is this the primary BPMN in a workflow?
    primary = db.Column(db.Boolean, nullable=False, default=False)
    # An id in the xml of BPMN documents, critical for primary BPMN.
    primary_process_id = db.Column(db.String, nullable=True)

    workflow_spec_id = db.Column(
        db.String,
        db.ForeignKey('workflow_spec.id'),
        nullable=True,
    )
    workflow_id = db.Column(
        db.Integer,
        db.ForeignKey('workflow.id'),
        nullable=True,
    )
    study_id = db.Column(
        db.Integer,
        db.ForeignKey('study.id'),
        nullable=True,
    )
    task_id = db.Column(db.String, nullable=True)

    # Code reference to the irb_documents.xlsx reference file.
    irb_doc_code = db.Column(db.String, nullable=True)
    form_field_key = db.Column(db.String, nullable=True)
    latest_version = db.Column(db.Integer, default=0)


class FileModelSchema(SQLAlchemyAutoSchema):
    """Auto-generated marshmallow schema for ``FileModel``.

    Foreign-key columns and relationships are included, and ``type`` is
    declared explicitly as an ``EnumField`` over ``FileType``.
    """

    class Meta:
        # Options for the auto-generated schema.
        include_fk = True  # expose foreign-key columns
        include_relationships = True
        load_instance = True
        model = FileModel

    type = EnumField(FileType)


class LookupFileModel(db.Model):
    """Metadata for a key/value lookup table built from a file's content.

    The content of a file (such as an xlsx or csv file) is turned into a
    key/value store usable for lookups and searches. This table records which
    version of the file was used (``file_data_model_id``) and which columns
    supplied the label and the value. The same spreadsheet may therefore have
    several lookup file models -- one per combination of columns, or a new one
    after someone changes the file. Full text search is meant to cover only
    the label and value columns, not every column, because the amount of
    information in the remaining columns is unknown.
    """
    __tablename__ = 'lookup_file'

    id = db.Column(db.Integer, primary_key=True)
    label_column = db.Column(db.String)
    value_column = db.Column(db.String)
    file_data_model_id = db.Column(
        db.Integer,
        db.ForeignKey('file_data.id'),
    )


class LookupDataModel(db.Model):
    """One value/label entry of a lookup table (table ``lookup_data``).

    Every entry belongs to a ``LookupFileModel``. ``label`` is covered by a
    GIN text-search index; ``data`` keeps the full row as JSON.
    """
    __tablename__ = 'lookup_data'

    id = db.Column(db.Integer, primary_key=True)
    lookup_file_model_id = db.Column(
        db.Integer,
        db.ForeignKey('lookup_file.id'),
    )
    lookup_file_model = db.relationship(LookupFileModel)
    value = db.Column(db.String)
    label = db.Column(db.String)
    # All data for the row, stored as JSON; it is not searched at present.
    # A separate "search" column may be added later for searching things
    # that are not in the label.
    data = db.Column(db.JSON)

    # Searchable index on the label column so results come back quickly.
    # Example query:
    #   LookupDataModel.query.filter(LookupDataModel.label.match("INTERNAL")).all()
    # The 'simple' configuration is used instead of 'english' to keep stop
    # words in place. ``label`` must already be defined above this point,
    # because the index expression refers to it.
    __table_args__ = (
        Index(
            'ix_lookupdata_tsv',
            func.to_tsvector('simple', label),
            postgresql_using='gin',
        ),
    )


class LookupDataSchema(SQLAlchemyAutoSchema):
    """Auto-generated marshmallow schema for ``LookupDataModel``.

    Both relationships and foreign-key columns are switched off.
    """

    class Meta:
        # Options for the auto-generated schema.
        include_fk = False  # foreign keys are NOT included
        include_relationships = False
        load_instance = True
        model = LookupDataModel


class SimpleFileSchema(ma.Schema):
    """Minimal file schema that lists only the ``name`` field."""

    class Meta:
        # Only the file name is listed in ``fields``.
        fields = ["name"]
        model = FileModel
Refactor models into separate directories 2019-12-31 21:32:47 +00:00			`import enum`
Create lookup tables for XLS files referenced in workflows so we can do full text searches and populate lists on the fly quickly. 2020-04-22 19:37:02 +00:00			`from typing import cast`
Refactor models into seperate directories 2019-12-31 21:32:47 +00:00
			`from marshmallow_enum import EnumField`
Resolves marshmallow_sqlalchemy.ModelSchema deprecation warning 2020-03-16 17:37:31 +00:00			`from marshmallow_sqlalchemy import SQLAlchemyAutoSchema`
Adding a new reference file that provides greater details about the investigators related to a study. Improving the study_info script documentation to provide detailed examples of values returned based on arguments. Making the tests a little more targeted and less subject to breaking through better mocks. Allow all tests to pass even when the protocol builder mock isn't running locally. Removing the duplication of reference files in tests and static, as this seems silly to me at the moment. 2020-05-07 17:57:24 +00:00			`from sqlalchemy import func, Index`
Improve version handling of files. Consolidate more of this logic in FileService. Place the version on the actual data model, not the file model, so the file model remains the same, and we just version the data associated with it. 2020-03-04 18:40:25 +00:00			`from sqlalchemy.dialects.postgresql import UUID`
Refactor models into seperate directories 2019-12-31 21:32:47 +00:00
Adding serializer for study files 2020-05-20 21:10:22 +00:00			`from crc import db, ma`
Refactor models into seperate directories 2019-12-31 21:32:47 +00:00

			`class FileType(enum.Enum):`
			`bpmn = "bpmm"`
Updates list of allowed files. Removes unnecessary migrations. Note this will require a full wipe & reset of the database to work. 2020-02-05 22:23:37 +00:00			`csv = 'csv'`
Refactor models into seperate directories 2019-12-31 21:32:47 +00:00			`dmn = "dmn"`
Updates list of allowed files. Removes unnecessary migrations. Note this will require a full wipe & reset of the database to work. 2020-02-05 22:23:37 +00:00			`doc = "doc"`
Adds workflow_id to file endpoints. 2020-02-04 19:26:53 +00:00			`docx = "docx"`
			`gif = 'gif'`
			`jpg = 'jpg'`
Updates list of allowed files. Removes unnecessary migrations. Note this will require a full wipe & reset of the database to work. 2020-02-05 22:23:37 +00:00			`md = 'md'`
Adds workflow_id to file endpoints. 2020-02-04 19:26:53 +00:00			`pdf = 'pdf'`
			`png = 'png'`
Updates list of allowed files. Removes unnecessary migrations. Note this will require a full wipe & reset of the database to work. 2020-02-05 22:23:37 +00:00			`ppt = 'ppt'`
			`pptx = 'pptx'`
			`rtf = 'rtf'`
Adds workflow_id to file endpoints. 2020-02-04 19:26:53 +00:00			`svg = "svg"`
Updates list of allowed files. Removes unnecessary migrations. Note this will require a full wipe & reset of the database to work. 2020-02-05 22:23:37 +00:00			`svg_xml = "svg+xml"`
			`txt = 'txt'`
			`xls = 'xls'`
Adds workflow_id to file endpoints. 2020-02-04 19:26:53 +00:00			`xlsx = 'xlsx'`
Updates list of allowed files. Removes unnecessary migrations. Note this will require a full wipe & reset of the database to work. 2020-02-05 22:23:37 +00:00			`xml = 'xml'`
Adds workflow_id to file endpoints. 2020-02-04 19:26:53 +00:00			`zip = 'zip'`
Refactor models into seperate directories 2019-12-31 21:32:47 +00:00
Adds default_value to Task schema 2020-02-18 15:14:03 +00:00
Improve version handling of files. Consolidate more of this logic in FileService. Place the version on the actual data model, not the file model, so the file model remains the same, and we just version the data associated with it. 2020-03-04 18:40:25 +00:00			`CONTENT_TYPES = {`
			`"bpmn": "text/xml",`
			`"csv": "text/csv",`
			`"dmn": "text/xml",`
			`"doc": "application/msword",`
			`"docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",`
			`"gif": "image/gif",`
			`"jpg": "image/jpeg",`
			`"md" : "text/plain",`
			`"pdf": "application/pdf",`
			`"png": "image/png",`
			`"ppt": "application/vnd.ms-powerpoint",`
			`"pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",`
			`"rtf": "application/rtf",`
			`"svg": "image/svg+xml",`
			`"svg_xml": "image/svg+xml",`
			`"txt": "text/plain",`
			`"xls": "application/vnd.ms-excel",`
			`"xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",`
			`"xml": "application/xml",`
			`"zip": "application/zip"`
			`}`

Refactor models into seperate directories 2019-12-31 21:32:47 +00:00			`class FileDataModel(db.Model):`
			`__tablename__ = 'file_data'`
			`id = db.Column(db.Integer, primary_key=True)`
Improve version handling of files. Consolidate more of this logic in FileService. Place the version on the actual data model, not the file model, so the file model remains the same, and we just version the data associated with it. 2020-03-04 18:40:25 +00:00			`md5_hash = db.Column(UUID(as_uuid=True), unique=False, nullable=False)`
Refactor models into seperate directories 2019-12-31 21:32:47 +00:00			`data = db.Column(db.LargeBinary)`
Improve version handling of files. Consolidate more of this logic in FileService. Place the version on the actual data model, not the file model, so the file model remains the same, and we just version the data associated with it. 2020-03-04 18:40:25 +00:00			`version = db.Column(db.Integer, default=0)`
			`last_updated = db.Column(db.DateTime(timezone=True), default=func.now())`
Refactor models into seperate directories 2019-12-31 21:32:47 +00:00			`file_model_id = db.Column(db.Integer, db.ForeignKey('file.id'))`
			`file_model = db.relationship("FileModel")`

Adds endpoints for creating and updating a Study. 2020-01-03 16:44:24 +00:00
Refactor models into seperate directories 2019-12-31 21:32:47 +00:00			`class FileModel(db.Model):`
			`__tablename__ = 'file'`
			`id = db.Column(db.Integer, primary_key=True)`
			`name = db.Column(db.String)`
			`type = db.Column(db.Enum(FileType))`
Adds is_status flag to workflow specs 2020-03-13 18:56:46 +00:00			`is_status = db.Column(db.Boolean)`
Refactor models into seperate directories 2019-12-31 21:32:47 +00:00			`content_type = db.Column(db.String)`
Add the ability to upload and request general reference files by name. These will be used across workflows and will frequently contain lookup tables that can be referenced by various script tasks. 2020-03-13 19:03:57 +00:00			`is_reference = db.Column(db.Boolean, nullable=False, default=False) # A global reference file.`
Found a few errors in the sqlalchemy file definition that was causing failures, and had some consistency problems with the IRB Categories file name. The API was bailing out because we had restricted file types to bpmn,svg,or dmn in the connexion config file, I don't restrict this anymore we have plenty of checks elsewhere. Adding xlrd as a dependency - this didn't fail till a push to production. 2020-03-20 12:21:21 +00:00			`primary = db.Column(db.Boolean, nullable=False, default=False) # Is this the primary BPMN in a workflow?`
If you add a file to a workflow that has the exact same name as a Task Spec's ID, and an extension of "md", it will use that file as the markdown content, and ignore the markdown in the documentation on the task spec. Moving the primary process id from the workflow model to the file model, and assuring it is updated properly. This was causing a bug that would "lose" the workflow. 2020-04-17 17:30:32 +00:00			`primary_process_id = db.Column(db.String, nullable=True) # An id in the xml of BPMN documents, critical for primary BPMN.`
Wipes out old migrations, since changing foreign key constraints in SQLite requires a full database reset anyways. 2020-02-04 14:57:02 +00:00			`workflow_spec_id = db.Column(db.String, db.ForeignKey('workflow_spec.id'), nullable=True)`
			`workflow_id = db.Column(db.Integer, db.ForeignKey('workflow.id'), nullable=True)`
			`study_id = db.Column(db.Integer, db.ForeignKey('study.id'), nullable=True)`
			`task_id = db.Column(db.String, nullable=True)`
Assure that files uploaded through web forms and files generated from templates can be cross-referenced to known document requirements from the protocol builder. Configurators can control this by managing an XLS Spreadsheet called "irb_documents.xlsx". Required Documents is becoming complicated, so making this its own script task, removing it from study_info.py. The file_service is now very aware of this irb_documents file, so it will always need to exist. We seed this file during setup, but it can be overwritten by the configurator. 2020-03-19 21:13:30 +00:00			`irb_doc_code = db.Column(db.String, nullable=True) # Code reference to the irb_documents.xlsx reference file.`
Adds form field key to file model. 2020-02-05 19:55:31 +00:00			`form_field_key = db.Column(db.String, nullable=True)`
Improve version handling of files. Consolidate more of this logic in FileService. Place the version on the actual data model, not the file model, so the file model remains the same, and we just version the data associated with it. 2020-03-04 18:40:25 +00:00			`latest_version = db.Column(db.Integer, default=0)`
Refactor models into seperate directories 2019-12-31 21:32:47 +00:00

Made some modifications to the Approval so that it knows exactly what versions of every file are being sent for approval Added the following columns: * date_created - so we know when the file was created * renamed workflow_version to just "version", because everything has a version, this is the version of the request. * workflow_hash - this is just a quick way to see what files and versions are associated with the request, it could be factored out. * study - a quick relationship link to the study, so that this model is easier to use. * workflow - ditto * approval_files - these is a list from a new link table that links an approval to specific files and versions. The RequestApproval is logically sound, but still needs some additional pieces in place to be callable from a BPMN workflow diagram. Altered the file service to pick up on changes to files vs adding new files, so that versions are picked up correctly as users modify their submission - adding new files or replacing existing ones. Deleting files worries me, and I will need to revisit this. The damn base test keeps giving me a headache, so I made changes there to see if clearing and dropping the database each time won't allow the tests to pass more consistently. Lots more tests around the file service to make sure it is versioning user uploaded files correctly. The "Test Request Approval Script" tries to find to assure the correct behavior as this is likely to be called many times repeatedly and with little knowledge of the internal system. So it should just "do the right thing". 2020-05-23 19:08:17 +00:00
Resolves marshmallow_sqlalchemy.ModelSchema deprecation warning 2020-03-16 17:37:31 +00:00			`class FileModelSchema(SQLAlchemyAutoSchema):`
Refactor models into seperate directories 2019-12-31 21:32:47 +00:00			`class Meta:`
			`model = FileModel`
Resolves marshmallow_sqlalchemy.ModelSchema deprecation warning 2020-03-16 17:37:31 +00:00			`load_instance = True`
			`include_relationships = True`
Includes workflow spec id 2020-01-13 22:52:37 +00:00			`include_fk = True # Includes foreign keys`
Refactor models into seperate directories 2019-12-31 21:32:47 +00:00			`type = EnumField(FileType)`
Create lookup tables for XSL files referenced in workflows so we can do full text searches and populate lists on the fly quickly. 2020-04-22 19:37:02 +00:00

			`class LookupFileModel(db.Model):`
			`"""Takes the content of a file (like a xlsx, or csv file) and creates a key/value`
			`store that can be used for lookups and searches. This table contains the metadata,`
			`so we know the version of the file that was used, and what key column, and value column`
			`were used to generate this lookup table. ie, the same xls file might have multiple`
			`lookup file models, if different keys and labels are used - or someone decides to`
			`make a change. We need to handle full text search over the label and value columns,`
			`and not every column, because we don't know how much information will be in there. """`
			`__tablename__ = 'lookup_file'`
			`id = db.Column(db.Integer, primary_key=True)`
			`label_column = db.Column(db.String)`
			`value_column = db.Column(db.String)`
			`file_data_model_id = db.Column(db.Integer, db.ForeignKey('file_data.id'))`


			`class LookupDataModel(db.Model):`
			`__tablename__ = 'lookup_data'`
			`id = db.Column(db.Integer, primary_key=True)`
			`lookup_file_model_id = db.Column(db.Integer, db.ForeignKey('lookup_file.id'))`
			`lookup_file_model = db.relationship(LookupFileModel)`
			`value = db.Column(db.String)`
			`label = db.Column(db.String)`
			`# In the future, we might allow adding an additional "search" column if we want to search things not in label.`
			`data = db.Column(db.JSON) # all data for the row is stored in a json structure here, but not searched presently.`

			`# Assure there is a searchable index on the label column, so we can get fast results back.`
			`# query with:`
			`# search_results = LookupDataModel.query.filter(LookupDataModel.label.match("INTERNAL")).all()`

			`__table_args__ = (`
			`Index(`
			`'ix_lookupdata_tsv',`
better overall search results for type ahead. Still dealing with stop words failing. 2020-04-23 16:05:08 +00:00			`func.to_tsvector('simple', label), # Use simple, not english to keep stop words in place.`
Create lookup tables for XSL files referenced in workflows so we can do full text searches and populate lists on the fly quickly. 2020-04-22 19:37:02 +00:00			`postgresql_using='gin'`
			`),`
			`)`

Adding an API Endpoint that will return a list of LookupValues that match a given query - can be used to populate an auto-complete table. 2020-04-22 23:40:40 +00:00
			`class LookupDataSchema(SQLAlchemyAutoSchema):`
			`class Meta:`
			`model = LookupDataModel`
			`load_instance = True`
			`include_relationships = False`
			`include_fk = False # Includes foreign keys`

Adding serialiazer for study files 2020-05-20 21:10:22 +00:00
			`class SimpleFileSchema(ma.Schema):`

			`class Meta:`
			`model = FileModel`
			`fields = ["name"]`