import hashlib
import json
import os
from datetime import datetime
from uuid import UUID
from xml.etree import ElementTree

from pandas import ExcelFile

from crc import session
from crc.api.common import ApiError
from crc.models.file import FileType, FileDataModel, FileModel, LookupFileModel, LookupDataModel
from crc.models.workflow import WorkflowSpecModel
from crc.services.workflow_processor import WorkflowProcessor


class FileService(object):
    """Provides consistent management and rules for storing, retrieving and processing files."""

    DOCUMENT_LIST = "irb_documents.xlsx"
    INVESTIGATOR_LIST = "investigators.xlsx"

    @staticmethod
    def add_workflow_spec_file(workflow_spec: WorkflowSpecModel,
                               name, content_type, binary_data, primary=False, is_status=False):
        """Create a new file and associate it with a workflow spec."""
        file_model = FileModel(
            workflow_spec_id=workflow_spec.id,
            name=name,
            primary=primary,
            is_status=is_status,
        )
        return FileService.update_file(file_model, binary_data, content_type)

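    # A minimal usage sketch (the spec instance and file contents are hypothetical;
    # update_file() below performs the actual storage and will reject any extension
    # that is not a known FileType):
    #
    #   spec = session.query(WorkflowSpecModel).first()
    #   FileService.add_workflow_spec_file(spec, "process.bpmn", "text/xml",
    #                                      b"<definitions>...</definitions>",
    #                                      primary=True)
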
    @staticmethod
    def is_allowed_document(code):
        """Return True if the given code appears in the 'code' column of the
        irb_documents.xlsx reference file."""
        data_model = FileService.get_reference_file_data(FileService.DOCUMENT_LIST)
        xls = ExcelFile(data_model.data)
        df = xls.parse(xls.sheet_names[0])
        return code in df['code'].values

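    # For example (assuming "UVACompl_PRCAppr" is one of the codes defined in the
    # irb_documents.xlsx reference file, which must already have been loaded via
    # add_reference_file):
    #
    #   FileService.is_allowed_document("UVACompl_PRCAppr")  # -> True
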
    @staticmethod
    def add_form_field_file(study_id, workflow_id, task_id, form_field_key, name, content_type, binary_data):
        """Create a new file and associate it with a user task form field within a workflow.
        Please note that the form_field_key MUST be a known document code in the
        irb_documents.xlsx reference file."""
        if not FileService.is_allowed_document(form_field_key):
            raise ApiError("invalid_form_field_key",
                           "When uploading files, the form field id must match a known document in the "
                           "irb_documents.xlsx reference file. The code '%s' is not found in that file." % form_field_key)

        file_model = FileModel(
            study_id=study_id,
            workflow_id=workflow_id,
            task_id=task_id,
            name=name,
            form_field_key=form_field_key,
            irb_doc_code=form_field_key
        )
        return FileService.update_file(file_model, binary_data, content_type)

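    # A sketch of the validation behavior (the ids and document code are hypothetical):
    #
    #   FileService.add_form_field_file(study_id=1, workflow_id=2, task_id="t1",
    #                                   form_field_key="UVACompl_PRCAppr",
    #                                   name="approval.pdf",
    #                                   content_type="application/pdf",
    #                                   binary_data=b"%PDF-1.4 ...")
    #
    # An unknown form_field_key raises ApiError("invalid_form_field_key", ...).
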
    @staticmethod
    def get_reference_data(reference_file_name, index_column, int_columns=None):
        """Open a reference file (assumed to be an Excel file) and return its data as a
        dictionary, with each row keyed on the given index_column name. If there are
        columns that should be represented as integers, pass these as an array of
        int_columns, lest you get '1.0' rather than '1'."""
        if int_columns is None:  # Avoid a shared mutable default argument.
            int_columns = []
        data_model = FileService.get_reference_file_data(reference_file_name)
        xls = ExcelFile(data_model.data)
        df = xls.parse(xls.sheet_names[0])
        for c in int_columns:
            df[c] = df[c].fillna(0)
            df = df.astype({c: 'Int64'})
        df = df.fillna('')
        df = df.applymap(str)
        df = df.set_index(index_column)
        return json.loads(df.to_json(orient='index'))

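    # The result is a plain dict keyed on index_column, with every value stringified.
    # For instance (the column names and values here are hypothetical):
    #
    #   docs = FileService.get_reference_data("irb_documents.xlsx", "code",
    #                                         int_columns=["count"])
    #   # docs == {"UVACompl_PRCAppr": {"category": "Cancer Center", "count": "1"},
    #   #          ...}
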
    @staticmethod
    def add_task_file(study_id, workflow_id, workflow_spec_id, task_id, name, content_type, binary_data,
                      irb_doc_code=None):
        """Create a new file and associate it with an executing task within a workflow."""
        file_model = FileModel(
            study_id=study_id,
            workflow_id=workflow_id,
            workflow_spec_id=workflow_spec_id,
            task_id=task_id,
            name=name,
            irb_doc_code=irb_doc_code
        )
        return FileService.update_file(file_model, binary_data, content_type)

    @staticmethod
    def add_reference_file(name, content_type, binary_data):
        """Create a file with the given name, but not associated with a spec or workflow.
        Only one file with the given reference name can exist."""
        file_model = session.query(FileModel). \
            filter(FileModel.is_reference == True). \
            filter(FileModel.name == name).first()
        if not file_model:
            file_model = FileModel(
                name=name,
                is_reference=True
            )
        return FileService.update_file(file_model, binary_data, content_type)

    @staticmethod
    def get_extension(file_name):
        """Return the file's extension, lower-cased and without the leading dot."""
        basename, file_extension = os.path.splitext(file_name)
        return file_extension.lower().strip()[1:]

    @staticmethod
    def update_file(file_model, binary_data, content_type):
        file_data_model = session.query(FileDataModel). \
            filter_by(file_model_id=file_model.id,
                      version=file_model.latest_version
                      ).with_for_update().first()
        md5_checksum = UUID(hashlib.md5(binary_data).hexdigest())
        if file_data_model is not None and md5_checksum == file_data_model.md5_hash:
            # This file does not need to be updated, it's the same file.
            return file_model

        # Verify the extension
        file_extension = FileService.get_extension(file_model.name)
        if file_extension not in FileType._member_names_:
            raise ApiError('unknown_extension',
                           'The file you provided does not have an accepted extension: ' +
                           file_extension, status_code=404)
        else:
            file_model.type = FileType[file_extension]
            file_model.content_type = content_type

        if file_data_model is None:
            version = 1
        else:
            version = file_data_model.version + 1

        # If this is a BPMN, extract the process id.
        if file_model.type == FileType.bpmn:
            bpmn: ElementTree.Element = ElementTree.fromstring(binary_data)
            file_model.primary_process_id = WorkflowProcessor.get_process_id(bpmn)

        file_model.latest_version = version
        file_data_model = FileDataModel(data=binary_data, file_model=file_model, version=version,
                                        md5_hash=md5_checksum, last_updated=datetime.now())

        session.add_all([file_model, file_data_model])
        session.commit()
        session.flush()  # Assure the id is set on the model before returning it.
        return file_model

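    # Versioning behavior, in brief: update_file() hashes the incoming bytes and
    # returns early when they match the stored md5_hash, so re-uploading identical
    # content never creates a new version. Otherwise the version counter advances.
    # A sketch (file name and bytes are hypothetical):
    #
    #   fm = FileService.add_reference_file("codes.xlsx", "application/vnd.ms-excel",
    #                                       b"...")
    #   fm = FileService.update_file(fm, b"...", "application/vnd.ms-excel")
    #   # Same bytes as before: fm.latest_version is still 1.
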
    @staticmethod
    def get_files(workflow_spec_id=None,
                  study_id=None, workflow_id=None, task_id=None, form_field_key=None,
                  name=None, is_reference=False, irb_doc_code=None):
        """Return all FileModels matching the given filters. Filters left at None
        are not applied; is_reference always is."""
        query = session.query(FileModel).filter_by(is_reference=is_reference)
        if workflow_spec_id:
            query = query.filter_by(workflow_spec_id=workflow_spec_id)
        if study_id:
            query = query.filter_by(study_id=study_id)
        if workflow_id:
            query = query.filter_by(workflow_id=workflow_id)
        if task_id:
            query = query.filter_by(task_id=str(task_id))
        if form_field_key:
            query = query.filter_by(form_field_key=form_field_key)
        if name:
            query = query.filter_by(name=name)
        if irb_doc_code:
            query = query.filter_by(irb_doc_code=irb_doc_code)

        results = query.all()
        return results

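    # Filters compose with AND semantics, e.g. (hypothetical ids):
    #
    #   FileService.get_files(study_id=1, irb_doc_code="UVACompl_PRCAppr")
    #   # -> every file on study 1 that carries that IRB document code.
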
    @staticmethod
    def get_file_data(file_id, file_model=None):
        """Return the file data associated with the given file model id. If an
        actual file_model is provided, use it rather than looking it up again."""
        if file_model is None:
            file_model = session.query(FileModel).filter(FileModel.id == file_id).first()
        return session.query(FileDataModel) \
            .filter(FileDataModel.file_model_id == file_id) \
            .filter(FileDataModel.version == file_model.latest_version) \
            .first()

    @staticmethod
    def get_reference_file_data(file_name):
        """Return the latest data for the reference file with the given name."""
        file_model = session.query(FileModel). \
            filter(FileModel.is_reference == True). \
            filter(FileModel.name == file_name).first()
        if not file_model:
            raise ApiError("file_not_found", "There is no reference file with the name '%s'" % file_name)
        return FileService.get_file_data(file_model.id, file_model)

    @staticmethod
    def get_workflow_file_data(workflow, file_name):
        """Given a SPIFF Workflow Model, tracks down a file with the given name in
        the database and returns its data."""
        workflow_spec_model = FileService.find_spec_model_in_db(workflow)

        if workflow_spec_model is None:
            raise ApiError(code="workflow_model_error",
                           message="Something is wrong. I can't find the workflow you are using.")

        file_data_model = session.query(FileDataModel) \
            .join(FileModel) \
            .filter(FileModel.name == file_name) \
            .filter(FileModel.workflow_spec_id == workflow_spec_model.id).first()

        if file_data_model is None:
            raise ApiError(code="file_missing",
                           message="Cannot find a file called '%s' within workflow specification '%s'"
                                   % (file_name, workflow_spec_model.id))

        return file_data_model

    @staticmethod
    def find_spec_model_in_db(workflow):
        """Search for the workflow spec model that owns this workflow."""
        # When the workflow spec model is created, we record the primary process id,
        # then we can look it up. As there is the potential for sub-workflows, we
        # may need to travel up to locate the primary process.
        spec = workflow.spec
        workflow_model = session.query(WorkflowSpecModel).join(FileModel). \
            filter(FileModel.primary_process_id == spec.name).first()
        if workflow_model is None and workflow != workflow.outer_workflow:
            return FileService.find_spec_model_in_db(workflow.outer_workflow)

        return workflow_model

    @staticmethod
    def delete_file(file_id):
        """Delete a file, its data versions, and any lookup tables built from them."""
        data_models = session.query(FileDataModel).filter_by(file_model_id=file_id).all()
        for dm in data_models:
            lookup_files = session.query(LookupFileModel).filter_by(file_data_model_id=dm.id).all()
            for lf in lookup_files:
                # Remove lookup rows before the lookup file they belong to.
                session.query(LookupDataModel).filter_by(lookup_file_model_id=lf.id).delete()
                session.query(LookupFileModel).filter_by(id=lf.id).delete()
        session.query(FileDataModel).filter_by(file_model_id=file_id).delete()
        session.query(FileModel).filter_by(id=file_id).delete()
        session.commit()