Refactoring Reference files to use the lookup table, rather than parsing the results directly out of the spreadsheet or attempting to cache them.

Adding a DocumentService to clean up the FileService and keep document handling well separated, as it seems likely to be pulled out into its own service in the future; there is now a documents API file as well, for the same reason.
Some other minor changes just fix whitespace so our code lints cleanly.
I removed _create_study_workflow_approvals from the base test, as we don't use approvals like this anymore.
This commit is contained in:
Dan 2021-07-06 13:10:20 -04:00
parent fafa79a07d
commit 1b1a994360
20 changed files with 243 additions and 241 deletions
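
At a high level, the refactor replaces per-request spreadsheet parsing with a query against the indexed lookup table. A minimal sketch of the new access pattern, condensed from the DocumentService and StudyService code in the diff below (the helper name is hypothetical):

from crc.services.file_service import FileService
from crc.services.lookup_service import LookupService

def get_reference_dictionary(file_name, value_column, label_column):
    # Resolve the reference file to its stored data record...
    file_data = FileService.get_reference_file_data(file_name)
    # ...then fetch (or lazily build) the indexed lookup model for it.
    lookup_model = LookupService.get_lookup_model_for_file_data(
        file_data, value_column, label_column)
    # Each dependency row carries the full spreadsheet row as JSON.
    return {row.value: row.data for row in lookup_model.dependencies}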

View File

@ -55,7 +55,7 @@ def process_waiting_tasks():
with app.app_context():
WorkflowService.do_waiting()
scheduler.add_job(process_waiting_tasks,'interval',minutes=5)
scheduler.add_job(process_waiting_tasks,'interval',minutes=1)
scheduler.start()

View File

@ -82,7 +82,7 @@ paths:
schema :
type : integer
get:
operationId: crc.api.file.get_document_directory
operationId: crc.api.document.get_document_directory
summary: Returns a directory of all files for study in a nested structure
tags:
- Document Categories

crc/api/document.py Normal file
View File

@ -0,0 +1,18 @@
from crc.models.api_models import DocumentDirectorySchema
from crc.models.file import File
from crc.services.document_service import DocumentService
from crc.services.file_service import FileService
from crc.services.lookup_service import LookupService
def get_document_directory(study_id, workflow_id=None):
"""
return a nested list of files arranged according to the category hierarchy
defined in the doc dictionary
"""
file_models = FileService.get_files_for_study(study_id=study_id)
doc_dict = DocumentService.get_dictionary()
files = (File.from_models(model, FileService.get_file_data(model.id), doc_dict) for model in file_models)
directory = DocumentService.get_directory(doc_dict, files, workflow_id)
return DocumentDirectorySchema(many=True).dump(directory)
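
For context, a hedged sketch of exercising the new operation from a test, in the style of this project's API tests (the URL path is an assumption based on the api.yml hunk above; self.app and logged_in_headers come from BaseTest):

import json

rv = self.app.get('/v1.0/document_directory/%i' % study.id,
                  content_type="application/json",
                  headers=self.logged_in_headers())
self.assert_success(rv)
directory = json.loads(rv.get_data(as_text=True))  # nested category levels, files at the leaves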

View File

@ -7,71 +7,15 @@ from flask import send_file
from crc import session
from crc.api.common import ApiError
from crc.api.user import verify_token
from crc.models.api_models import DocumentDirectory, DocumentDirectorySchema
from crc.models.file import FileSchema, FileModel, File, FileModelSchema, FileDataModel, FileType
from crc.models.workflow import WorkflowSpecModel
from crc.services.document_service import DocumentService
from crc.services.file_service import FileService
def ensure_exists(output, categories, expanded):
"""
This is a recursive function, it expects a list of
levels with a file object at the end (kinda like duck,duck,duck,goose)
for each level, it makes sure that level is already in the structure and if it is not
it will add it
function terminates upon getting an entry that is a file object ( or really anything but string)
"""
current_item = categories[0]
found = False
if isinstance(current_item, str):
for item in output:
if item.level == current_item:
found = True
item.filecount = item.filecount + 1
item.expanded = expanded | item.expanded
ensure_exists(item.children, categories[1:], expanded)
if not found:
new_level = DocumentDirectory(level=current_item)
new_level.filecount = 1
new_level.expanded = expanded
output.append(new_level)
ensure_exists(new_level.children, categories[1:], expanded)
else:
new_level = DocumentDirectory(file=current_item)
new_level.expanded = expanded
output.append(new_level)
def get_document_directory(study_id, workflow_id=None):
"""
return a nested list of files arranged according to the category hierarchy
defined in the doc dictionary
"""
output = []
doc_dict = FileService.get_doc_dictionary()
file_models = FileService.get_files_for_study(study_id=study_id)
files = (to_file_api(model) for model in file_models)
for file in files:
if file.irb_doc_code in doc_dict:
doc_code = doc_dict[file.irb_doc_code]
else:
doc_code = {'category1': "Unknown", 'category2': '', 'category3': ''}
if workflow_id:
expand = file.workflow_id == int(workflow_id)
else:
expand = False
print(expand)
categories = [x for x in [doc_code['category1'],doc_code['category2'],doc_code['category3'],file] if x != '']
ensure_exists(output, categories, expanded=expand)
return DocumentDirectorySchema(many=True).dump(output)
def to_file_api(file_model):
"""Converts a FileModel object to something we can return via the api"""
return File.from_models(file_model, FileService.get_file_data(file_model.id),
FileService.get_doc_dictionary())
DocumentService.get_dictionary())
def get_files(workflow_spec_id=None, workflow_id=None, form_field_key=None,study_id=None):

View File

@ -1,15 +1,14 @@
import enum
from typing import cast
from marshmallow import INCLUDE, EXCLUDE, fields, Schema
from marshmallow import INCLUDE, EXCLUDE, Schema
from marshmallow_enum import EnumField
from marshmallow_sqlalchemy import SQLAlchemyAutoSchema
from sqlalchemy import func, Index
from sqlalchemy.dialects.postgresql import UUID
from sqlalchemy.orm import deferred, relationship
from crc.models.data_store import DataStoreModel # this is needed by the relationship
from crc import db, ma
from crc.models.data_store import DataStoreModel
class FileType(enum.Enum):
@ -43,7 +42,7 @@ CONTENT_TYPES = {
"docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"gif": "image/gif",
"jpg": "image/jpeg",
"md" : "text/plain",
"md": "text/plain",
"pdf": "application/pdf",
"png": "image/png",
"ppt": "application/vnd.ms-powerpoint",
@ -71,7 +70,6 @@ class FileDataModel(db.Model):
file_model = db.relationship("FileModel", foreign_keys=[file_model_id])
class FileModel(db.Model):
__tablename__ = 'file'
id = db.Column(db.Integer, primary_key=True)
@ -79,18 +77,19 @@ class FileModel(db.Model):
type = db.Column(db.Enum(FileType))
is_status = db.Column(db.Boolean)
content_type = db.Column(db.String)
is_reference = db.Column(db.Boolean, nullable=False, default=False) # A global reference file.
primary = db.Column(db.Boolean, nullable=False, default=False) # Is this the primary BPMN in a workflow?
primary_process_id = db.Column(db.String, nullable=True) # An id in the xml of BPMN documents, critical for primary BPMN.
is_reference = db.Column(db.Boolean, nullable=False, default=False) # A global reference file.
primary = db.Column(db.Boolean, nullable=False, default=False) # Is this the primary BPMN in a workflow?
primary_process_id = db.Column(db.String, nullable=True) # An id in the xml of BPMN documents, for primary BPMN.
workflow_spec_id = db.Column(db.String, db.ForeignKey('workflow_spec.id'), nullable=True)
workflow_id = db.Column(db.Integer, db.ForeignKey('workflow.id'), nullable=True)
irb_doc_code = db.Column(db.String, nullable=True) # Code reference to the irb_documents.xlsx reference file.
irb_doc_code = db.Column(db.String, nullable=True) # Code reference to the irb_documents.xlsx reference file.
# A request was made to delete the file, but we can't because there are
# active approvals or running workflows that depend on it. So we archive
# it instead and hide it in the interface.
is_review = db.Column(db.Boolean, default=False, nullable=True)
archived = db.Column(db.Boolean, default=False, nullable=False)
data_stores = relationship("DataStoreModel", cascade="all,delete", backref="file")
data_stores = relationship(DataStoreModel, cascade="all,delete", backref="file")
class File(object):
@classmethod
@ -107,7 +106,7 @@ class File(object):
instance.workflow_id = model.workflow_id
instance.irb_doc_code = model.irb_doc_code
instance.type = model.type
if model.irb_doc_code and model.irb_doc_code in doc_dictionary:
if model.irb_doc_code and model.irb_doc_code in doc_dictionary:
instance.document = doc_dictionary[model.irb_doc_code]
else:
instance.document = {}
@ -147,7 +146,6 @@ class FileSchema(Schema):
type = EnumField(FileType)
class LookupFileModel(db.Model):
"""Gives us a quick way to tell what kind of lookup is set on a form field.
Connected to the file data model, so that if a new version of the same file is
@ -158,8 +156,10 @@ class LookupFileModel(db.Model):
task_spec_id = db.Column(db.String)
field_id = db.Column(db.String)
is_ldap = db.Column(db.Boolean) # Allows us to run an ldap query instead of a db lookup.
is_reference = db.Column(db.Boolean) # For lookup models that are globally referenced.
file_data_model_id = db.Column(db.Integer, db.ForeignKey('file_data.id'))
dependencies = db.relationship("LookupDataModel", lazy="select", backref="lookup_file_model", cascade="all, delete, delete-orphan")
dependencies = db.relationship("LookupDataModel", lazy="select", backref="lookup_file_model",
cascade="all, delete, delete-orphan")
class LookupDataModel(db.Model):
@ -169,7 +169,7 @@ class LookupDataModel(db.Model):
value = db.Column(db.String)
label = db.Column(db.String)
# In the future, we might allow adding an additional "search" column if we want to search things not in label.
data = db.Column(db.JSON) # all data for the row is stored in a json structure here, but not searched presently.
data = db.Column(db.JSON) # all data for the row is stored in a json structure here, but not searched presently.
# Assure there is a searchable index on the label column, so we can get fast results back.
# query with:
@ -192,7 +192,7 @@ class LookupDataSchema(SQLAlchemyAutoSchema):
load_instance = True
include_relationships = False
include_fk = False # Includes foreign keys
exclude = ['id'] # Do not include the id field, it should never be used via the API.
exclude = ['id'] # Do not include the id field, it should never be used via the API.
class SimpleFileSchema(ma.Schema):
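
The comment in this hunk about a searchable index on the label column refers to Postgres full-text search. A hedged sketch of the kind of query that index supports, written with SQLAlchemy in the style of LookupService (the foreign-key column name is an assumption):

from sqlalchemy import desc, func

from crc import db
from crc.models.file import LookupDataModel

def search_labels(lookup_file_model_id, query_text):
    term = func.plainto_tsquery('simple', query_text)
    vector = func.to_tsvector('simple', LookupDataModel.label)
    return (db.session.query(LookupDataModel)
            .filter(LookupDataModel.lookup_file_model_id == lookup_file_model_id)  # FK name assumed
            .filter(vector.op('@@')(term))
            .order_by(desc(func.ts_rank(vector, term)))
            .limit(10)
            .all())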

View File

@ -2,6 +2,7 @@ from crc import session
from crc.api.common import ApiError
from crc.models.file import FileModel
from crc.scripts.script import Script
from crc.services.document_service import DocumentService
from crc.services.file_service import FileService
@ -9,7 +10,7 @@ class DeleteFile(Script):
@staticmethod
def process_document_deletion(doc_code, workflow_id, task):
if FileService.is_allowed_document(doc_code):
if DocumentService.is_allowed_document(doc_code):
result = session.query(FileModel).filter(
FileModel.workflow_id == workflow_id, FileModel.irb_doc_code == doc_code).all()
if isinstance(result, list) and len(result) > 0 and isinstance(result[0], FileModel):

View File

@ -3,6 +3,7 @@ from flask import g
from crc.api.common import ApiError
from crc.services.data_store_service import DataStoreBase
from crc.scripts.script import Script
from crc.services.document_service import DocumentService
from crc.services.file_service import FileService
@ -17,17 +18,22 @@ class FileDataSet(Script, DataStoreBase):
del(kwargs['file_id'])
return True
def validate_kw_args(self,**kwargs):
if kwargs.get('key',None) is None:
def validate_kw_args(self, **kwargs):
if kwargs.get('key', None) is None:
raise ApiError(code="missing_argument",
message=f"The 'file_data_get' script requires a keyword argument of 'key'")
message=f"The 'file_data_get' script requires a keyword argument of 'key'")
if kwargs.get('file_id', None) is None:
raise ApiError(code="missing_argument",
message=f"The 'file_data_get' script requires a keyword argument of 'file_id'")
if kwargs.get('value', None) is None:
raise ApiError(code="missing_argument",
message=f"The 'file_data_get' script requires a keyword argument of 'value'")
if kwargs.get('file_id',None) is None:
raise ApiError(code="missing_argument",
message=f"The 'file_data_get' script requires a keyword argument of 'file_id'")
if kwargs.get('value',None) is None:
raise ApiError(code="missing_argument",
message=f"The 'file_data_get' script requires a keyword argument of 'value'")
if kwargs['key'] == 'irb_code' and not DocumentService.is_allowed_document(kwargs.get('value')):
raise ApiError("invalid_form_field_key",
"When setting an irb_code, the form field id must match a known document in the "
"irb_docunents.xslx reference file. This code is not found in that file '%s'" %
kwargs.get('value'))
return True
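
The validation above enforces a three-keyword contract (key, file_id, value) and, for irb_code keys, membership in the document dictionary. An illustrative call that would pass it (the file id is hypothetical; 'UVACompl_PRCAppr' is a doc code that appears in the tests below):

from crc.scripts.file_data_set import FileDataSet

FileDataSet().validate_kw_args(key='irb_code',
                               file_id=42,  # hypothetical file id
                               value='UVACompl_PRCAppr')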

View File

@ -10,6 +10,7 @@ from crc.models.protocol_builder import ProtocolBuilderInvestigatorType
from crc.models.study import StudyModel, StudySchema
from crc.api import workflow as workflow_api
from crc.scripts.script import Script
from crc.services.document_service import DocumentService
from crc.services.file_service import FileService
from crc.services.protocol_builder import ProtocolBuilderService
from crc.services.study_service import StudyService
@ -168,8 +169,8 @@ Please note this is just a few examples, ALL known document types are returned i
"""For validation only, pretend no results come back from pb"""
self.check_args(args, 2)
# Assure the reference file exists (a bit hacky, but we want to raise this error early and cleanly).
FileService.get_reference_file_data(FileService.DOCUMENT_LIST)
FileService.get_reference_file_data(FileService.INVESTIGATOR_LIST)
FileService.get_reference_file_data(DocumentService.DOCUMENT_LIST)
FileService.get_reference_file_data(StudyService.INVESTIGATOR_LIST)
# we call the real do_task so we can
# seed workflow validations with settings from studies in PB Mock
# in order to test multiple paths thru the workflow

View File

@ -0,0 +1,92 @@
from crc.api.common import ApiError
from crc.models.api_models import DocumentDirectory
from crc.services.file_service import FileService
from crc.services.lookup_service import LookupService
class DocumentService(object):
DOCUMENT_LIST = "irb_documents.xlsx"
@staticmethod
def is_allowed_document(code):
doc_dict = DocumentService.get_dictionary()
return code in doc_dict
@staticmethod
def verify_doc_dictionary(dd):
"""
We currently get structured information from an XLS file. If someone accidentally
changes a header we will have problems later, so we verify here that we have
the headers we need.
"""
required_fields = ['category1', 'category2', 'category3', 'description']
# we only need to check the first item, as all of the keys should be the same
key = list(dd.keys())[0]
for field in required_fields:
if field not in dd[key].keys():
raise ApiError(code="Invalid document list %s" % DocumentService.DOCUMENT_LIST,
message='Please check the headers in %s' % DocumentService.DOCUMENT_LIST)
@staticmethod
def get_dictionary():
"""Returns a dictionary of document details keyed on the doc_code."""
file_data = FileService.get_reference_file_data(DocumentService.DOCUMENT_LIST)
lookup_model = LookupService.get_lookup_model_for_file_data(file_data, 'code', 'description')
doc_dict = {}
for lookup_data in lookup_model.dependencies:
doc_dict[lookup_data.value] = lookup_data.data
return doc_dict
@staticmethod
def get_directory(doc_dict, files, workflow_id):
"""Returns a list of directories, hierarchically nested by category, with files at the deepest level.
Empty directories are not included."""
directory = []
if files:
for file in files:
if file.irb_doc_code in doc_dict:
doc_code = doc_dict[file.irb_doc_code]
else:
doc_code = {'category1': "Unknown", 'category2': None, 'category3': None}
if workflow_id:
expand = file.workflow_id == int(workflow_id)
else:
expand = False
categories = [x for x in [doc_code['category1'], doc_code['category2'], doc_code['category3'], file] if x]
DocumentService.ensure_exists(directory, categories, expanded=expand)
return directory
@staticmethod
def ensure_exists(output, categories, expanded):
"""
This is a recursive function; it expects a list of levels with a file object
at the end (like duck, duck, duck, goose). For each level it makes sure that
level is already in the structure, adding it if it is not. The recursion
terminates upon reaching an entry that is a file object (really, anything
that is not a string).
"""
current_item = categories[0]
found = False
if isinstance(current_item, str):
for item in output:
if item.level == current_item:
found = True
item.filecount = item.filecount + 1
item.expanded = expanded | item.expanded
DocumentService.ensure_exists(item.children, categories[1:], expanded)
if not found:
new_level = DocumentDirectory(level=current_item)
new_level.filecount = 1
new_level.expanded = expanded
output.append(new_level)
DocumentService.ensure_exists(new_level.children, categories[1:], expanded)
else:
new_level = DocumentDirectory(file=current_item)
new_level.expanded = expanded
output.append(new_level)
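
To make the recursion concrete, a hedged sketch of the structure get_directory produces (doc_dict and files would come from get_dictionary() and FileService, as in crc/api/document.py above):

from crc.services.document_service import DocumentService

doc_dict = DocumentService.get_dictionary()
files = []  # e.g. File objects built via File.from_models(...)
directory = DocumentService.get_directory(doc_dict, files, workflow_id=None)
# For two files coded 'UVACompl_PRCAppr' the result would nest roughly as:
# [DocumentDirectory(level='UVA Compliance', filecount=2, children=[
#      DocumentDirectory(level='PRC Approval', filecount=2, children=[
#          DocumentDirectory(file=...), DocumentDirectory(file=...)])])]
# Each string category becomes (or increments) a level; the non-string File
# entry ends the recursion as a leaf.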

View File

@ -10,8 +10,6 @@ from lxml import etree
from SpiffWorkflow.bpmn.parser.ValidationException import ValidationException
from lxml.etree import XMLSyntaxError
from pandas import ExcelFile
from pandas._libs.missing import NA
from sqlalchemy import desc
from sqlalchemy.exc import IntegrityError
@ -38,34 +36,6 @@ def camel_to_snake(camel):
class FileService(object):
"""Provides consistent management and rules for storing, retrieving and processing files."""
DOCUMENT_LIST = "irb_documents.xlsx"
INVESTIGATOR_LIST = "investigators.xlsx"
__doc_dictionary = None
@staticmethod
def verify_doc_dictionary(dd):
"""
We are currently getting structured information from an XLS file, if someone accidentally
changes a header we will have problems later, so we will verify we have the headers we need
here
"""
required_fields = ['category1','category2','category3','description']
# we only need to check the first item, as all of the keys should be the same
key = list(dd.keys())[0]
for field in required_fields:
if field not in dd[key].keys():
raise ApiError(code="Invalid document list %s"%FileService.DOCUMENT_LIST,
message='Please check the headers in %s'%FileService.DOCUMENT_LIST)
@staticmethod
def get_doc_dictionary():
if not FileService.__doc_dictionary:
FileService.__doc_dictionary = FileService.get_reference_data(FileService.DOCUMENT_LIST, 'code', ['id'])
FileService.verify_doc_dictionary(FileService.__doc_dictionary)
return FileService.__doc_dictionary
@staticmethod
def add_workflow_spec_file(workflow_spec: WorkflowSpecModel,
@ -88,10 +58,7 @@ class FileService(object):
return FileService.update_file(file_model, binary_data, content_type)
@staticmethod
def is_allowed_document(code):
doc_dict = FileService.get_doc_dictionary()
return code in doc_dict
@staticmethod
@cache
@ -104,12 +71,6 @@ class FileService(object):
def update_irb_code(file_id, irb_doc_code):
"""Create a new file and associate it with the workflow
Please note that the irb_doc_code MUST be a known file in the irb_documents.xlsx reference document."""
if not FileService.is_allowed_document(irb_doc_code):
raise ApiError("invalid_form_field_key",
"When uploading files, the form field id must match a known document in the "
"irb_docunents.xslx reference file. This code is not found in that file '%s'" % irb_doc_code)
""" """
file_model = session.query(FileModel)\
.filter(FileModel.id == file_id).first()
if file_model is None:
@ -137,28 +98,6 @@ class FileService(object):
)
return FileService.update_file(file_model, binary_data, content_type)
@staticmethod
def get_reference_data(reference_file_name, index_column, int_columns=[]):
""" Opens a reference file (assumes that it is xls file) and returns the data as a
dictionary, each row keyed on the given index_column name. If there are columns
that should be represented as integers, pass these as an array of int_columns, lest
you get '1.0' rather than '1'
fixme: This is stupid stupid slow. Place it in the database and just check if it is up to date."""
data_model = FileService.get_reference_file_data(reference_file_name)
xls = ExcelFile(data_model.data, engine='openpyxl')
df = xls.parse(xls.sheet_names[0])
df = df.convert_dtypes()
df = pd.DataFrame(df).dropna(how='all') # Drop null rows
df = pd.DataFrame(df).replace({NA: None}) # replace NA with None.
for c in int_columns:
df[c] = df[c].fillna(0)
df = df.astype({c: 'Int64'})
df = df.fillna('')
df = df.applymap(str)
df = df.set_index(index_column)
return json.loads(df.to_json(orient='index'))
@staticmethod
def get_workflow_files(workflow_id):
"""Returns all the file models associated with a running workflow."""

View File

@ -12,7 +12,7 @@ from sqlalchemy.sql.functions import GenericFunction
from crc import db
from crc.api.common import ApiError
from crc.models.api_models import Task
from crc.models.file import FileDataModel, LookupFileModel, LookupDataModel
from crc.models.file import FileModel, FileDataModel, LookupFileModel, LookupDataModel
from crc.models.workflow import WorkflowModel, WorkflowSpecDependencyFile
from crc.services.file_service import FileService
from crc.services.ldap_service import LdapService
@ -25,11 +25,14 @@ class TSRank(GenericFunction):
class LookupService(object):
"""Provides tools for doing lookups for auto-complete fields.
This can currently take two forms:
"""Provides tools for doing lookups for auto-complete fields, and rapid access to any
uploaded spreadsheets.
This can currently take three forms:
1) Lookup from spreadsheet data associated with a workflow specification.
in which case we store the spreadsheet data in a lookup table with full
text indexing enabled, and run searches against that table.
2) Lookup from spreadsheet data associated with a specific file. This allows us
to get a lookup model for a specific file object, such as a reference file.
3) Lookup from LDAP records. In which case we call out to an external service
to pull back detailed records and return them.
@ -44,6 +47,14 @@ class LookupService(object):
workflow = db.session.query(WorkflowModel).filter(WorkflowModel.id == workflow_id).first()
return LookupService.__get_lookup_model(workflow, spiff_task.task_spec.name, field.id)
@staticmethod
def get_lookup_model_for_file_data(file_data: FileDataModel, value_column, label_column):
lookup_model = db.session.query(LookupFileModel).filter(LookupFileModel.file_data_model_id == file_data.id).first()
if not lookup_model:
logging.warning("!!!! Making a very expensive call to update the lookup model.")
lookup_model = LookupService.build_lookup_table(file_data, value_column, label_column)
return lookup_model
@staticmethod
def __get_lookup_model(workflow, task_spec_id, field_id):
lookup_model = db.session.query(LookupFileModel) \
@ -139,7 +150,8 @@ class LookupService(object):
return lookup_model
@staticmethod
def build_lookup_table(data_model: FileDataModel, value_column, label_column, workflow_spec_id, task_spec_id, field_id):
def build_lookup_table(data_model: FileDataModel, value_column, label_column,
workflow_spec_id=None, task_spec_id=None, field_id=None):
""" In some cases the lookup table can be very large. This method will add all values to the database
in a way that can be searched and returned via an api call - rather than sending the full set of
options along with the form. It will only open the file and process the options if something has
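
A hedged sketch of the caching behavior this enables: the first call after a reference file changes pays the expensive build_lookup_table() cost, while later calls find the existing LookupFileModel keyed on file_data_model_id (calls mirrored from DocumentService.get_dictionary above):

from crc.services.document_service import DocumentService
from crc.services.file_service import FileService
from crc.services.lookup_service import LookupService

file_data = FileService.get_reference_file_data(DocumentService.DOCUMENT_LIST)
lookup = LookupService.get_lookup_model_for_file_data(file_data, 'code', 'description')
rows = {d.value: d.data for d in lookup.dependencies}  # one entry per spreadsheet row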

View File

@ -22,13 +22,17 @@ from crc.models.study import StudyModel, Study, StudyStatus, Category, WorkflowM
from crc.models.task_event import TaskEventModel, TaskEvent
from crc.models.workflow import WorkflowSpecCategoryModel, WorkflowModel, WorkflowSpecModel, WorkflowState, \
WorkflowStatus, WorkflowSpecDependencyFile
from crc.services.document_service import DocumentService
from crc.services.file_service import FileService
from crc.services.ldap_service import LdapService
from crc.services.lookup_service import LookupService
from crc.services.protocol_builder import ProtocolBuilderService
from crc.services.workflow_processor import WorkflowProcessor
class StudyService(object):
"""Provides common tools for working with a Study"""
"""Provides consistent management and rules for storing, retrieving and processing files."""
INVESTIGATOR_LIST = "investigators.xlsx"
@staticmethod
def get_studies_for_user(user):
@ -77,7 +81,7 @@ class StudyService(object):
workflow_metas = StudyService._get_workflow_metas(study_id)
files = FileService.get_files_for_study(study.id)
files = (File.from_models(model, FileService.get_file_data(model.id),
FileService.get_doc_dictionary()) for model in files)
DocumentService.get_dictionary()) for model in files)
study.files = list(files)
# Calling this line repeatedly is very very slow. It creates the
# master spec and runs it. Don't execute this for Abandoned studies, as
@ -265,14 +269,14 @@ class StudyService(object):
# Loop through all known document types, get the counts for those files,
# and use pb_docs to mark those as required.
doc_dictionary = FileService.get_reference_data(FileService.DOCUMENT_LIST, 'code', ['id'])
doc_dictionary = DocumentService.get_dictionary()
documents = {}
for code, doc in doc_dictionary.items():
if ProtocolBuilderService.is_enabled():
doc['required'] = False
if ProtocolBuilderService.is_enabled() and doc['id']:
pb_data = next((item for item in pb_docs if int(item['AUXDOCID']) == int(doc['id'])), None)
doc['required'] = False
if pb_data:
doc['required'] = True
@ -282,7 +286,7 @@ class StudyService(object):
# Make a display name out of categories
name_list = []
for cat_key in ['category1', 'category2', 'category3']:
if doc[cat_key] not in ['', 'NULL']:
if doc[cat_key] not in ['', 'NULL', None]:
name_list.append(doc[cat_key])
doc['display_name'] = ' / '.join(name_list)
@ -319,12 +323,22 @@ class StudyService(object):
documents[code] = doc
return Box(documents)
@staticmethod
def get_investigator_dictionary():
"""Returns a dictionary of document details keyed on the doc_code."""
file_data = FileService.get_reference_file_data(StudyService.INVESTIGATOR_LIST)
lookup_model = LookupService.get_lookup_model_for_file_data(file_data, 'code', 'label')
doc_dict = {}
for lookup_data in lookup_model.dependencies:
doc_dict[lookup_data.value] = lookup_data.data
return doc_dict
@staticmethod
def get_investigators(study_id, all=False):
"""Convert array of investigators from protocol builder into a dictionary keyed on the type. """
# Loop through all known investigator types as set in the reference file
inv_dictionary = FileService.get_reference_data(FileService.INVESTIGATOR_LIST, 'code')
inv_dictionary = StudyService.get_investigator_dictionary()
# Get PB required docs
pb_investigators = ProtocolBuilderService.get_investigators(study_id=study_id)
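
For reference, a hedged example of the new investigator lookup; keys come from the 'code' column of investigators.xlsx and each value is the full spreadsheet row as a dict (the 'label' key follows from the lookup call above):

from crc.services.study_service import StudyService

inv_dict = StudyService.get_investigator_dictionary()
for code, row in inv_dict.items():
    print(code, row.get('label'))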

View File

@ -30,6 +30,7 @@ from crc.models.study import StudyModel
from crc.models.task_event import TaskEventModel
from crc.models.user import UserModel, UserModelSchema
from crc.models.workflow import WorkflowModel, WorkflowStatus, WorkflowSpecModel
from crc.services.document_service import DocumentService
from crc.services.file_service import FileService
from crc.services.lookup_service import LookupService
from crc.services.study_service import StudyService
@ -97,12 +98,15 @@ class WorkflowService(object):
def do_waiting():
records = db.session.query(WorkflowModel).filter(WorkflowModel.status==WorkflowStatus.waiting).all()
for workflow_model in records:
print('processing workflow %d'%workflow_model.id)
processor = WorkflowProcessor(workflow_model)
processor.bpmn_workflow.refresh_waiting_tasks()
processor.bpmn_workflow.do_engine_steps()
processor.save()
# fixme: Try catch with a very explicit error about the study, workflow and task that failed.
try:
app.logger.info('Processing workflow %s' % workflow_model.id)
processor = WorkflowProcessor(workflow_model)
processor.bpmn_workflow.refresh_waiting_tasks()
processor.bpmn_workflow.do_engine_steps()
processor.save()
except Exception:
app.logger.error('Failed to process workflow %s' % workflow_model.id, exc_info=True)
@staticmethod
@timeit
@ -424,7 +428,7 @@ class WorkflowService(object):
doc_code = WorkflowService.evaluate_property('doc_code', field, task)
file_model = FileModel(name="test.png",
irb_doc_code = field.id)
doc_dict = FileService.get_doc_dictionary()
doc_dict = DocumentService.get_dictionary()
file = File.from_models(file_model, None, doc_dict)
return FileSchema().dump(file)
elif field.type == 'files':

View File

@ -7,7 +7,9 @@ from crc.models.file import CONTENT_TYPES
from crc.models.ldap import LdapModel
from crc.models.user import UserModel
from crc.models.workflow import WorkflowSpecModel, WorkflowSpecCategoryModel
from crc.services.document_service import DocumentService
from crc.services.file_service import FileService
from crc.services.study_service import StudyService
class ExampleDataLoader:
@ -315,14 +317,14 @@ class ExampleDataLoader:
def load_reference_documents(self):
file_path = os.path.join(app.root_path, 'static', 'reference', 'irb_documents.xlsx')
file = open(file_path, "rb")
FileService.add_reference_file(FileService.DOCUMENT_LIST,
FileService.add_reference_file(DocumentService.DOCUMENT_LIST,
binary_data=file.read(),
content_type=CONTENT_TYPES['xls'])
file.close()
file_path = os.path.join(app.root_path, 'static', 'reference', 'investigators.xlsx')
file = open(file_path, "rb")
FileService.add_reference_file(FileService.INVESTIGATOR_LIST,
FileService.add_reference_file(StudyService.INVESTIGATOR_LIST,
binary_data=file.read(),
content_type=CONTENT_TYPES['xls'])
file.close()

View File

@ -2,6 +2,7 @@
# IMPORTANT - Environment must be loaded before app, models, etc....
import os
os.environ["TESTING"] = "true"
import json
@ -23,6 +24,7 @@ from crc.services.file_service import FileService
from crc.services.study_service import StudyService
from crc.services.user_service import UserService
from crc.services.workflow_service import WorkflowService
from crc.services.document_service import DocumentService
from example_data import ExampleDataLoader
# UNCOMMENT THIS FOR DEBUGGING SQL ALCHEMY QUERIES
@ -138,8 +140,7 @@ class BaseTest(unittest.TestCase):
delete everything that matters in the local database - this is used to
test ground zero copy of workflow specs.
"""
session.execute("delete from workflow; delete from file_data; delete from file; delete from workflow_spec;")
session.commit()
ExampleDataLoader.clean_db()
def load_example_data(self, use_crc_data=False, use_rrt_data=False):
"""use_crc_data will cause this to load the mammoth collection of documents
@ -282,28 +283,6 @@ class BaseTest(unittest.TestCase):
session.commit()
return study
def _create_study_workflow_approvals(self, user_uid, title, primary_investigator_id, approver_uids, statuses,
workflow_spec_name="random_fact"):
study = self.create_study(uid=user_uid, title=title, primary_investigator_id=primary_investigator_id)
workflow = self.create_workflow(workflow_name=workflow_spec_name, study=study)
approvals = []
for i in range(len(approver_uids)):
approvals.append(self.create_approval(
study=study,
workflow=workflow,
approver_uid=approver_uids[i],
status=statuses[i],
version=1
))
full_study = {
'study': study,
'workflow': workflow,
'approvals': approvals,
}
return full_study
def create_workflow(self, workflow_name, display_name=None, study=None, category_id=None, as_user="dhf8r"):
session.flush()
@ -320,30 +299,11 @@ class BaseTest(unittest.TestCase):
def create_reference_document(self):
file_path = os.path.join(app.root_path, 'static', 'reference', 'irb_documents.xlsx')
file = open(file_path, "rb")
FileService.add_reference_file(FileService.DOCUMENT_LIST,
FileService.add_reference_file(DocumentService.DOCUMENT_LIST,
binary_data=file.read(),
content_type=CONTENT_TYPES['xls'])
content_type=CONTENT_TYPES['xlsx'])
file.close()
def create_approval(
self,
study=None,
workflow=None,
approver_uid=None,
status=None,
version=None,
):
study = study or self.create_study()
workflow = workflow or self.create_workflow()
approver_uid = approver_uid or self.test_uid
status = status or ApprovalStatus.PENDING.value
version = version or 1
approval = ApprovalModel(study=study, workflow=workflow, approver_uid=approver_uid, status=status,
version=version)
session.add(approval)
session.commit()
return approval
def get_workflow_common(self, url, user):
rv = self.app.get(url,
headers=self.logged_in_headers(user),

View File

@ -16,6 +16,12 @@
OGC will upload the Non-Funded Executed Agreement after it has been negotiated by OSP contract negotiator.</bpmn:documentation>
<bpmn:extensionElements>
<camunda:formData>
<camunda:formField id="Date" label="Version Date" type="date">
<camunda:properties>
<camunda:property id="group" value="PCRApproval" />
<camunda:property id="file_data" value="Some_File" />
</camunda:properties>
</camunda:formField>
<camunda:formField id="file_type" type="enum" defaultValue="AD_CoCApp">
<camunda:value id="AD_CoCApp" name="Ancillary Documents / Case Report Form" />
<camunda:value id="AD_CoCAppr" name="Ancillary Documents / CoC Approval" />
@ -32,12 +38,6 @@ OGC will upload the Non-Funded Executed Agreement after it has been negotiated b
<camunda:property id="file_data" value="Some_File" />
</camunda:properties>
</camunda:formField>
<camunda:formField id="Date" label="Version Date" type="date">
<camunda:properties>
<camunda:property id="group" value="PCRApproval" />
<camunda:property id="file_data" value="Some_File" />
</camunda:properties>
</camunda:formField>
</camunda:formData>
</bpmn:extensionElements>
<bpmn:incoming>SequenceFlow_0ea9hvd</bpmn:incoming>
@ -67,4 +67,4 @@ OGC will upload the Non-Funded Executed Agreement after it has been negotiated b
</bpmndi:BPMNShape>
</bpmndi:BPMNPlane>
</bpmndi:BPMNDiagram>
</bpmn:definitions>
</bpmn:definitions>

View File

@ -1,14 +1,16 @@
import io
import json
import os
from tests.base_test import BaseTest
from crc import session, db
from crc import session, db, app
from crc.models.file import FileModel, FileType, FileSchema, FileModelSchema
from crc.models.workflow import WorkflowSpecModel
from crc.services.file_service import FileService
from crc.services.workflow_processor import WorkflowProcessor
from crc.models.data_store import DataStoreModel
from crc.services.document_service import DocumentService
from example_data import ExampleDataLoader
@ -110,20 +112,23 @@ class TestFilesApi(BaseTest):
self.assertEqual(0, len(json.loads(rv.get_data(as_text=True))))
def test_set_reference_file(self):
file_name = "irb_document_types.xls"
data = {'file': (io.BytesIO(b"abcdef"), "does_not_matter.xls")}
file_name = "irb_documents.xlsx"
filepath = os.path.join(app.root_path, 'static', 'reference', 'irb_documents.xlsx')
with open(filepath, 'rb') as myfile:
file_data = myfile.read()
data = {'file': (io.BytesIO(file_data), file_name)}
rv = self.app.put('/v1.0/reference_file/%s' % file_name, data=data, follow_redirects=True,
content_type='multipart/form-data', headers=self.logged_in_headers())
self.assert_success(rv)
self.assertIsNotNone(rv.get_data())
json_data = json.loads(rv.get_data(as_text=True))
file = FileModelSchema().load(json_data, session=session)
self.assertEqual(FileType.xls, file.type)
self.assertEqual(FileType.xlsx, file.type)
self.assertTrue(file.is_reference)
self.assertEqual("application/vnd.ms-excel", file.content_type)
self.assertEqual("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", file.content_type)
def test_set_reference_file_bad_extension(self):
file_name = FileService.DOCUMENT_LIST
file_name = DocumentService.DOCUMENT_LIST
data = {'file': (io.BytesIO(b"abcdef"), "does_not_matter.ppt")}
rv = self.app.put('/v1.0/reference_file/%s' % file_name, data=data, follow_redirects=True,
content_type='multipart/form-data', headers=self.logged_in_headers())
@ -131,22 +136,28 @@ class TestFilesApi(BaseTest):
def test_get_reference_file(self):
file_name = "irb_document_types.xls"
data = {'file': (io.BytesIO(b"abcdef"), "some crazy thing do not care.xls")}
filepath = os.path.join(app.root_path, 'static', 'reference', 'irb_documents.xlsx')
with open(filepath, 'rb') as myfile:
file_data = myfile.read()
data = {'file': (io.BytesIO(file_data), file_name)}
rv = self.app.put('/v1.0/reference_file/%s' % file_name, data=data, follow_redirects=True,
content_type='multipart/form-data', headers=self.logged_in_headers())
rv = self.app.get('/v1.0/reference_file/%s' % file_name, headers=self.logged_in_headers())
self.assert_success(rv)
data_out = rv.get_data()
self.assertEqual(b"abcdef", data_out)
self.assertEqual(file_data, data_out)
def test_list_reference_files(self):
ExampleDataLoader.clean_db()
file_name = FileService.DOCUMENT_LIST
data = {'file': (io.BytesIO(b"abcdef"), file_name)}
file_name = DocumentService.DOCUMENT_LIST
filepath = os.path.join(app.root_path, 'static', 'reference', 'irb_documents.xlsx')
with open(filepath, 'rb') as myfile:
file_data = myfile.read()
data = {'file': (io.BytesIO(file_data), file_name)}
rv = self.app.put('/v1.0/reference_file/%s' % file_name, data=data, follow_redirects=True,
content_type='multipart/form-data', headers=self.logged_in_headers())
self.assert_success(rv)
rv = self.app.get('/v1.0/reference_file',
follow_redirects=True,
content_type="application/json", headers=self.logged_in_headers())
@ -159,7 +170,8 @@ class TestFilesApi(BaseTest):
def test_update_file_info(self):
self.load_example_data()
file: FileModel = session.query(FileModel).first()
self.create_reference_document()
file: FileModel = session.query(FileModel).filter(FileModel.is_reference==False).first()
file.name = "silly_new_name.bpmn"
rv = self.app.put('/v1.0/file/%i' % file.id,

View File

@ -1,4 +1,3 @@
import json
from SpiffWorkflow.bpmn.PythonScriptEngine import Box
@ -15,6 +14,7 @@ from crc.services.file_service import FileService
from crc.services.study_service import StudyService
from crc.services.workflow_processor import WorkflowProcessor
from crc.scripts.file_data_set import FileDataSet
from crc.services.document_service import DocumentService
class TestStudyDetailsDocumentsScript(BaseTest):
@ -43,8 +43,8 @@ class TestStudyDetailsDocumentsScript(BaseTest):
# Remove the reference file.
file_model = db.session.query(FileModel). \
filter(FileModel.is_reference == True). \
filter(FileModel.name == FileService.DOCUMENT_LIST).first()
filter(FileModel.is_reference.is_(True)). \
filter(FileModel.name == DocumentService.DOCUMENT_LIST).first()
if file_model:
db.session.query(FileDataModel).filter(FileDataModel.file_model_id == file_model.id).delete()
db.session.query(FileModel).filter(FileModel.id == file_model.id).delete()
@ -71,7 +71,7 @@ class TestStudyDetailsDocumentsScript(BaseTest):
def test_load_lookup_data(self):
self.create_reference_document()
dict = FileService.get_reference_data(FileService.DOCUMENT_LIST, 'code', ['id'])
doc_dict = DocumentService.get_dictionary()
self.assertIsNotNone(doc_dict)
def get_required_docs(self):

View File

@ -122,11 +122,11 @@ class TestStudyService(BaseTest):
self.assertEqual("Cancer Center's PRC Approval Form", documents["UVACompl_PRCAppr"]['description'])
self.assertEqual("UVA Compliance", documents["UVACompl_PRCAppr"]['category1'])
self.assertEqual("PRC Approval", documents["UVACompl_PRCAppr"]['category2'])
self.assertEqual("", documents["UVACompl_PRCAppr"]['category3'])
self.assertEqual(None, documents["UVACompl_PRCAppr"]['category3'])
self.assertEqual("CRC", documents["UVACompl_PRCAppr"]['Who Uploads?'])
self.assertEqual(0, documents["UVACompl_PRCAppr"]['count'])
self.assertEqual(True, documents["UVACompl_PRCAppr"]['required'])
self.assertEqual('6', documents["UVACompl_PRCAppr"]['id'])
self.assertEqual(6, documents["UVACompl_PRCAppr"]['id'])
@patch('crc.services.protocol_builder.ProtocolBuilderService.get_required_docs') # mock_docs
def test_get_documents_has_file_details(self, mock_docs):

View File

@ -3,9 +3,6 @@ from tests.base_test import BaseTest
from crc.services.file_service import FileService
class TestDocumentDirectories(BaseTest):
def test_directory_list(self):