cr-connect-workflow/crc/scripts/study_info.py

from pandas import ExcelFile

from crc import session, ma
from crc.api.common import ApiError
from crc.models.study import StudyModel, StudyModelSchema
from crc.scripts.script import Script, ScriptValidationError
from crc.services.file_service import FileService
from crc.services.protocol_builder import ProtocolBuilderService


class StudyInfo(Script):
    """Script task that adds information about the current study to the task data.

    The script takes exactly one argument (one of ``type_options``) and stores
    the matching information under ``task.data["study"][<argument>]``.
    """
    pb = ProtocolBuilderService()
    type_options = ['info', 'investigators', 'required_docs', 'details']
    IRB_PRO_CATEGORIES_FILE = "irb_pro_categories.xls"

    def get_description(self):
        """Return the help text describing this script and the argument it accepts."""
        return """StudyInfo [TYPE], where TYPE is one of 'info', 'investigators','required_docs', or 'details'
            Adds details about the current study to the Task Data.  The type of information required should be 
            provided as an argument.  Basic returns the basic information such as the title.  Investigators provides
            detailed information about each investigator in th study.  Details provides a large number
            of details about the study, as gathered within the protocol builder, and 'required_docs', 
            lists all the documents the Protocol Builder has determined will be required as a part of
            this study. 
        """

    def do_task(self, task, study_id, *args, **kwargs):
        """Load the requested kind of study information into ``task.data["study"]``.

        :param task: the task whose ``data`` mapping is updated in place.
        :param study_id: identifier of the study to look up.
        :param args: exactly one value, which must be one of ``type_options``.
        :raises ApiError: (code ``missing_argument``) when the argument count is
            not one, or the argument is not a recognised type.
        """
        if len(args) != 1 or (args[0] not in StudyInfo.type_options):
            raise ApiError(code="missing_argument",
                           message="The StudyInfo script requires a single argument which must be "
                                   "one of %s" % ",".join(StudyInfo.type_options))
        cmd = args[0]

        # Reuse any study data gathered by an earlier call so that successive
        # invocations accumulate keys instead of overwriting one another.
        study_info = {}
        if "study" in task.data:
            study_info = task.data["study"]

        # cmd was validated above, so exactly one of these branches runs.
        if cmd == 'info':
            study = session.query(StudyModel).filter_by(id=study_id).first()
            schema = StudyModelSchema()
            study_info["info"] = schema.dump(study)
        elif cmd == 'investigators':
            study_info["investigators"] = self.pb.get_investigators(study_id)
        elif cmd == 'required_docs':
            study_info["required_docs"] = self.get_required_docs(study_id)
        elif cmd == 'details':
            study_info["details"] = self.pb.get_study_details(study_id)
        task.data["study"] = study_info

    def get_required_docs(self, study_id):
        """Return the documents required for a study, enriched with reference data.

        Takes the required documents reported by the protocol builder and merges
        each one with the matching row (if any) of the IRB Pro Categories
        spreadsheet.

        :param study_id: identifier of the study to look up.
        :return: a list of dictionaries, each holding at least ``id`` and
            ``name``, plus the spreadsheet columns when the id is found there.
        """
        pb_docs = self.pb.get_required_docs(study_id)
        doc_dictionary = self.get_file_reference_dictionary()
        required_docs = []
        for doc in pb_docs:
            # The spreadsheet dictionary is keyed by integer id, so normalise
            # the protocol builder value before looking it up.
            doc_id = int(doc['AUXDOCID'])
            required_doc = {'id': doc_id, 'name': doc['AUXDOC']}
            if doc_id in doc_dictionary:
                # Spreadsheet columns are layered on top of the basic entry.
                required_doc = {**required_doc, **doc_dictionary[doc_id]}
            required_docs.append(required_doc)
        return required_docs

    def get_file_reference_dictionary(self):
        """Load the IRB Pro Categories xls reference file as a dictionary.

        :return: a dictionary keyed by the integer value of the ``Id`` column,
            whose values are dictionaries of the remaining columns of that row.
        """
        data_model = FileService.get_reference_file_data(StudyInfo.IRB_PRO_CATEGORIES_FILE)
        xls = ExcelFile(data_model.data)
        df = xls.parse(xls.sheet_names[0])
        # Drop records without an Id (otherwise a single blank-Id row survives
        # de-duplication and ends up as an <NA> key), keep the first row for
        # each remaining Id, and convert the Id column to an integer type.
        df = df.dropna(subset=['Id'])
        df = df.drop_duplicates(subset='Id').astype({'Id': 'Int64'})
        # Index on the Id column and convert to a dictionary, where the key is
        # the id and the value is a dictionary with all the remaining data.
        all_dict = df.set_index('Id').to_dict('index')
        return all_dict

    @staticmethod
    def validate():
        """Verify that the information this script task needs is available.

        :return: a list of ScriptValidationError objects; empty when the
            reference file could be loaded.
        """
        errors = []
        try:
            FileService.get_reference_file_data(StudyInfo.IRB_PRO_CATEGORIES_FILE)
        except ApiError as ae:
            errors.append(ScriptValidationError.from_api_error(ae))
        return errors

class RequiredDocument(object):
    """Value object describing one document required for a study.

    The protocol builder id and name are stored as ``protocol_builder_id`` and
    ``protocol_builder_name``. They are also readable as ``pb_id`` and
    ``pb_name``, which are the names RequiredDocumentSchema lists in its
    ``fields``; without those aliases the two values would have no attribute
    of the listed name to be read from.
    """

    def __init__(self, pb_id, pb_name, category1, category2, category3, who_uploads, required, total_uploaded):
        """Store the given values unchanged on the instance.

        :param pb_id: protocol builder identifier of the document.
        :param pb_name: protocol builder name of the document.
        :param category1: first category value.
        :param category2: second category value.
        :param category3: third category value.
        :param who_uploads: value describing who uploads the document.
        :param required: value describing whether the document is required.
        :param total_uploaded: value describing how many have been uploaded.
        """
        self.protocol_builder_id = pb_id
        self.protocol_builder_name = pb_name
        self.category1 = category1
        self.category2 = category2
        self.category3 = category3
        self.who_uploads = who_uploads
        self.required = required
        self.total_uploaded = total_uploaded

    @property
    def pb_id(self):
        """Read-only alias for ``protocol_builder_id``."""
        return self.protocol_builder_id

    @property
    def pb_name(self):
        """Read-only alias for ``protocol_builder_name``."""
        return self.protocol_builder_name

class RequiredDocumentSchema(ma.Schema):
    """Schema declaring which RequiredDocument values are serialized."""

    class Meta:
        # The class this schema is declared for.
        model = RequiredDocument
        # Names to serialize, one per line.
        fields = [
            "pb_id",
            "pb_name",
            "category1",
            "category2",
            "category3",
            "who_uploads",
            "required",
            "total_uploaded",
        ]
Just merging stuff real quick. 2020-03-18 21:03:36 +00:00			`from pandas import ExcelFile`

			`from crc import session, ma`
Standardizing the script tasks that can be executed on the server, adding tons of error messages for when things go wrong. All scripts must exist in side of the crc/scripts directory. Adding a new script that script tasks can use to add in data about the study. Moving all the test workflow specifications out of the main load. fixing a pile of tests so they can find workflow specs that are now moved into the test directory. 2020-03-03 18:50:22 +00:00			`from crc.api.common import ApiError`
			`from crc.models.study import StudyModel, StudyModelSchema`
Just merging stuff real quick. 2020-03-18 21:03:36 +00:00			`from crc.scripts.script import Script, ScriptValidationError`
			`from crc.services.file_service import FileService`
Standardizing the script tasks that can be executed on the server, adding tons of error messages for when things go wrong. All scripts must exist in side of the crc/scripts directory. Adding a new script that script tasks can use to add in data about the study. Moving all the test workflow specifications out of the main load. fixing a pile of tests so they can find workflow specs that are now moved into the test directory. 2020-03-03 18:50:22 +00:00			`from crc.services.protocol_builder import ProtocolBuilderService`


			`class StudyInfo(Script):`
			`"""Just your basic class that can pull in data from a few api endpoints and do a basic task."""`
			`pb = ProtocolBuilderService()`
			`type_options = ['info', 'investigators', 'required_docs', 'details']`
Just merging stuff real quick. 2020-03-18 21:03:36 +00:00			`IRB_PRO_CATEGORIES_FILE = "irb_pro_categories.xls"`
Standardizing the script tasks that can be executed on the server, adding tons of error messages for when things go wrong. All scripts must exist in side of the crc/scripts directory. Adding a new script that script tasks can use to add in data about the study. Moving all the test workflow specifications out of the main load. fixing a pile of tests so they can find workflow specs that are now moved into the test directory. 2020-03-03 18:50:22 +00:00
			`def get_description(self):`
Adding a simple endpoint that describes what scripts are currently available, along with a brief description. 2020-03-03 20:30:42 +00:00			`return """StudyInfo [TYPE], where TYPE is one of 'info', 'investigators','required_docs', or 'details'`
Standardizing the script tasks that can be executed on the server, adding tons of error messages for when things go wrong. All scripts must exist in side of the crc/scripts directory. Adding a new script that script tasks can use to add in data about the study. Moving all the test workflow specifications out of the main load. fixing a pile of tests so they can find workflow specs that are now moved into the test directory. 2020-03-03 18:50:22 +00:00			`Adds details about the current study to the Task Data. The type of information required should be`
			`provided as an argument. Basic returns the basic information such as the title. Investigators provides`
			`detailed information about each investigator in th study. Details provides a large number`
			`of details about the study, as gathered within the protocol builder, and 'required_docs',`
			`lists all the documents the Protocol Builder has determined will be required as a part of`
			`this study.`
			`"""`

			`def do_task(self, task, study_id, args, *kwargs):`
			`if len(args) != 1 or (args[0] not in StudyInfo.type_options):`
			`raise ApiError(code="missing_argument",`
			`message="The StudyInfo script requires a single argument which must be "`
			`"one of %s" % ",".join(StudyInfo.type_options))`
			`cmd = args[0]`
Fix data being over-written in the study_info. 2020-03-09 16:41:35 +00:00			`study_info = {}`
			`if "study" in task.data:`
			`study_info = task.data["study"]`

Standardizing the script tasks that can be executed on the server, adding tons of error messages for when things go wrong. All scripts must exist in side of the crc/scripts directory. Adding a new script that script tasks can use to add in data about the study. Moving all the test workflow specifications out of the main load. fixing a pile of tests so they can find workflow specs that are now moved into the test directory. 2020-03-03 18:50:22 +00:00			`if cmd == 'info':`
			`study = session.query(StudyModel).filter_by(id=study_id).first()`
			`schema = StudyModelSchema()`
Fix data being over-written in the study_info. 2020-03-09 16:41:35 +00:00			`study_info["info"] = schema.dump(study)`
Standardizing the script tasks that can be executed on the server, adding tons of error messages for when things go wrong. All scripts must exist in side of the crc/scripts directory. Adding a new script that script tasks can use to add in data about the study. Moving all the test workflow specifications out of the main load. fixing a pile of tests so they can find workflow specs that are now moved into the test directory. 2020-03-03 18:50:22 +00:00			`if cmd == 'investigators':`
Fix data being over-written in the study_info. 2020-03-09 16:41:35 +00:00			`study_info["investigators"] = self.pb.get_investigators(study_id)`
Standardizing the script tasks that can be executed on the server, adding tons of error messages for when things go wrong. All scripts must exist in side of the crc/scripts directory. Adding a new script that script tasks can use to add in data about the study. Moving all the test workflow specifications out of the main load. fixing a pile of tests so they can find workflow specs that are now moved into the test directory. 2020-03-03 18:50:22 +00:00			`if cmd == 'required_docs':`
Just merging stuff real quick. 2020-03-18 21:03:36 +00:00			`study_info["required_docs"] = self.get_required_docs(study_id)`
Standardizing the script tasks that can be executed on the server, adding tons of error messages for when things go wrong. All scripts must exist in side of the crc/scripts directory. Adding a new script that script tasks can use to add in data about the study. Moving all the test workflow specifications out of the main load. fixing a pile of tests so they can find workflow specs that are now moved into the test directory. 2020-03-03 18:50:22 +00:00			`if cmd == 'details':`
Fix data being over-written in the study_info. 2020-03-09 16:41:35 +00:00			`study_info["details"] = self.pb.get_study_details(study_id)`
			`task.data["study"] = study_info`
Fixes #12: Catching some specific common errors and re-raising as APIErrors with detailed codes and descriptions to improve debugging. In doing so, improving the error handling in the event a soft-reset causes an immediate error - and resetting to the original version of the specification in these events, to allow users the chance to try a hard reset instead. 2020-03-11 20:33:18 +00:00
			`def get_required_docs(self, study_id):`
Just merging stuff real quick. 2020-03-18 21:03:36 +00:00			`"""Takes data from the protocol builder, and merges it with data from the IRB Pro Categories spreadsheet to return`
			`pertinant details about the required documents."""`
			`pb_docs = self.pb.get_required_docs(study_id)`
Mergers details from the irb_documents.xlsx into the values returned from the Protocol Builder to create a more complete picture of required document details. 2020-03-19 14:23:50 +00:00			`doc_dictionary = self.get_file_reference_dictionary()`
Just merging stuff real quick. 2020-03-18 21:03:36 +00:00			`required_docs = []`
			`for doc in pb_docs:`
Mergers details from the irb_documents.xlsx into the values returned from the Protocol Builder to create a more complete picture of required document details. 2020-03-19 14:23:50 +00:00			`id = int(doc['AUXDOCID'])`
			`required_doc = {'id': id, 'name': doc['AUXDOC']}`
			`if id in doc_dictionary:`
			`required_doc = {required_doc, doc_dictionary[id]}`
			`required_docs.append(required_doc)`
Fixes code smell issues identified by SonarCloud 2020-03-16 12:31:19 +00:00			`return required_docs`
Fixes #12: Catching some specific common errors and re-raising as APIErrors with detailed codes and descriptions to improve debugging. In doing so, improving the error handling in the event a soft-reset causes an immediate error - and resetting to the original version of the specification in these events, to allow users the chance to try a hard reset instead. 2020-03-11 20:33:18 +00:00
Mergers details from the irb_documents.xlsx into the values returned from the Protocol Builder to create a more complete picture of required document details. 2020-03-19 14:23:50 +00:00
Just merging stuff real quick. 2020-03-18 21:03:36 +00:00			`def get_file_reference_dictionary(self):`
			`"""Loads up the xsl file that contains the IRB Pro Categories and converts it to a Panda's data frame for processing."""`
			`data_model = FileService.get_reference_file_data(StudyInfo.IRB_PRO_CATEGORIES_FILE)`
			`xls = ExcelFile(data_model.data)`
			`df = xls.parse(xls.sheet_names[0])`
Mergers details from the irb_documents.xlsx into the values returned from the Protocol Builder to create a more complete picture of required document details. 2020-03-19 14:23:50 +00:00			`# Pandas is lovely, but weird. Here we drop records without an Id, and convert it to an integer.`
			`df = df.drop_duplicates(subset='Id').astype({'Id': 'Int64'})`
			`# Now we index on the ID column and convert to a dictionary, where the key is the id, and the value`
			`# is a dictionary with all the remaining data in it. It's kinda pretty really.`
			`all_dict = df.set_index('Id').to_dict('index')`
			`return all_dict`
Just merging stuff real quick. 2020-03-18 21:03:36 +00:00
			`# Verifies that information is available for this script task to function`
			`# correctly. Returns a list of validation errors.`
			`@staticmethod`
			`def validate():`
			`errors = []`
			`try:`
			`FileService.get_reference_file_data(StudyInfo.IRB_PRO_CATEGORIES_FILE)`
			`except ApiError as ae:`
			`errors.append(ScriptValidationError.from_api_error(ae))`
			`return errors`

			`class RequiredDocument(object):`
			`def __init__(self, pb_id, pb_name, category1, category2, category3, who_uploads, required, total_uploaded):`
			`self.protocol_builder_id = pb_id`
			`self.protocol_builder_name = pb_name`
			`self.category1 = category1`
			`self.category2 = category2`
			`self.category3 = category3`
			`self.who_uploads = who_uploads`
			`self.required = required`
			`self.total_uploaded = total_uploaded`

			`class RequiredDocumentSchema(ma.Schema):`
			`class Meta:`
			`model = RequiredDocument`
			`fields = ["pb_id", "pb_name", "category1", "category2", "category3",`
			`"who_uploads", "required", "total_uploaded"]`