import copy

from io import BytesIO

import jinja2
from docxtpl import DocxTemplate, Listing

from crc import session
from crc.api.common import ApiError
from crc.models.file import CONTENT_TYPES
from crc.models.workflow import WorkflowModel
from crc.scripts.script import Script
from crc.services.file_service import FileService
from crc.services.workflow_processor import WorkflowProcessor


class CompleteTemplate(Script):
    """Render a Jinja-marked-up Word (docx) template with the current task's data.

    The rendered document is stored as an IRB task file via FileService.  Two
    positional args are expected on every entry point: the template file name
    and the IRB document code.
    """

    def get_description(self):
        return """
Using the Jinja template engine, takes data available in the current task, and uses it to populate
a word document that contains Jinja markup.  Please see https://docxtpl.readthedocs.io/en/latest/
for more information on exact syntax.
Takes two arguments:
1. The name of a MS Word docx file to use as a template.
2. The 'code' of the IRB Document as set in the irb_documents.xlsx file.
"""

    def do_task_validate_only(self, task, study_id, *args, **kwargs):
        """For validation only, process the template, but do not store it in the database."""
        self.process_template(task, study_id, *args, **kwargs)

    def do_task(self, task, study_id, *args, **kwargs):
        """Render the template and persist the result as a task file on the workflow.

        ``args[0]`` is the docx template name, ``args[1]`` the IRB document
        code; both are validated inside process_template before being read here.
        """
        workflow_id = task.workflow.data[WorkflowProcessor.WORKFLOW_ID_KEY]
        # Render first, so argument/ownership errors surface before any DB writes.
        final_document_stream = self.process_template(task, study_id, *args, **kwargs)
        workflow = session.query(WorkflowModel).filter(WorkflowModel.id == workflow_id).first()
        file_name = args[0]
        irb_doc_code = args[1]
        FileService.add_task_file(study_id=study_id,
                                  workflow_id=workflow_id,
                                  workflow_spec_id=workflow.workflow_spec_id,
                                  task_id=task.id,
                                  name=file_name,
                                  content_type=CONTENT_TYPES['docx'],
                                  binary_data=final_document_stream.read(),
                                  irb_doc_code=irb_doc_code)

    def process_template(self, task, study_id, *args, **kwargs):
        """Entry point, mostly worried about wiring it all up.

        Validates the two required arguments and that the task belongs to the
        given study, then renders the named template with the task's data.
        Returns a BytesIO stream of the rendered document.
        Raises ApiError (missing_argument / invalid_argument) on bad input.
        """
        if len(args) != 2:
            raise ApiError(code="missing_argument",
                           message="The CompleteTemplate script requires 2 arguments. The first argument is "
                                   "the name of the docx template to use. The second "
                                   "argument is a code for the document, as "
                                   "set in the reference document %s. " % FileService.DOCUMENT_LIST)
        task_study_id = task.workflow.data[WorkflowProcessor.STUDY_ID_KEY]
        file_name = args[0]

        # Guard against a task being processed under the wrong study.
        if task_study_id != study_id:
            raise ApiError(code="invalid_argument",
                           message="The given task does not match the given study.")

        file_data_model = FileService.get_workflow_file_data(task.workflow, file_name)
        return self.make_template(BytesIO(file_data_model.data), task.data)

    def make_template(self, binary_stream, context):
        """Render the docx template in binary_stream with context; return a rewound BytesIO.

        The context is deep-copied so the task's data is never mutated, then
        passed through rich_text_update so embedded newlines survive rendering.
        """
        doc = DocxTemplate(binary_stream)
        doc_context = copy.deepcopy(context)
        doc_context = self.rich_text_update(doc_context)
        # Autoescape so task data cannot inject raw XML into the document.
        jinja_env = jinja2.Environment(autoescape=True)
        doc.render(doc_context, jinja_env)
        target_stream = BytesIO()
        doc.save(target_stream)
        target_stream.seek(0)  # move to the beginning of the stream.
        return target_stream

    def rich_text_update(self, context):
        """This is a bit of a hack. If we find that /n characters exist in the data, we want
        these to come out in the final document without requiring someone to predict it in the
        template. Ideally we would use the 'RichText' feature of the python-docx library, but
        that requires we both escape it here, and in the Docx template. There is a thing called
        a 'listing' in python-docx library that only requires we use it on the way in, and the
        template doesn't have to think about it. So running with that for now."""
        # loop through the content, identify anything that has a newline character in it, and
        # wrap that sucker in a 'listing' function.
        if isinstance(context, dict):
            for k, v in context.items():
                context[k] = self.rich_text_update(v)
        elif isinstance(context, list):
            for i in range(len(context)):
                context[i] = self.rich_text_update(context[i])
        elif isinstance(context, str) and '\n' in context:
            return Listing(context)
        return context
|