Create lookup tables for XLS files referenced in workflows so we can do full text searches and populate lists on the fly quickly.

This commit is contained in:
Dan Funk 2020-04-22 15:37:02 -04:00
parent 4ab51fe8f2
commit 6de8c8b977
6 changed files with 148 additions and 16 deletions

View File

@ -20,6 +20,8 @@ class Task(object):
ENUM_OPTIONS_FILE_PROP = "enum.options.file"
EMUM_OPTIONS_VALUE_COL_PROP = "enum.options.value.column"
EMUM_OPTIONS_LABEL_COL_PROP = "enum.options.label.column"
EMUM_OPTIONS_AS_LOOKUP = "enum.options.lookup"
def __init__(self, id, name, title, type, state, form, documentation, data,
mi_type, mi_count, mi_index, properties):

View File

@ -1,8 +1,10 @@
import enum
from typing import cast
from marshmallow_enum import EnumField
from marshmallow_sqlalchemy import SQLAlchemyAutoSchema
from sqlalchemy import func
from sqlalchemy import func, Index, text
from sqlalchemy.dialects import postgresql
from sqlalchemy.dialects.postgresql import UUID
from crc import db
@ -91,3 +93,42 @@ class FileModelSchema(SQLAlchemyAutoSchema):
include_relationships = True
include_fk = True # Includes foreign keys
type = EnumField(FileType)
class LookupFileModel(db.Model):
    """Metadata describing one lookup table generated from a file.

    The content of a file (such as an xlsx or csv file) can be turned into a
    key/value store used for lookups and searches.  Each row of this table
    records which version of the file was used and which value and label
    columns were chosen when that store was generated.  One xls file may
    therefore have several lookup file models - when different value/label
    columns are requested, or when someone makes a change.

    Full text search is intended to cover only the label and value columns
    rather than every column, because we do not know how much information the
    remaining columns will hold.
    """
    __tablename__ = 'lookup_file'

    id = db.Column(db.Integer, primary_key=True)
    # Name of the file column that supplies the label of each lookup row.
    label_column = db.Column(db.String)
    # Name of the file column that supplies the value of each lookup row.
    value_column = db.Column(db.String)
    # The specific file data record (row of 'file_data') this lookup was built from.
    file_data_model_id = db.Column(db.Integer, db.ForeignKey('file_data.id'))
class LookupDataModel(db.Model):
    """A single value/label entry that belongs to a LookupFileModel."""
    __tablename__ = 'lookup_data'

    id = db.Column(db.Integer, primary_key=True)
    lookup_file_model_id = db.Column(db.Integer, db.ForeignKey('lookup_file.id'))
    lookup_file_model = db.relationship(LookupFileModel)
    value = db.Column(db.String)
    label = db.Column(db.String)
    # All data for the originating row is kept in this JSON column, but it is
    # not searched presently.  In the future an additional "search" column
    # might be added if we want to search things that are not in the label.
    data = db.Column(db.JSON)

    # GIN index over the English text-search vector of the label column, so
    # that label searches can return quickly.  Query with:
    #   search_results = LookupDataModel.query.filter(LookupDataModel.label.match("INTERNAL")).all()
    # NOTE(review): the index expression is to_tsvector('english', label); a
    # plain label.match(...) may render a different expression and so might
    # not use this index - confirm with EXPLAIN on the target database.
    __table_args__ = (
        Index('ix_lookupdata_tsv', func.to_tsvector('english', label), postgresql_using='gin'),
    )

View File

@ -117,6 +117,7 @@ class Study(object):
self.categories = categories
self.warnings = []
@classmethod
def from_model(cls, study_model: StudyModel):
args = {k: v for k, v in study_model.__dict__.items() if not k.startswith('_')}

View File

@ -198,7 +198,7 @@ class FileService(object):
@staticmethod
def get_workflow_file_data(workflow, file_name):
"""Given a SPIFF Workflow Model, tracks down a file with the given name in the datbase and returns it's data"""
"""Given a SPIFF Workflow Model, tracks down a file with the given name in the database and returns it's data"""
workflow_spec_model = FileService.find_spec_model_in_db(workflow)
study_id = workflow.data[WorkflowProcessor.STUDY_ID_KEY]

View File

@ -1,6 +1,4 @@
from SpiffWorkflow.bpmn.specs.ManualTask import ManualTask
from SpiffWorkflow.bpmn.specs.MultiInstanceTask import MultiInstanceTask
from SpiffWorkflow.bpmn.specs.NoneTask import NoneTask
from SpiffWorkflow.bpmn.specs.ScriptTask import ScriptTask
from SpiffWorkflow.bpmn.specs.UserTask import UserTask
from SpiffWorkflow.bpmn.workflow import BpmnWorkflow
@ -8,11 +6,13 @@ from SpiffWorkflow.dmn.specs.BuisnessRuleTask import BusinessRuleTask
from SpiffWorkflow.specs import CancelTask, StartTask
from pandas import ExcelFile
from crc import db
from crc.api.common import ApiError
from crc.models.api_models import Task, MultiInstanceType
import jinja2
from jinja2 import Template
from crc.models.file import FileDataModel, LookupFileModel, LookupDataModel
from crc.services.file_service import FileService
from crc.services.workflow_processor import WorkflowProcessor, CustomBpmnScriptEngine
from SpiffWorkflow import Task as SpiffTask, WorkflowException
@ -141,10 +141,10 @@ class WorkflowService(object):
if not field.has_property(Task.EMUM_OPTIONS_VALUE_COL_PROP) or \
not field.has_property(Task.EMUM_OPTIONS_LABEL_COL_PROP):
raise ApiError.from_task("invalid_emum",
"For emumerations based on an xls file, you must include 3 properties: %s, "
"%s, and %s, you supplied %s" % (Task.ENUM_OPTIONS_FILE_PROP,
Task.EMUM_OPTIONS_VALUE_COL_PROP,
Task.EMUM_OPTIONS_LABEL_COL_PROP),
"For enumerations based on an xls file, you must include 3 properties: %s, "
"%s, and %s" % (Task.ENUM_OPTIONS_FILE_PROP,
Task.EMUM_OPTIONS_VALUE_COL_PROP,
Task.EMUM_OPTIONS_LABEL_COL_PROP),
task=spiff_task)
# Get the file data from the File Service
@ -152,15 +152,51 @@ class WorkflowService(object):
value_column = field.get_property(Task.EMUM_OPTIONS_VALUE_COL_PROP)
label_column = field.get_property(Task.EMUM_OPTIONS_LABEL_COL_PROP)
data_model = FileService.get_workflow_file_data(spiff_task.workflow, file_name)
lookup_model = WorkflowService.get_lookup_table(data_model, value_column, label_column)
# If lookup is set to true, do not populate options, a lookup will happen later.
if field.has_property(Task.EMUM_OPTIONS_AS_LOOKUP) and field.get_property(Task.EMUM_OPTIONS_AS_LOOKUP):
pass
else:
data = db.session.query(LookupDataModel).filter(LookupDataModel.lookup_file_model == lookup_model).all()
for d in data:
field.options.append({"id": d.value, "name": d.label})
@staticmethod
def get_lookup_table(data_model: FileDataModel, value_column, label_column):
    """Return the lookup table metadata for a file, creating the table on first use.

    In some cases the lookup table can be very large.  This method adds all
    values to the database in a way that can be searched and returned via an
    api call - rather than sending the full set of options along with the
    form.  The file is only opened and processed when no lookup table exists
    yet for this file data / value column / label column combination.

    :param data_model: the file data record whose ``data`` holds the spreadsheet.
    :param value_column: name of the spreadsheet column stored as each row's value.
    :param label_column: name of the spreadsheet column stored as each row's label.
    :return: the existing or newly created LookupFileModel.
    :raises ApiError: ("invalid_emum") if either column is missing from the sheet.
    """
    lookup_model = db.session.query(LookupFileModel) \
        .filter(LookupFileModel.file_data_model_id == data_model.id) \
        .filter(LookupFileModel.value_column == value_column) \
        .filter(LookupFileModel.label_column == label_column).first()
    if lookup_model:
        # Already generated for this exact file version and column pair.
        return lookup_model

    xls = ExcelFile(data_model.data)
    df = xls.parse(xls.sheet_names[0])  # Currently we only look at the first sheet.

    # Validate the value column first, then the label column, before writing anything.
    for column in (value_column, label_column):
        if column not in df:
            raise ApiError("invalid_emum",
                           "The file %s does not contain a column named % s" % (data_model.file_model.name,
                                                                                 column))

    lookup_model = LookupFileModel(label_column=label_column, value_column=value_column,
                                   file_data_model_id=data_model.id)
    db.session.add(lookup_model)
    for _, row in df.iterrows():
        # NOTE(review): row.to_json() yields a JSON-encoded string; storing it in a
        # JSON column presumably double-encodes the row - confirm this is intended.
        lookup_data = LookupDataModel(lookup_file_model=lookup_model,
                                      value=row[value_column],
                                      label=row[label_column],
                                      data=row.to_json())
        db.session.add(lookup_data)
    # Persist the metadata and all of its rows in a single commit.
    db.session.commit()
    return lookup_model

View File

@ -1,12 +1,15 @@
import json
import os
from sqlalchemy import func
from crc import session, app
from crc.models.api_models import WorkflowApiSchema, Task
from crc.models.file import FileModelSchema
from crc.models.file import FileModelSchema, FileDataModel, FileModel, LookupFileModel, LookupDataModel
from crc.models.stats import WorkflowStatsModel, TaskEventModel
from crc.models.study import StudyModel
from crc.models.workflow import WorkflowSpecModelSchema, WorkflowModel, WorkflowStatus
from crc.services.file_service import FileService
from crc.services.workflow_processor import WorkflowProcessor
from crc.services.workflow_service import WorkflowService
from tests.base_test import BaseTest
@ -75,5 +78,54 @@ class TestWorkflowService(BaseTest):
WorkflowService._process_options(task, task.task_spec.form.fields[0])
options = task.task_spec.form.fields[0].options
self.assertEquals(19, len(options))
self.assertEquals(1000, options[0]['id'])
self.assertEquals("UVA - INTERNAL - GM USE ONLY", options[0]['name'])
self.assertEquals('1000', options[0]['id'])
self.assertEquals("UVA - INTERNAL - GM USE ONLY", options[0]['name'])
def test_create_lookup_file(self):
    """Building a lookup table stores its metadata and rows, and labels can be full-text searched."""
    # Presumably loading this spec is what places customer_list.xls in the database.
    self.load_test_spec('enum_options_from_file')
    file_model = session.query(FileModel).filter(FileModel.name == "customer_list.xls").first()
    file_data_model = session.query(FileDataModel).filter(FileDataModel.file_model == file_model).first()

    WorkflowService.get_lookup_table(file_data_model, "CUSTOMER_NUMBER", "CUSTOMER_NAME")

    # Exactly one metadata record, remembering which columns were used.
    lookup_records = session.query(LookupFileModel).all()
    self.assertIsNotNone(lookup_records)
    self.assertEqual(1, len(lookup_records))
    lookup_record = lookup_records[0]
    self.assertIsNotNone(lookup_record)
    self.assertEqual("CUSTOMER_NUMBER", lookup_record.value_column)
    self.assertEqual("CUSTOMER_NAME", lookup_record.label_column)

    # One data row per spreadsheet row; values are stored as strings.
    lookup_data = session.query(LookupDataModel).filter(LookupDataModel.lookup_file_model == lookup_record).all()
    self.assertEqual(19, len(lookup_data))
    self.assertEqual("1000", lookup_data[0].value)
    self.assertEqual("UVA - INTERNAL - GM USE ONLY", lookup_data[0].label)

    # Full text search over the label matches regardless of case.
    search_results = LookupDataModel.query.filter(LookupDataModel.label.match("INTERNAL")).all()
    self.assertEqual(1, len(search_results))
    search_results = LookupDataModel.query.filter(LookupDataModel.label.match("internal")).all()
    self.assertEqual(1, len(search_results))
    # This query finds results where a word starts with "bio"
    search_results = LookupDataModel.query.filter(LookupDataModel.label.match("bio:*")).all()
    self.assertEqual(2, len(search_results))
def test_create_lookup_file_multiple_times_does_not_update_database(self):
    """Repeated requests for the same lookup reuse the stored table; new column pairs add one."""
    # Presumably loading this spec is what places customer_list.xls in the database.
    self.load_test_spec('enum_options_from_file')
    file_model = session.query(FileModel).filter(FileModel.name == "customer_list.xls").first()
    file_data_model = session.query(FileDataModel).filter(FileDataModel.file_model == file_model).first()

    # Ask for the very same lookup three times over.
    WorkflowService.get_lookup_table(file_data_model, "CUSTOMER_NUMBER", "CUSTOMER_NAME")
    WorkflowService.get_lookup_table(file_data_model, "CUSTOMER_NUMBER", "CUSTOMER_NAME")
    WorkflowService.get_lookup_table(file_data_model, "CUSTOMER_NUMBER", "CUSTOMER_NAME")
    lookup_records = session.query(LookupFileModel).all()
    self.assertIsNotNone(lookup_records)
    self.assertEqual(1, len(lookup_records))
    lookup_record = lookup_records[0]
    lookup_data = session.query(LookupDataModel).filter(LookupDataModel.lookup_file_model == lookup_record).all()
    self.assertEqual(19, len(lookup_data))

    # Using the same table with a different lookup label or value does create additional records.
    WorkflowService.get_lookup_table(file_data_model, "CUSTOMER_NAME", "CUSTOMER_NUMBER")
    lookup_records = session.query(LookupFileModel).all()
    self.assertIsNotNone(lookup_records)
    self.assertEqual(2, len(lookup_records))