mirror of https://github.com/sartography/cr-connect-workflow.git
synced 2025-02-23 13:18:35 +00:00
The Jinja2 service was treating all the task data as a possible template; modified it to only include the referenced variables. (This turned out not to be the problem, but it seems like a good idea to keep it in place.) There is a terrible bug in the wordwrap filter that will die without any details if you pass it a value of None. We now capture that terrible error and replace it with a sensible one. Removed an unused Jinja_extensions file.
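A minimal sketch of the two fixes described in the commit message above; the function name, caught error types, and messages here are illustrative assumptions, not code from this repository:

from jinja2 import Environment, meta


def render_safely(template_text, task_data):
    # Pass only the variables the template actually references, rather than
    # treating all of the task data as possible template input.
    env = Environment()
    referenced = meta.find_undeclared_variables(env.parse(template_text))
    data = {key: value for key, value in task_data.items() if key in referenced}
    try:
        return env.from_string(template_text).render(**data)
    except (TypeError, AttributeError) as e:
        # The wordwrap filter dies without any useful detail when handed None;
        # replace that opaque failure with an error naming the likely cause.
        raise Exception("Error rendering template. A filter such as wordwrap may have "
                        "received a None value. Referenced variables: %s. Original error: %s"
                        % (', '.join(sorted(referenced)), e))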
268 lines
14 KiB
Python
import logging
import math
import re
from collections import OrderedDict
from zipfile import BadZipFile

import pandas as pd
from pandas import ExcelFile
from pandas._libs.missing import NA
from sqlalchemy import desc
from sqlalchemy.sql.functions import GenericFunction

from crc import db
from crc.api.common import ApiError
from crc.models.api_models import Task
from crc.models.file import LookupFileModel, LookupDataModel
from crc.models.ldap import LdapSchema
from crc.models.workflow import WorkflowModel
from crc.services.spec_file_service import SpecFileService
from crc.services.reference_file_service import ReferenceFileService
from crc.services.ldap_service import LdapService
from crc.services.workflow_processor import WorkflowProcessor
from crc.services.workflow_spec_service import WorkflowSpecService


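# Subclassing GenericFunction registers the Postgres full-text ranking function
# with SQLAlchemy, making it callable in queries as func.full_text.ts_rank.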
class TSRank(GenericFunction):
    package = 'full_text'
    name = 'ts_rank'


class LookupService(object):
    """Provides tools for doing lookups for auto-complete fields, and rapid access to any
    uploaded spreadsheets.

    This can currently take three forms:
    1) Lookup from spreadsheet data associated with a workflow specification,
       in which case we store the spreadsheet data in a lookup table with full
       text indexing enabled, and run searches against that table.
    2) Lookup from spreadsheet data associated with a specific file. This allows us
       to get a lookup model for a specific file object, such as a reference file.
    3) Lookup from LDAP records, in which case we call out to an external service
       to pull back detailed records and return them.

    I could imagine this growing to include other external services as tools to handle
    lookup fields.  I could also imagine using some sort of local cache so we don't
    unnecessarily pound on external services for repeat searches for the same records.
    """

    @staticmethod
    def get_lookup_model(spiff_task, field):
        workflow_id = spiff_task.workflow.data[WorkflowProcessor.WORKFLOW_ID_KEY]
        workflow = db.session.query(WorkflowModel).filter(WorkflowModel.id == workflow_id).first()
        return LookupService.__get_lookup_model(workflow, spiff_task.task_spec.name, field.id)

    @staticmethod
    def get_lookup_model_for_reference(file_name, value_column, label_column):
        timestamp = ReferenceFileService().timestamp(file_name)
        lookup_model = db.session.query(LookupFileModel). \
            filter(LookupFileModel.file_name == file_name). \
            filter(LookupFileModel.workflow_spec_id == None). \
            filter(LookupFileModel.file_timestamp == timestamp). \
            first()  # use "== None", not "is None", which does NOT work in a SQLAlchemy filter and makes this constantly expensive.
        if not lookup_model:
            logging.warning("!!!! Making a very expensive call to update the lookup model.")
            file_data = ReferenceFileService().get_data(file_name)
            lookup_model = LookupService.build_lookup_table(file_name, file_data, timestamp, value_column, label_column)
        return lookup_model

    @staticmethod
    def __get_lookup_model(workflow, task_spec_id, field_id):
        lookup_model = db.session.query(LookupFileModel) \
            .filter(LookupFileModel.workflow_spec_id == workflow.workflow_spec_id) \
            .filter(LookupFileModel.field_id == field_id) \
            .filter(LookupFileModel.task_spec_id == task_spec_id) \
            .order_by(desc(LookupFileModel.id)).first()

        # The above may return a model; if it does, it might still be out of date.
        # We need to check the file date to assure we have the most recent file.
        is_current = False
        if lookup_model:
            if lookup_model.is_ldap:  # LDAP is always current
                is_current = True
            elif lookup_model.file_name is not None and lookup_model.file_timestamp is not None:
                # In some legacy cases, the lookup model might exist but not have a file name,
                # in which case we need to rebuild.
                workflow_spec = WorkflowSpecService().get_spec(workflow.workflow_spec_id)
                timestamp = SpecFileService.timestamp(workflow_spec, lookup_model.file_name)
                # Assure we have the same timestamp. Storage in the database might create slight variations
                # in the floating point values, so just assure the values match to within a second.
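                # For example (hypothetical values): 1650000000.2 on disk vs. 1650000000.9 in the
                # database truncates to a difference of zero, so the model still counts as current.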
                is_current = int(timestamp - lookup_model.file_timestamp) == 0

        if not is_current:
            # Very, very expensive, but we don't know we need this till we do.
            logging.warning("!!!! Making a very expensive call to update the lookup models.")
            lookup_model = LookupService.create_lookup_model(workflow, task_spec_id, field_id)

        return lookup_model

    @staticmethod
    def lookup(workflow, task_spec_id, field_id, query, value=None, limit=10):
        """Returns a list of dictionaries, one per matching lookup record."""
        lookup_model = LookupService.__get_lookup_model(workflow, task_spec_id, field_id)

        if lookup_model.is_ldap:
            return LookupService._run_ldap_query(query, value, limit)
        else:
            return LookupService._run_lookup_query(lookup_model, query, value, limit)

    @staticmethod
    def create_lookup_model(workflow_model, task_spec_id, field_id):
        """
        This is all really expensive, but should happen just once (per file change).

        Checks to see if the options are provided in a separate lookup table associated with the workflow, and if so,
        assures that data exists in the database, and returns a model that can be used to locate that data.

        Returns: a LookupFileModel that can be used to locate the lookup data.
        """
        processor = WorkflowProcessor(workflow_model)  # VERY expensive, ludicrous for lookup / type-ahead
        spec, field = processor.find_spec_and_field(task_spec_id, field_id)

        # Clear out all existing lookup models for this workflow and field.
        existing_models = db.session.query(LookupFileModel) \
            .filter(LookupFileModel.workflow_spec_id == workflow_model.workflow_spec_id) \
            .filter(LookupFileModel.task_spec_id == task_spec_id) \
            .filter(LookupFileModel.field_id == field_id).all()
        for model in existing_models:  # Do it one at a time to cause the required cascade of deletes.
            db.session.delete(model)

        # Use the contents of a file to populate enum field options
        if field.has_property(Task.FIELD_PROP_SPREADSHEET_NAME):
            if not (field.has_property(Task.FIELD_PROP_VALUE_COLUMN) and
                    field.has_property(Task.FIELD_PROP_LABEL_COLUMN)):
                raise ApiError.from_task_spec("invalid_enum",
                                              "For enumerations based on an xls file, you must include 3 properties: %s, "
                                              "%s, and %s" % (Task.FIELD_PROP_SPREADSHEET_NAME,
                                                              Task.FIELD_PROP_VALUE_COLUMN,
                                                              Task.FIELD_PROP_LABEL_COLUMN),
                                              task_spec=spec)

            # Get the file data from the File Service
            file_name = field.get_property(Task.FIELD_PROP_SPREADSHEET_NAME)
            value_column = field.get_property(Task.FIELD_PROP_VALUE_COLUMN)
            label_column = field.get_property(Task.FIELD_PROP_LABEL_COLUMN)
            # TODO: workflow_model does not have a workflow_spec. It has a workflow_spec_id.
            workflow_spec = WorkflowSpecService().get_spec(workflow_model.workflow_spec_id)
            latest_files = SpecFileService().get_files(workflow_spec, file_name=file_name)
            if len(latest_files) < 1:
                raise ApiError("invalid_enum", "Unable to locate the lookup data file '%s'" % file_name)
            else:
                file = latest_files[0]

            file_data = SpecFileService().get_data(workflow_spec, file_name)
            timestamp = SpecFileService.timestamp(workflow_spec, file_name)

            lookup_model = LookupService.build_lookup_table(file_name, file_data, timestamp, value_column, label_column,
                                                            workflow_model.workflow_spec_id, task_spec_id, field_id)

        # Use the results of an LDAP request to populate enum field options
        elif field.has_property(Task.FIELD_PROP_LDAP_LOOKUP):
            lookup_model = LookupFileModel(workflow_spec_id=workflow_model.workflow_spec_id,
                                           task_spec_id=task_spec_id,
                                           field_id=field_id,
                                           is_ldap=True)

        else:
            raise ApiError.from_task_spec("unknown_lookup_option",
                                          "Lookup supports using spreadsheet or LDAP options, "
                                          "and neither of those was provided.", spec)
        db.session.add(lookup_model)
        db.session.commit()
        return lookup_model

    @staticmethod
    def build_lookup_table(file_name, file_data, timestamp, value_column, label_column,
                           workflow_spec_id=None, task_spec_id=None, field_id=None):
        """In some cases the lookup table can be very large. This method will add all values to the database
        in a way that can be searched and returned via an api call - rather than sending the full set of
        options along with the form.  It will only open the file and process the options if something has
        changed."""
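        # Each spreadsheet row becomes one searchable LookupDataModel record. A (hypothetical)
        # row CODE="UVA001", NAME="University of Virginia" would be stored with value="UVA001",
        # label="University of Virginia", and data={'CODE': 'UVA001', 'NAME': 'University of Virginia'}.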
        try:
            xlsx = ExcelFile(file_data, engine='openpyxl')
            # Pandas, or at least openpyxl, cannot read old xls files.
            # The error comes back as zipfile.BadZipFile because xlsx files are zipped xml files.
        except BadZipFile:
            raise ApiError(code='excel_error',
                           message=f"Error opening excel file {file_name}. You may have an older .xls spreadsheet. "
                                   f"(workflow_spec_id: {workflow_spec_id}, task_spec_id: {task_spec_id}, "
                                   f"and field_id: {field_id})")
        df = xlsx.parse(xlsx.sheet_names[0])  # Currently we only look at the first sheet.
        df = df.convert_dtypes()
        df = df.loc[:, ~df.columns.str.contains('^Unnamed')]  # Drop unnamed columns.
        df = pd.DataFrame(df).dropna(how='all')  # Drop rows that are entirely null.
        for (column_name, column_data) in df.iteritems():
            data_type = df.dtypes[column_name].name
            if data_type == 'string':
                df[column_name] = df[column_name].fillna('')
            else:
                df[column_name] = df[column_name].fillna(0)
        if value_column not in df:
            raise ApiError("invalid_enum",
                           "The file %s does not contain a column named %s" % (file_name, value_column))
        if label_column not in df:
            raise ApiError("invalid_enum",
                           "The file %s does not contain a column named %s" % (file_name, label_column))

        lookup_model = LookupFileModel(workflow_spec_id=workflow_spec_id,
                                       field_id=field_id,
                                       task_spec_id=task_spec_id,
                                       file_name=file_name,
                                       file_timestamp=timestamp,
                                       is_ldap=False)

        db.session.add(lookup_model)
        for index, row in df.iterrows():
            lookup_data = LookupDataModel(lookup_file_model=lookup_model,
                                          value=row[value_column],
                                          label=row[label_column],
                                          data=row.to_dict(OrderedDict))
            db.session.add(lookup_data)
        db.session.commit()
        return lookup_model

    @staticmethod
    def _run_lookup_query(lookup_file_model, query, value, limit):
        db_query = LookupDataModel.query.filter(LookupDataModel.lookup_file_model == lookup_file_model)
        if value is not None:  # Then just find the model with that value
            db_query = db_query.filter(LookupDataModel.value == str(value))
        else:
            # Build a full text query that takes all the terms provided, executes each term as a prefix
            # query, and ORs those queries together. The order of the results is handled as a standard
            # "like" on the original string, which seems to work intuitively for most entries.
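            # For example (hypothetical input), the query "dan funk" becomes the tsquery
            # "'dan funk' | dan:* & funk:*", matching the exact phrase or both prefixes.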
            query = re.sub('[^A-Za-z0-9 ]+', '', query)  # Strip out everything but alphanumerics and spaces.
            query = re.sub(r'\s+', ' ', query)  # Collapse runs of whitespace to one space, as we split on spaces.
            query = query.strip()
            if len(query) > 0:
                if ' ' in query:
                    terms = query.split(' ')
                    new_terms = []
                    for t in terms:
                        new_terms.append("%s:*" % t)
                    new_query = ' & '.join(new_terms)
                    new_query = "'%s' | %s" % (query, new_query)
                else:
                    new_query = "%s:*" % query

                db_query = db_query.filter(
                    LookupDataModel.__ts_vector__.match(new_query, postgresql_regconfig='simple'))

            # Hackishly order by like, which does a good job of pulling more relevant matches to the top.
            db_query = db_query.order_by(desc(LookupDataModel.label.like("%" + query + "%")))

        logging.getLogger('sqlalchemy.engine').setLevel(logging.INFO)
        logging.info(db_query)
        result = db_query.limit(limit).all()
        logging.getLogger('sqlalchemy.engine').setLevel(logging.ERROR)
        result_data = [lookup_item.data for lookup_item in result]
        return result_data

    @staticmethod
    def _run_ldap_query(query, value, limit):
        if value:
            return [LdapSchema().dump(LdapService.user_info(value))]
        else:
            users = LdapService.search_users(query, limit)
            return users