import logging
import math
import re
from collections import OrderedDict
from zipfile import BadZipFile

import pandas as pd
from pandas import ExcelFile
from pandas._libs.missing import NA
from sqlalchemy import desc
from sqlalchemy.sql.functions import GenericFunction

from crc import db
from crc.api.common import ApiError
from crc.models.api_models import Task
from crc.models.file import LookupFileModel, LookupDataModel
from crc.models.ldap import LdapSchema
from crc.models.workflow import WorkflowModel
from crc.services.spec_file_service import SpecFileService
from crc.services.reference_file_service import ReferenceFileService
from crc.services.ldap_service import LdapService
from crc.services.workflow_processor import WorkflowProcessor
from crc.services.workflow_spec_service import WorkflowSpecService


class TSRank(GenericFunction):
    """Exposes PostgreSQL's ts_rank() full-text ranking function to SQLAlchemy queries."""
    package = 'full_text'
    name = 'ts_rank'


class LookupService(object):
    """Provides tools for doing lookups for auto-complete fields, and rapid access to any
    uploaded spreadsheets.
    This can currently take three forms:

    1) Lookup from spreadsheet data associated with a workflow specification,
       in which case we store the spreadsheet data in a lookup table with full
       text indexing enabled, and run searches against that table.
    2) Lookup from spreadsheet data associated with a specific file. This allows us
       to get a lookup model for a specific file object, such as a reference file.
    3) Lookup from LDAP records, in which case we call out to an external service
       to pull back detailed records and return them.

    I could imagine this growing to include other external services as tools to handle
    lookup fields. I could also imagine using some sort of local cache so we don't
    unnecessarily pound on external services for repeat searches for the same records.
    """

    @staticmethod
    def get_lookup_model(spiff_task, field):
        workflow_id = spiff_task.workflow.data[WorkflowProcessor.WORKFLOW_ID_KEY]
        workflow = db.session.query(WorkflowModel).filter(WorkflowModel.id == workflow_id).first()
        return LookupService.__get_lookup_model(workflow, spiff_task.task_spec.name, field.id)
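
    # Illustrative call for a reference file (a sketch; the file and column
    # names below are hypothetical):
    #
    #     model = LookupService.get_lookup_model_for_reference(
    #         'investigators.xlsx', 'NETBADGE_ID', 'DISPLAY_NAME')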
    @staticmethod
    def get_lookup_model_for_reference(file_name, value_column, label_column):
        timestamp = ReferenceFileService().timestamp(file_name)
        lookup_model = db.session.query(LookupFileModel). \
            filter(LookupFileModel.file_name == file_name). \
            filter(LookupFileModel.workflow_spec_id == None). \
            filter(LookupFileModel.file_timestamp == timestamp). \
            first()  # Use "==" here, not "is None": "is None" does NOT work in a SQLAlchemy filter, and makes this query constantly expensive.
        if not lookup_model:
            logging.warning("!!!! Making a very expensive call to update the lookup model.")
            file_data = ReferenceFileService().get_data(file_name)
            lookup_model = LookupService.build_lookup_table(file_name, file_data, timestamp,
                                                            value_column, label_column)
        return lookup_model

    @staticmethod
    def __get_lookup_model(workflow, task_spec_id, field_id):
        lookup_model = db.session.query(LookupFileModel) \
            .filter(LookupFileModel.workflow_spec_id == workflow.workflow_spec_id) \
            .filter(LookupFileModel.field_id == field_id) \
            .filter(LookupFileModel.task_spec_id == task_spec_id) \
            .order_by(desc(LookupFileModel.id)).first()

        # The above may return a model; if it does, it might still be out of date.
        # We need to check the file date to assure we have the most recent file.
        is_current = False
        if lookup_model:
            if lookup_model.is_ldap:  # LDAP is always current
                is_current = True
            elif lookup_model.file_name is not None and lookup_model.file_timestamp is not None:
                # In some legacy cases, the lookup model might exist but not have a file name,
                # in which case we need to rebuild.
                workflow_spec = WorkflowSpecService().get_spec(workflow.workflow_spec_id)
                timestamp = SpecFileService.timestamp(workflow_spec, lookup_model.file_name)
                # Assure we have the same timestamp. Storage in the database might create
                # slight variations in the floating point values, so just assure the values
                # match to within a second.
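                # e.g. (illustrative numbers) a stored timestamp of 1645193971.000123
                # and a freshly computed 1645193971.000124 differ by far less than a
                # second, so int() of the difference is 0 and the model counts as current.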
                is_current = int(timestamp - lookup_model.file_timestamp) == 0

        if not is_current:
            # Very, very expensive, but we don't know we need this till we do.
            logging.warning("!!!! Making a very expensive call to update the lookup models.")
            lookup_model = LookupService.create_lookup_model(workflow, task_spec_id, field_id)

        return lookup_model

    @staticmethod
    def lookup(workflow, task_spec_id, field_id, query, value=None, limit=10):
        # Returns a list of dictionaries
        lookup_model = LookupService.__get_lookup_model(workflow, task_spec_id, field_id)

        if lookup_model.is_ldap:
            return LookupService._run_ldap_query(query, value, limit)
        else:
            return LookupService._run_lookup_query(lookup_model, query, value, limit)

    @staticmethod
    def create_lookup_model(workflow_model, task_spec_id, field_id):
        """
        This is all really expensive, but should happen just once (per file change).
        Checks to see if the options are provided in a separate lookup table associated
        with the workflow, and if so, assures that data exists in the database, and
        returns a model that can be used to locate that data.
        Returns: a LookupFileModel, through which the lookup data can be located and
        returned to the API.
        """
        processor = WorkflowProcessor(workflow_model)  # VERY expensive, ludicrous for lookup / type-ahead
        spec, field = processor.find_spec_and_field(task_spec_id, field_id)

        # Clear out all existing lookup models for this workflow and field.
        existing_models = db.session.query(LookupFileModel) \
            .filter(LookupFileModel.workflow_spec_id == workflow_model.workflow_spec_id) \
            .filter(LookupFileModel.task_spec_id == task_spec_id) \
            .filter(LookupFileModel.field_id == field_id).all()
        for model in existing_models:  # Do it one at a time to cause the required cascade of deletes.
            db.session.delete(model)

        # Use the contents of a file to populate enum field options
        if field.has_property(Task.FIELD_PROP_SPREADSHEET_NAME):
            if not (field.has_property(Task.FIELD_PROP_VALUE_COLUMN) or
                    field.has_property(Task.FIELD_PROP_LABEL_COLUMN)):
                raise ApiError.from_task_spec("invalid_enum",
                                              "For enumerations based on a spreadsheet file, you must include "
                                              "3 properties: %s, %s, and %s" % (Task.FIELD_PROP_SPREADSHEET_NAME,
                                                                                Task.FIELD_PROP_VALUE_COLUMN,
                                                                                Task.FIELD_PROP_LABEL_COLUMN),
                                              task_spec=spec)

            # Get the file data from the File Service
            file_name = field.get_property(Task.FIELD_PROP_SPREADSHEET_NAME)
            value_column = field.get_property(Task.FIELD_PROP_VALUE_COLUMN)
            label_column = field.get_property(Task.FIELD_PROP_LABEL_COLUMN)
            # TODO: workflow_model does not have a workflow_spec. It has a workflow_spec_id
            workflow_spec = WorkflowSpecService().get_spec(workflow_model.workflow_spec_id)
            latest_files = SpecFileService().get_files(workflow_spec, file_name=file_name)
            if len(latest_files) < 1:
                raise ApiError("invalid_enum", "Unable to locate the lookup data file '%s'" % file_name)
            else:
                file = latest_files[0]

            file_data = SpecFileService().get_data(workflow_spec, file_name)
            timestamp = SpecFileService.timestamp(workflow_spec, file_name)

            lookup_model = LookupService.build_lookup_table(file_name, file_data, timestamp,
                                                            value_column, label_column,
                                                            workflow_model.workflow_spec_id,
                                                            task_spec_id, field_id)

        # Use the results of an LDAP request to populate enum field options
        elif field.has_property(Task.FIELD_PROP_LDAP_LOOKUP):
            lookup_model = LookupFileModel(workflow_spec_id=workflow_model.workflow_spec_id,
                                           task_spec_id=task_spec_id,
                                           field_id=field_id,
                                           is_ldap=True)

        else:
            raise ApiError.from_task_spec("unknown_lookup_option",
                                          "Lookup supports using spreadsheet or LDAP options, "
                                          "and neither of those was provided.", spec)

        db.session.add(lookup_model)
        db.session.commit()
        return lookup_model

    @staticmethod
    def build_lookup_table(file_name, file_data, timestamp, value_column, label_column,
                           workflow_spec_id=None, task_spec_id=None, field_id=None):
        """In some cases the lookup table can be very large. This method adds all values to the
        database in a form that can be searched and returned via an API call - rather than
        sending the full set of options along with the form. It will only open the file and
        process the options if something has changed."""
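        # For a spreadsheet row such as {'CUSTOMER_NUMBER': 42, 'CUSTOMER_NAME': 'Acme'}
        # (hypothetical columns), with value_column='CUSTOMER_NUMBER' and
        # label_column='CUSTOMER_NAME', each LookupDataModel row built below stores
        # value=42, label='Acme', and the full row as a dict in its data column.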
        try:
            xlsx = ExcelFile(file_data, engine='openpyxl')
        # Pandas (or at least openpyxl) cannot read old .xls files. The error comes
        # back as zipfile.BadZipFile, because .xlsx files are zipped XML files.
        except BadZipFile:
            raise ApiError(code='excel_error',
                           message=f"Error opening excel file {file_name}. You may have an older .xls spreadsheet. "
                                   f"(workflow_spec_id: {workflow_spec_id}, task_spec_id: {task_spec_id}, "
                                   f"and field_id: {field_id})")
        df = xlsx.parse(xlsx.sheet_names[0])  # Currently we only look at the first sheet.
        df = df.convert_dtypes()
        df = df.loc[:, ~df.columns.str.contains('^Unnamed')]  # Drop unnamed columns.
        df = pd.DataFrame(df).dropna(how='all')  # Drop rows that are entirely null.
        for (column_name, column_data) in df.items():  # Note: iteritems() was removed in pandas 2.0; items() is the equivalent.
            data_type = df.dtypes[column_name].name
            if data_type == 'string':
                df[column_name] = df[column_name].fillna('')
            else:
                df[column_name] = df[column_name].fillna(0)

        if value_column not in df:
            raise ApiError("invalid_enum",
                           "The file %s does not contain a column named %s" % (file_name, value_column))
        if label_column not in df:
            raise ApiError("invalid_enum",
                           "The file %s does not contain a column named %s" % (file_name, label_column))

        lookup_model = LookupFileModel(workflow_spec_id=workflow_spec_id,
                                       field_id=field_id,
                                       task_spec_id=task_spec_id,
                                       file_name=file_name,
                                       file_timestamp=timestamp,
                                       is_ldap=False)
        db.session.add(lookup_model)
        for index, row in df.iterrows():
            lookup_data = LookupDataModel(lookup_file_model=lookup_model,
                                          value=row[value_column],
                                          label=row[label_column],
                                          data=row.to_dict(OrderedDict))
            db.session.add(lookup_data)
        db.session.commit()
        return lookup_model

    @staticmethod
    def _run_lookup_query(lookup_file_model, query, value, limit):
        db_query = LookupDataModel.query.filter(LookupDataModel.lookup_file_model == lookup_file_model)
        if value is not None:  # Then just find the model with that value
            db_query = db_query.filter(LookupDataModel.value == str(value))
        else:
            # Build a full text query that executes each term provided as a prefix query, ANDs
            # those together, and ORs that with a match on the full phrase. The ordering of the
            # results is handled as a standard "LIKE" on the original string, which seems to
            # work intuitively for most entries.
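            # For example (hypothetical input), the query "pat smi" becomes the
            # tsquery "'pat smi' | pat:* & smi:*": the exact phrase, OR'd with
            # every term as a prefix.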
            query = re.sub('[^A-Za-z0-9 ]+', ' ', query)  # Strip out everything except letters, digits, and spaces.
            query = re.sub(r'\s+', ' ', query)  # Collapse runs of whitespace to a single space, as we split on spaces.
            logging.debug("Query: " + query)
            query = query.strip()
            if len(query) > 0:
                if ' ' in query:
                    terms = query.split(' ')
                    new_terms = []
                    for t in terms:
                        new_terms.append("%s:*" % t)
                    new_query = ' & '.join(new_terms)
                    new_query = "'%s' | %s" % (query, new_query)
                else:
                    new_query = "%s:*" % query

                db_query = db_query.filter(
                    LookupDataModel.__ts_vector__.match(new_query, postgresql_regconfig='simple'))
                # Hackishly order by LIKE, which does a good job of pulling more relevant matches to the top.
                db_query = db_query.order_by(desc(LookupDataModel.label.like("%" + query + "%")))

        logging.getLogger('sqlalchemy.engine').setLevel(logging.INFO)
        logging.info(db_query)
        result = db_query.limit(limit).all()
        logging.getLogger('sqlalchemy.engine').setLevel(logging.ERROR)
        result_data = list(map(lambda lookup_item: lookup_item.data, result))
        return result_data

    @staticmethod
    def _run_ldap_query(query, value, limit):
        if value:
            return [LdapSchema().dump(LdapService.user_info(value))]
        else:
            users = LdapService.search_users(query, limit)
            return users