# cr-connect-workflow/crc/services/lookup_service.py

import logging
import re
from collections import OrderedDict

import pandas as pd
from pandas import ExcelFile, np
from sqlalchemy import desc
from sqlalchemy.sql.functions import GenericFunction

from crc import db
from crc.api.common import ApiError
from crc.models.api_models import Task
from crc.models.file import FileDataModel, LookupFileModel, LookupDataModel
from crc.models.workflow import WorkflowModel, WorkflowSpecDependencyFile
from crc.services.file_service import FileService
from crc.services.ldap_service import LdapService
from crc.services.workflow_processor import WorkflowProcessor


class TSRank(GenericFunction):
    """SQLAlchemy generic-function declaration for the SQL ``ts_rank`` function.

    Declared under the ``full_text`` package.  Nothing in this part of the
    module references the class directly.
    """

    name = 'ts_rank'
    package = 'full_text'


class LookupService(object):
    """Provides tools for doing lookups for auto-complete fields.

    This can currently take two forms:
    1) Lookup from spreadsheet data associated with a workflow specification,
       in which case we store the spreadsheet data in a lookup table with full
       text indexing enabled, and run searches against that table.
    2) Lookup from LDAP records, in which case we call out to an external service
       to pull back detailed records and return them.

    I could imagine this growing to include other external services as tools to handle
    lookup fields.  I could also imagine using some sort of local cache so we don't
    unnecessarily pound on external services for repeat searches for the same records.
    """

    @staticmethod
    def get_lookup_model(spiff_task, field):
        """Return the lookup model for ``field`` on the given task.

        The workflow id is read from ``spiff_task.workflow.data`` under
        ``WorkflowProcessor.WORKFLOW_ID_KEY``; the task spec name and the field id
        identify which lookup model to load (or build, if none is current).
        """
        workflow_id = spiff_task.workflow.data[WorkflowProcessor.WORKFLOW_ID_KEY]
        workflow = db.session.query(WorkflowModel).filter(WorkflowModel.id == workflow_id).first()
        return LookupService.__get_lookup_model(workflow, spiff_task.task_spec.name, field.id)

    @staticmethod
    def __get_lookup_model(workflow, task_spec_id, field_id):
        """Find the newest lookup model for this spec/task/field, rebuilding it if stale.

        A spreadsheet-backed model is considered current only while its file data
        is still listed as a dependency of ``workflow``.  An LDAP-backed model has
        no file behind it and is always considered current.
        """
        lookup_model = db.session.query(LookupFileModel) \
            .filter(LookupFileModel.workflow_spec_id == workflow.workflow_spec_id) \
            .filter(LookupFileModel.field_id == field_id) \
            .filter(LookupFileModel.task_spec_id == task_spec_id) \
            .order_by(desc(LookupFileModel.id)).first()

        is_current = False
        if lookup_model:
            if lookup_model.is_ldap:
                # LDAP results are fetched live on every search, so there is no
                # stored file data that could go out of date.
                # NOTE(review): a field switched from LDAP to a spreadsheet would
                # keep resolving to the LDAP model until it is removed - confirm
                # this is acceptable for spec edits.
                is_current = True
            else:
                # One more quick query, to see if the lookup file is still related
                # to this workflow.  If not, we need to rebuild the lookup table.
                is_current = db.session.query(WorkflowSpecDependencyFile) \
                    .filter(WorkflowSpecDependencyFile.file_data_id == lookup_model.file_data_model_id) \
                    .filter(WorkflowSpecDependencyFile.workflow_id == workflow.id) \
                    .count() > 0

        if not is_current:
            # Very, very expensive - but we don't know we need this until we do.
            logging.warning("!!!! Making a very expensive call to update the lookup models.")
            lookup_model = LookupService.create_lookup_model(workflow, task_spec_id, field_id)

        return lookup_model

    @staticmethod
    def lookup(workflow, task_spec_id, field_id, query, value=None, limit=10):
        """Run a lookup for the given field and return the matching options.

        LDAP-backed fields are answered by ``_run_ldap_query`` (``value`` is not
        used there); spreadsheet-backed fields by ``_run_lookup_query``, where a
        non-None ``value`` selects by exact value instead of searching ``query``.
        At most ``limit`` results are requested.
        """
        lookup_model = LookupService.__get_lookup_model(workflow, task_spec_id, field_id)

        if lookup_model.is_ldap:
            return LookupService._run_ldap_query(query, limit)
        else:
            return LookupService._run_lookup_query(lookup_model, query, value, limit)

    @staticmethod
    def create_lookup_model(workflow_model, task_spec_id, field_id):
        """
        This is all really expensive, but should happen just once (per file change).

        Checks to see if the options are provided in a separate lookup table associated with the workflow, and if so,
        assures that data exists in the database, and returns a model that can be used to locate that data.

        Any lookup models previously stored for the same workflow spec, task spec and field are deleted first.

        Returns:  the newly created (and committed) LookupFileModel.
        Raises:   ApiError if the spreadsheet properties are incomplete, the spreadsheet file cannot be found,
                  or the field declares neither a spreadsheet nor an LDAP lookup.
        """
        processor = WorkflowProcessor(workflow_model)  # VERY expensive, Ludicrous for lookup / type ahead
        spec, field = processor.find_spec_and_field(task_spec_id, field_id)

        # Clear out all existing lookup models for this workflow and field.
        existing_models = db.session.query(LookupFileModel) \
            .filter(LookupFileModel.workflow_spec_id == workflow_model.workflow_spec_id) \
            .filter(LookupFileModel.task_spec_id == task_spec_id) \
            .filter(LookupFileModel.field_id == field_id).all()
        for model in existing_models:  # Do it one at a time to cause the required cascade of deletes.
            db.session.delete(model)

        #  Use the contents of a file to populate enum field options
        if field.has_property(Task.FIELD_PROP_SPREADSHEET_NAME):
            # Both the value column and the label column are required; having
            # only one of them is just as unusable as having neither.
            if not (field.has_property(Task.FIELD_PROP_SPREADSHEET_VALUE_COLUMN) and
                    field.has_property(Task.FIELD_PROP_SPREADSHEET_LABEL_COLUMN)):
                raise ApiError.from_task_spec("invalid_enum",
                                         "For enumerations based on an xls file, you must include 3 properties: %s, "
                                         "%s, and %s" % (Task.FIELD_PROP_SPREADSHEET_NAME,
                                                         Task.FIELD_PROP_SPREADSHEET_VALUE_COLUMN,
                                                         Task.FIELD_PROP_SPREADSHEET_LABEL_COLUMN),
                                         task_spec=spec)

            # Get the file data from the File Service
            file_name = field.get_property(Task.FIELD_PROP_SPREADSHEET_NAME)
            value_column = field.get_property(Task.FIELD_PROP_SPREADSHEET_VALUE_COLUMN)
            label_column = field.get_property(Task.FIELD_PROP_SPREADSHEET_LABEL_COLUMN)
            latest_files = FileService.get_spec_data_files(workflow_spec_id=workflow_model.workflow_spec_id,
                                                           workflow_id=workflow_model.id,
                                                           name=file_name)
            if len(latest_files) < 1:
                raise ApiError("invalid_enum", "Unable to locate the lookup data file '%s'" % file_name)
            data_model = latest_files[0]

            lookup_model = LookupService.build_lookup_table(data_model, value_column, label_column,
                                                            workflow_model.workflow_spec_id, task_spec_id, field_id)

        #  Use the results of an LDAP request to populate enum field options
        elif field.has_property(Task.FIELD_PROP_LDAP_LOOKUP):
            # task_spec_id must be stored: both the lookup query and the cleanup
            # above filter on it, so a model saved without it can never be found
            # again or deleted.
            lookup_model = LookupFileModel(workflow_spec_id=workflow_model.workflow_spec_id,
                                           task_spec_id=task_spec_id,
                                           field_id=field_id,
                                           is_ldap=True)

        else:
            raise ApiError.from_task_spec("unknown_lookup_option",
                           "Lookup supports using spreadsheet or LDAP options, "
                           "and neither of those was provided.", spec)
        db.session.add(lookup_model)
        db.session.commit()
        return lookup_model

    @staticmethod
    def build_lookup_table(data_model: "FileDataModel", value_column, label_column, workflow_spec_id, task_spec_id, field_id):
        """ In some cases the lookup table can be very large.  This method will add all values to the database
        in a way that can be searched and returned via an api call - rather than sending the full set of
        options along with the form.

        Reads the first sheet of the spreadsheet held in ``data_model.data``, stores one LookupDataModel per row
        (value, label, and the whole row as ``data``) under a new LookupFileModel, commits, and returns that
        LookupFileModel.

        Raises ApiError if ``value_column`` or ``label_column`` is not a column of the sheet.
        """
        # Close the workbook as soon as the sheet has been parsed.
        with ExcelFile(data_model.data) as xls:
            df = xls.parse(xls.sheet_names[0])  # Currently we only look at the first sheet.
        # Swap NaN for None so that empty cells are stored as nulls.
        df = pd.DataFrame(df).replace({np.nan: None})

        for required_column in (value_column, label_column):
            if required_column not in df:
                raise ApiError("invalid_enum",
                               "The file %s does not contain a column named % s" % (data_model.file_model.name,
                                                                                    required_column))

        lookup_model = LookupFileModel(workflow_spec_id=workflow_spec_id,
                                       field_id=field_id,
                                       task_spec_id=task_spec_id,
                                       file_data_model_id=data_model.id,
                                       is_ldap=False)

        db.session.add(lookup_model)
        for _, row in df.iterrows():
            db.session.add(LookupDataModel(lookup_file_model=lookup_model,
                                           value=row[value_column],
                                           label=row[label_column],
                                           data=row.to_dict(OrderedDict)))
        db.session.commit()
        return lookup_model

    @staticmethod
    def _build_ts_query(query):
        """Sanitize a user search string and turn it into a full text query string.

        Everything except ASCII letters, digits and spaces is removed, runs of
        spaces are collapsed, and the result is stripped.

        Returns a ``(cleaned, ts_query)`` tuple:
          * nothing left after cleaning -> ``("", None)``
          * a single term ``foo``       -> ``("foo", "foo:*")``
          * several terms ``foo bar``   -> ``("foo bar", "'foo bar' | foo:* & bar:*")``
            i.e. the exact phrase, OR every term as a prefix match.
        """
        cleaned = re.sub('[^A-Za-z0-9 ]+', '', query)  # Keep only letters, digits and spaces.
        cleaned = re.sub(r'\s+', ' ', cleaned).strip()  # One space between terms, as we split on spaces.
        if not cleaned:
            return cleaned, None
        if ' ' not in cleaned:
            return cleaned, "%s:*" % cleaned
        prefix_terms = ' & '.join("%s:*" % term for term in cleaned.split(' '))
        return cleaned, "'%s' | %s" % (cleaned, prefix_terms)

    @staticmethod
    def _run_lookup_query(lookup_file_model, query, value, limit):
        """Search the stored rows of ``lookup_file_model``.

        When ``value`` is not None only rows with exactly that value are selected
        and ``query`` is ignored.  Otherwise ``query`` is sanitized and run as a
        full text prefix search; if nothing searchable remains, no text filter is
        applied.  Returns at most ``limit`` LookupDataModel rows.
        """
        db_query = LookupDataModel.query.filter(LookupDataModel.lookup_file_model == lookup_file_model)
        if value is not None:  # Then just find the model with that value
            db_query = db_query.filter(LookupDataModel.value == value)
        else:
            cleaned, ts_query = LookupService._build_ts_query(query)
            logging.debug("Query: %s", cleaned)
            if ts_query is not None:
                db_query = db_query.filter(
                    LookupDataModel.__ts_vector__.match(ts_query, postgresql_regconfig='simple'))

                # Hackishly order by like, which does a good job of pulling more relevant matches to the top.
                db_query = db_query.order_by(desc(LookupDataModel.label.like("%" + cleaned + "%")))

        # Raise SQL logging to INFO for the duration of this query only, then put
        # back whatever level was configured - even if the query fails.
        sql_logger = logging.getLogger('sqlalchemy.engine')
        previous_level = sql_logger.level
        sql_logger.setLevel(logging.INFO)
        try:
            logging.info(db_query)
            return db_query.limit(limit).all()
        finally:
            sql_logger.setLevel(previous_level)

    @staticmethod
    def _run_ldap_query(query, limit):
        """Search LDAP users and shape the results like lookup data.

        Converts the user models into something akin to the
        LookupModel in models/file.py, so this can be returned in the same way
        we return a lookup data model: a list of dicts with ``value`` (the uid),
        ``label`` ("display name (uid)") and ``data`` (the full user record).
        """
        users = LdapService.search_users(query, limit)
        return [{"value": user['uid'],
                 "label": user['display_name'] + " (" + user['uid'] + ")",
                 "data": user}
                for user in users]