From b5b46b7c2c4f7d71b1b7c8add0bfd7ca9f6189a9 Mon Sep 17 00:00:00 2001 From: Dan Funk Date: Thu, 23 Apr 2020 12:05:08 -0400 Subject: [PATCH] better overall search results for type ahead. Still dealing with stop words failing. --- crc/api/workflow.py | 15 ++-------- crc/models/file.py | 2 +- crc/services/workflow_service.py | 22 ++++++++++++++ tests/test_workflow_service.py | 50 ++++++++++++++++++++++++++++++-- 4 files changed, 74 insertions(+), 15 deletions(-) diff --git a/crc/api/workflow.py b/crc/api/workflow.py index 7002e479..7f661e38 100644 --- a/crc/api/workflow.py +++ b/crc/api/workflow.py @@ -201,15 +201,6 @@ def lookup(workflow_id, task_id, field_id, query, limit): if not field: raise ApiError("unknown_field", "No field named %s in task %s" % (task_id, spiff_task.task_spec.name)) - lookup_model = WorkflowService.get_lookup_table(spiff_task, field) - db_query = LookupDataModel.query.filter(LookupDataModel.lookup_file_model == lookup_model) - - query = query.strip() - if(len(query) > 1): - if(' ' in query): - query = ':* ||'.join(query.split(' ')) - db_query = db_query.filter(LookupDataModel.label.match("%s:*" % query)).limit(limit) - else: - db_query = db_query.filter(LookupDataModel.label.match("%s:*" % query)).limit(limit) - - return LookupDataSchema(many=True).dump(db_query.all()) \ No newline at end of file + lookup_table = WorkflowService.get_lookup_table(spiff_task, field) + lookup_data = WorkflowService.run_lookup_query(lookup_table.id, query, limit) + return LookupDataSchema(many=True).dump(lookup_data) \ No newline at end of file diff --git a/crc/models/file.py b/crc/models/file.py index 2472ad73..c2c2b045 100644 --- a/crc/models/file.py +++ b/crc/models/file.py @@ -127,7 +127,7 @@ class LookupDataModel(db.Model): __table_args__ = ( Index( 'ix_lookupdata_tsv', - func.to_tsvector('english', label), + func.to_tsvector('simple', label), # Use simple, not english to keep stop words in place. 
postgresql_using='gin' ), ) diff --git a/crc/services/workflow_service.py b/crc/services/workflow_service.py index f1715c18..c11b4807 100644 --- a/crc/services/workflow_service.py +++ b/crc/services/workflow_service.py @@ -5,6 +5,7 @@ from SpiffWorkflow.bpmn.workflow import BpmnWorkflow from SpiffWorkflow.dmn.specs.BuisnessRuleTask import BusinessRuleTask from SpiffWorkflow.specs import CancelTask, StartTask from pandas import ExcelFile +from sqlalchemy import func from crc import db from crc.api.common import ApiError @@ -205,3 +206,24 @@ class WorkflowService(object): db.session.commit() return lookup_model + + @staticmethod + def run_lookup_query(lookup_file_id, query, limit): + db_query = LookupDataModel.query.filter(LookupDataModel.lookup_file_model_id == lookup_file_id) + + query = query.strip() + if len(query) > 1: + if ' ' in query: + terms = query.split(' ') + query = "" + new_terms = [] + for t in terms: + new_terms.append(t + ":*") + query = '|'.join(new_terms) + else: + query = "%s:*" % query + db_query = db_query.filter(LookupDataModel.label.match(query)) + +# db_query = db_query.filter(text("lookup_data.label @@ to_tsquery('simple', '%s')" % query)) + + return db_query.limit(limit).all() \ No newline at end of file diff --git a/tests/test_workflow_service.py b/tests/test_workflow_service.py index 0b2daf69..c911fb8f 100644 --- a/tests/test_workflow_service.py +++ b/tests/test_workflow_service.py @@ -129,5 +129,51 @@ class TestWorkflowService(BaseTest): self.assertIsNotNone(lookup_records) self.assertEqual(2, len(lookup_records)) - def test_some_queries(self): - pass \ No newline at end of file + def test_some_full_text_queries(self): + self.load_test_spec('enum_options_from_file') + file_model = session.query(FileModel).filter(FileModel.name == "customer_list.xls").first() + file_data_model = session.query(FileDataModel).filter(FileDataModel.file_model == file_model).first() + lookup_table = 
WorkflowService._get_lookup_table_from_data_model(file_data_model, "CUSTOMER_NUMBER", "CUSTOMER_NAME") + lookup_data = session.query(LookupDataModel).filter(LookupDataModel.lookup_file_model == lookup_table).all() + + results = WorkflowService.run_lookup_query(lookup_table.id, "medicines", limit=10) + self.assertEquals(1, len(results), "words in the middle of label are detected.") + self.assertEquals("The Medicines Company", results[0].label) + + results = WorkflowService.run_lookup_query(lookup_table.id, "", limit=10) + self.assertEquals(10, len(results), "Blank queries return everything, to the limit") + + results = WorkflowService.run_lookup_query(lookup_table.id, "UVA", limit=10) + self.assertEquals(1, len(results), "Beginning of label is found.") + self.assertEquals("UVA - INTERNAL - GM USE ONLY", results[0].label) + + results = WorkflowService.run_lookup_query(lookup_table.id, "uva", limit=10) + self.assertEquals(1, len(results), "case does not matter.") + self.assertEquals("UVA - INTERNAL - GM USE ONLY", results[0].label) + + + + results = WorkflowService.run_lookup_query(lookup_table.id, "medici", limit=10) + self.assertEquals(1, len(results), "partial words are picked up.") + self.assertEquals("The Medicines Company", results[0].label) + + results = WorkflowService.run_lookup_query(lookup_table.id, "Genetics Savings", limit=10) + self.assertEquals(1, len(results), "multiple terms are picked up..") + self.assertEquals("Genetics Savings & Clone, Inc.", results[0].label) + + results = WorkflowService.run_lookup_query(lookup_table.id, "Genetics Sav", limit=10) + self.assertEquals(1, len(results), "prefix queries still work with partial terms") + self.assertEquals("Genetics Savings & Clone, Inc.", results[0].label) + + results = WorkflowService.run_lookup_query(lookup_table.id, "Gen Sav", limit=10) + self.assertEquals(1, len(results), "prefix queries still work with ALL the partial terms") + self.assertEquals("Genetics Savings & Clone, Inc.", results[0].label) 
+ + results = WorkflowService.run_lookup_query(lookup_table.id, "Inc", limit=10) + self.assertEquals(7, len(results), "short terms get multiple correct results.") + self.assertEquals("Genetics Savings & Clone, Inc.", results[0].label) + + # Fixme: Stop words are still removed on the query side (the index now keeps them), so stop-word searches fail; we haven't found a fix yet. + #results = WorkflowService.run_lookup_query(lookup_table.id, "in", limit=10) + #self.assertEquals(7, len(results), "stop words are not removed.") + #self.assertEquals("Genetics Savings & Clone, Inc.", results[0].label)