better overall search results for type ahead. Still dealing with stop words failing.

This commit is contained in:
Dan Funk 2020-04-23 12:05:08 -04:00
parent 65b29e1a9d
commit b5b46b7c2c
4 changed files with 74 additions and 15 deletions

View File

@ -201,15 +201,6 @@ def lookup(workflow_id, task_id, field_id, query, limit):
if not field:
raise ApiError("unknown_field", "No field named %s in task %s" % (task_id, spiff_task.task_spec.name))
lookup_model = WorkflowService.get_lookup_table(spiff_task, field)
db_query = LookupDataModel.query.filter(LookupDataModel.lookup_file_model == lookup_model)
query = query.strip()
if(len(query) > 1):
if(' ' in query):
query = ':* ||'.join(query.split(' '))
db_query = db_query.filter(LookupDataModel.label.match("%s:*" % query)).limit(limit)
else:
db_query = db_query.filter(LookupDataModel.label.match("%s:*" % query)).limit(limit)
return LookupDataSchema(many=True).dump(db_query.all())
lookup_table = WorkflowService.get_lookup_table(spiff_task, field)
lookup_data = WorkflowService.run_lookup_query(lookup_table.id, query, limit)
return LookupDataSchema(many=True).dump(lookup_data)

View File

@ -127,7 +127,7 @@ class LookupDataModel(db.Model):
__table_args__ = (
Index(
'ix_lookupdata_tsv',
func.to_tsvector('english', label),
func.to_tsvector('simple', label), # Use simple, not english to keep stop words in place.
postgresql_using='gin'
),
)

View File

@ -5,6 +5,7 @@ from SpiffWorkflow.bpmn.workflow import BpmnWorkflow
from SpiffWorkflow.dmn.specs.BuisnessRuleTask import BusinessRuleTask
from SpiffWorkflow.specs import CancelTask, StartTask
from pandas import ExcelFile
from sqlalchemy import func
from crc import db
from crc.api.common import ApiError
@ -205,3 +206,24 @@ class WorkflowService(object):
db.session.commit()
return lookup_model
@staticmethod
def run_lookup_query(lookup_file_id, query, limit):
db_query = LookupDataModel.query.filter(LookupDataModel.lookup_file_model_id == lookup_file_id)
query = query.strip()
if len(query) > 1:
if ' ' in query:
terms = query.split(' ')
query = ""
new_terms = []
for t in terms:
new_terms.append(t + ":*")
query = '|'.join(new_terms)
else:
query = "%s:*" % query
db_query = db_query.filter(LookupDataModel.label.match(query))
# db_query = db_query.filter(text("lookup_data.label @@ to_tsquery('simple', '%s')" % query))
return db_query.limit(limit).all()

View File

@ -129,5 +129,51 @@ class TestWorkflowService(BaseTest):
self.assertIsNotNone(lookup_records)
self.assertEqual(2, len(lookup_records))
def test_some_queries(self):
pass
def test_some_full_text_queries(self):
self.load_test_spec('enum_options_from_file')
file_model = session.query(FileModel).filter(FileModel.name == "customer_list.xls").first()
file_data_model = session.query(FileDataModel).filter(FileDataModel.file_model == file_model).first()
lookup_table = WorkflowService._get_lookup_table_from_data_model(file_data_model, "CUSTOMER_NUMBER", "CUSTOMER_NAME")
lookup_data = session.query(LookupDataModel).filter(LookupDataModel.lookup_file_model == lookup_table).all()
results = WorkflowService.run_lookup_query(lookup_table.id, "medicines", limit=10)
self.assertEquals(1, len(results), "words in the middle of label are detected.")
self.assertEquals("The Medicines Company", results[0].label)
results = WorkflowService.run_lookup_query(lookup_table.id, "", limit=10)
self.assertEquals(10, len(results), "Blank queries return everything, to the limit")
results = WorkflowService.run_lookup_query(lookup_table.id, "UVA", limit=10)
self.assertEquals(1, len(results), "Beginning of label is found.")
self.assertEquals("UVA - INTERNAL - GM USE ONLY", results[0].label)
results = WorkflowService.run_lookup_query(lookup_table.id, "uva", limit=10)
self.assertEquals(1, len(results), "case does not matter.")
self.assertEquals("UVA - INTERNAL - GM USE ONLY", results[0].label)
results = WorkflowService.run_lookup_query(lookup_table.id, "medici", limit=10)
self.assertEquals(1, len(results), "partial words are picked up.")
self.assertEquals("The Medicines Company", results[0].label)
results = WorkflowService.run_lookup_query(lookup_table.id, "Genetics Savings", limit=10)
self.assertEquals(1, len(results), "multiple terms are picked up..")
self.assertEquals("Genetics Savings & Clone, Inc.", results[0].label)
results = WorkflowService.run_lookup_query(lookup_table.id, "Genetics Sav", limit=10)
self.assertEquals(1, len(results), "prefix queries still work with partial terms")
self.assertEquals("Genetics Savings & Clone, Inc.", results[0].label)
results = WorkflowService.run_lookup_query(lookup_table.id, "Gen Sav", limit=10)
self.assertEquals(1, len(results), "prefix queries still work with ALL the partial terms")
self.assertEquals("Genetics Savings & Clone, Inc.", results[0].label)
results = WorkflowService.run_lookup_query(lookup_table.id, "Inc", limit=10)
self.assertEquals(7, len(results), "short terms get multiple correct results.")
self.assertEquals("Genetics Savings & Clone, Inc.", results[0].label)
# Fixme: Stop words are taken into account on the query side, and haven't found a fix yet.
#results = WorkflowService.run_lookup_query(lookup_table.id, "in", limit=10)
#self.assertEquals(7, len(results), "stop words are not removed.")
#self.assertEquals("Genetics Savings & Clone, Inc.", results[0].label)