Better overall search results for type-ahead. Still dealing with stop words failing.

This commit is contained in:
Dan Funk 2020-04-23 12:05:08 -04:00
parent 65b29e1a9d
commit b5b46b7c2c
4 changed files with 74 additions and 15 deletions

View File

@ -201,15 +201,6 @@ def lookup(workflow_id, task_id, field_id, query, limit):
if not field: if not field:
raise ApiError("unknown_field", "No field named %s in task %s" % (task_id, spiff_task.task_spec.name)) raise ApiError("unknown_field", "No field named %s in task %s" % (task_id, spiff_task.task_spec.name))
lookup_model = WorkflowService.get_lookup_table(spiff_task, field) lookup_table = WorkflowService.get_lookup_table(spiff_task, field)
db_query = LookupDataModel.query.filter(LookupDataModel.lookup_file_model == lookup_model) lookup_data = WorkflowService.run_lookup_query(lookup_table.id, query, limit)
return LookupDataSchema(many=True).dump(lookup_data)
query = query.strip()
if(len(query) > 1):
if(' ' in query):
query = ':* ||'.join(query.split(' '))
db_query = db_query.filter(LookupDataModel.label.match("%s:*" % query)).limit(limit)
else:
db_query = db_query.filter(LookupDataModel.label.match("%s:*" % query)).limit(limit)
return LookupDataSchema(many=True).dump(db_query.all())

View File

@ -127,7 +127,7 @@ class LookupDataModel(db.Model):
__table_args__ = ( __table_args__ = (
Index( Index(
'ix_lookupdata_tsv', 'ix_lookupdata_tsv',
func.to_tsvector('english', label), func.to_tsvector('simple', label), # Use simple, not english to keep stop words in place.
postgresql_using='gin' postgresql_using='gin'
), ),
) )

View File

@ -5,6 +5,7 @@ from SpiffWorkflow.bpmn.workflow import BpmnWorkflow
from SpiffWorkflow.dmn.specs.BuisnessRuleTask import BusinessRuleTask from SpiffWorkflow.dmn.specs.BuisnessRuleTask import BusinessRuleTask
from SpiffWorkflow.specs import CancelTask, StartTask from SpiffWorkflow.specs import CancelTask, StartTask
from pandas import ExcelFile from pandas import ExcelFile
from sqlalchemy import func
from crc import db from crc import db
from crc.api.common import ApiError from crc.api.common import ApiError
@ -205,3 +206,24 @@ class WorkflowService(object):
db.session.commit() db.session.commit()
return lookup_model return lookup_model
@staticmethod
def run_lookup_query(lookup_file_id, query, limit):
    """Return up to `limit` LookupDataModel rows for a type-ahead search.

    Rows are restricted to the lookup file identified by `lookup_file_id`.
    When the stripped query is longer than one character, every
    whitespace-separated term becomes a prefix term ("term:*") and the
    terms are OR-ed together ("a:*|b:*") before being matched against the
    label. Shorter queries (including blank ones) apply no label filter,
    so any rows of the lookup file are returned, up to `limit`.

    Known limitation: stop words are still dropped on the query side.
    """
    db_query = LookupDataModel.query.filter(LookupDataModel.lookup_file_model_id == lookup_file_id)
    query = query.strip()
    if len(query) > 1:
        # split() with no argument splits on any run of whitespace, so
        # doubled spaces or tabs can never yield an empty ":*" term, which
        # the full-text query parser would reject.
        terms = query.split()
        ts_query = '|'.join("%s:*" % term for term in terms)
        # NOTE(review): characters that are tsquery operators (& | ! ( ) :)
        # are passed through unescaped -- confirm how callers want such
        # input handled before relying on arbitrary user text here.
        db_query = db_query.filter(LookupDataModel.label.match(ts_query))
    return db_query.limit(limit).all()

View File

@ -129,5 +129,51 @@ class TestWorkflowService(BaseTest):
self.assertIsNotNone(lookup_records) self.assertIsNotNone(lookup_records)
self.assertEqual(2, len(lookup_records)) self.assertEqual(2, len(lookup_records))
def test_some_full_text_queries(self):
    """Exercise WorkflowService.run_lookup_query against customer_list.xls.

    Covers mid-label words, blank queries, case-insensitivity, partial
    (prefix) terms, multi-term queries and short terms that hit many rows.
    """
    self.load_test_spec('enum_options_from_file')
    file_model = session.query(FileModel).filter(FileModel.name == "customer_list.xls").first()
    file_data_model = session.query(FileDataModel).filter(FileDataModel.file_model == file_model).first()
    lookup_table = WorkflowService._get_lookup_table_from_data_model(file_data_model, "CUSTOMER_NUMBER", "CUSTOMER_NAME")

    results = WorkflowService.run_lookup_query(lookup_table.id, "medicines", limit=10)
    self.assertEqual(1, len(results), "words in the middle of label are detected.")
    self.assertEqual("The Medicines Company", results[0].label)

    results = WorkflowService.run_lookup_query(lookup_table.id, "", limit=10)
    self.assertEqual(10, len(results), "Blank queries return everything, to the limit")

    results = WorkflowService.run_lookup_query(lookup_table.id, "UVA", limit=10)
    self.assertEqual(1, len(results), "Beginning of label is found.")
    self.assertEqual("UVA - INTERNAL - GM USE ONLY", results[0].label)

    results = WorkflowService.run_lookup_query(lookup_table.id, "uva", limit=10)
    self.assertEqual(1, len(results), "case does not matter.")
    self.assertEqual("UVA - INTERNAL - GM USE ONLY", results[0].label)

    results = WorkflowService.run_lookup_query(lookup_table.id, "medici", limit=10)
    self.assertEqual(1, len(results), "partial words are picked up.")
    self.assertEqual("The Medicines Company", results[0].label)

    results = WorkflowService.run_lookup_query(lookup_table.id, "Genetics Savings", limit=10)
    self.assertEqual(1, len(results), "multiple terms are picked up..")
    self.assertEqual("Genetics Savings & Clone, Inc.", results[0].label)

    results = WorkflowService.run_lookup_query(lookup_table.id, "Genetics Sav", limit=10)
    self.assertEqual(1, len(results), "prefix queries still work with partial terms")
    self.assertEqual("Genetics Savings & Clone, Inc.", results[0].label)

    results = WorkflowService.run_lookup_query(lookup_table.id, "Gen Sav", limit=10)
    self.assertEqual(1, len(results), "prefix queries still work with ALL the partial terms")
    self.assertEqual("Genetics Savings & Clone, Inc.", results[0].label)

    results = WorkflowService.run_lookup_query(lookup_table.id, "Inc", limit=10)
    self.assertEqual(7, len(results), "short terms get multiple correct results.")
    self.assertEqual("Genetics Savings & Clone, Inc.", results[0].label)

    # FIXME: Stop words are taken into account on the query side, and no fix
    # has been found yet. Re-enable once a query such as "in" is handled.
    # results = WorkflowService.run_lookup_query(lookup_table.id, "in", limit=10)
    # self.assertEqual(7, len(results), "stop words are not removed.")
    # self.assertEqual("Genetics Savings & Clone, Inc.", results[0].label)