Merge pull request #183 from sartography/bug/109_lookup_stopwords

Bug/109 lookup stopwords
This commit is contained in:
Dan Funk 2020-08-13 20:45:32 -04:00 committed by GitHub
commit ec2d1dcefb
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 22 additions and 22 deletions

View File

@@ -166,10 +166,12 @@ class LookupDataModel(db.Model):
# query with:
# search_results = LookupDataModel.query.filter(LookupDataModel.label.match("INTERNAL")).all()
__ts_vector__ = func.to_tsvector('simple', label)
__table_args__ = (
Index(
'ix_lookupdata_tsv',
func.to_tsvector('simple', label), # Use simple, not english to keep stop words in place.
__ts_vector__, # Use simple, not english to keep stop words in place.
postgresql_using='gin'
),
)

View File

@@ -181,20 +181,22 @@ class LookupService(object):
if len(query) > 0:
if ' ' in query:
terms = query.split(' ')
new_terms = ["'%s'" % query]
new_terms = []
for t in terms:
new_terms.append("%s:*" % t)
new_query = ' | '.join(new_terms)
new_query = ' & '.join(new_terms)
new_query = "'%s' | %s" % (query, new_query)
else:
new_query = "%s:*" % query
# Run the full text query
db_query = db_query.filter(LookupDataModel.label.match(new_query))
# But hackishly order by like, which does a good job of
# pulling more relevant matches to the top.
db_query = db_query.filter(
LookupDataModel.__ts_vector__.match(new_query, postgresql_regconfig='simple'))
# Hackishly order by like, which does a good job of pulling more relevant matches to the top.
db_query = db_query.order_by(desc(LookupDataModel.label.like("%" + query + "%")))
logging.getLogger('sqlalchemy.engine').setLevel(logging.INFO)
logging.info(db_query)
result = db_query.limit(limit).all()
logging.getLogger('sqlalchemy.engine').setLevel(logging.ERROR)
return result

View File

@@ -114,11 +114,11 @@ class TestLookupService(BaseTest):
self.assertEqual("Genetics Savings & Clone, Inc.", results[0].label)
results = LookupService.lookup(workflow, "AllTheNames", "reaction design", limit=10)
self.assertEqual(5, len(results), "all results come back for two terms.")
self.assertEqual(3, len(results), "all results come back for two terms.")
self.assertEqual("Reaction Design", results[0].label, "Exact matches come first.")
results = LookupService.lookup(workflow, "AllTheNames", "1 Something", limit=10)
self.assertEqual("1 Something", results[0].label, "Exact matches are prefered")
self.assertEqual("1 Something", results[0].label, "Exact matches are preferred")
results = LookupService.lookup(workflow, "AllTheNames", "1 (!-Something", limit=10)
self.assertEqual("1 Something", results[0].label, "special characters don't flake out")
@@ -126,16 +126,12 @@ class TestLookupService(BaseTest):
results = LookupService.lookup(workflow, "AllTheNames", "1 Something", limit=10)
self.assertEqual("1 Something", results[0].label, "double spaces should not be an issue.")
results = LookupService.lookup(workflow, "AllTheNames", "in", limit=10)
self.assertEqual(10, len(results), "stop words are not removed.")
self.assertEqual("Genetics Savings & Clone, Inc.", results[0].label)
results = LookupService.lookup(workflow, "AllTheNames", "other", limit=10)
self.assertEqual("Other", results[0].label, "Can't find the word 'other', which is an english stop word")
# 1018 10000 Something Industry
# 1019 1000 Something Industry
# 1020 1 Something Industry
# 1021 10 Something Industry
# 1022 10000 Something Industry
# Fixme: Stop words are taken into account on the query side, and haven't found a fix yet.
#results = WorkflowService.run_lookup_query(lookup_table.id, "in", limit=10)
#self.assertEqual(7, len(results), "stop words are not removed.")
#self.assertEqual("Genetics Savings & Clone, Inc.", results[0].label)

View File

@@ -75,9 +75,9 @@ class TestWorkflowService(BaseTest):
task = processor.next_task()
WorkflowService.process_options(task, task.task_spec.form.fields[0])
options = task.task_spec.form.fields[0].options
self.assertEqual(28, len(options))
self.assertEqual('1000', options[0]['id'])
self.assertEqual("UVA - INTERNAL - GM USE ONLY", options[0]['name'])
self.assertEqual(29, len(options))
self.assertEqual('0', options[0]['id'])
self.assertEqual("Other", options[0]['name'])
def test_random_data_populate_form_on_auto_complete(self):
self.load_example_data()