From 9a5c1d7cfb5efaa322ec541d1d4827c87eaa69a8 Mon Sep 17 00:00:00 2001 From: Dan Funk Date: Thu, 13 Aug 2020 18:13:41 -0400 Subject: [PATCH] I may have finally wrapped my head around full text search in python. Now properly using an index based on simple rather than english dictionary which has far fewer stop words and stemming processes and plays much better to the type ahead search we are trying to provide. Stop words are no longer excluded, so "other" is a valid search and gets a result. --- crc/models/file.py | 4 +++- crc/services/lookup_service.py | 14 ++++++++------ .../enum_options_from_file/customer_list.xls | Bin 109056 -> 109056 bytes tests/test_lookup_service.py | 6 +++++- 4 files changed, 16 insertions(+), 8 deletions(-) diff --git a/crc/models/file.py b/crc/models/file.py index 8afed6cd..8693b7e5 100644 --- a/crc/models/file.py +++ b/crc/models/file.py @@ -166,10 +166,12 @@ class LookupDataModel(db.Model): # query with: # search_results = LookupDataModel.query.filter(LookupDataModel.label.match("INTERNAL")).all() + __ts_vector__ = func.to_tsvector('simple', label) + __table_args__ = ( Index( 'ix_lookupdata_tsv', - func.to_tsvector('simple', label), # Use simple, not english to keep stop words in place. + __ts_vector__, # Use simple, not english to keep stop words in place. postgresql_using='gin' ), ) diff --git a/crc/services/lookup_service.py b/crc/services/lookup_service.py index c9eb1dd8..cfe00615 100644 --- a/crc/services/lookup_service.py +++ b/crc/services/lookup_service.py @@ -181,20 +181,22 @@ class LookupService(object): if len(query) > 0: if ' ' in query: terms = query.split(' ') - new_terms = ["'%s'" % query] + new_terms = [] for t in terms: new_terms.append("%s:*" % t) - new_query = ' | '.join(new_terms) + new_query = ' & '.join(new_terms) + new_query = "'%s' | %s" % (query, new_query) else: new_query = "%s:*" % query - # Run the full text query - db_query = db_query.filter(LookupDataModel.label.match(new_query)) - # But hackishly order by like, which does a good job of - # pulling more relevant matches to the top. + db_query = db_query.filter( + LookupDataModel.__ts_vector__.match(new_query, postgresql_regconfig='simple')) + + # Hackishly order by like, which does a good job of pulling more relevant matches to the top. db_query = db_query.order_by(desc(LookupDataModel.label.like("%" + query + "%"))) logging.getLogger('sqlalchemy.engine').setLevel(logging.INFO) + logging.info(db_query) result = db_query.limit(limit).all() logging.getLogger('sqlalchemy.engine').setLevel(logging.ERROR) return result diff --git a/tests/data/enum_options_from_file/customer_list.xls b/tests/data/enum_options_from_file/customer_list.xls index d697bb67bb6e647f2ddf4f861ada04a5f3c56753..1ed72dd76f63922b12865e15f1cf18b33da29147 100644 GIT binary patch delta 1108 zcmY+DKWGzS7{=fCyY%w6m%Ao@NlcD8)f=bx?G0Xb7c?nnH9b=x~&e zRt;hyh=YiM1_h-$I28YYmJSXo4i1V|I&=sKe(#$v$sIQw?{m-dd)~{JYi!t!4g1N- zB=-DM$@ys1ssg~I|1NnnHU=d$&k=bXC-{X?$fH@$Cu-E%wjiRM;Ypcsn@7xIhqnCZF^ zY%Z#)TNpuK-B`jCLV#Ld!hP2tqyhdxLN_AhAfm}AY&{9c!!?lr-3=1j#$~ZfzYu3E z?aG0zCvs9&!Xx91urq3$k&u(}5+0LN%`9W?$N)ii2~Q2eW<=Z|XsJ;dfhDY}SIbyT z58^QvpQ~S595N{{;iX~5nVB-oI5WrfL@$tF=WBKBPJo$Y@r}By#UYdO64niq4loC< z*64>!h4v)2g!k%2Fis~Z;iHK=9G@_8CluEf?c7w}-M~uJB;m8FYH_%d$Kn^m^q9HJ zFuf3ynk0Nzw}Z?Ki$Bz}AT!J2Ps7YIGjEvL5R;lD{88ig0xL-tw@qAfe7A|qP&~)Z z9d%yEho?zR5_)PO$SklpSgF9wvSAiNOlqQ)z6O~^7PnQrg~hXjzfFn79kXYNnH88`EdVM`oP-Ci4Q>puWHr6&LY delta 1077 zcmY+DKWI}?6vofFFG=%1FNwKHlMo^ZB5u;bf0QbA76&I6Cvg$P$%3sZIO-s;^-}+| zVj+Wq76T0kN^x)yDk!0%gNQ?i4lPvZ5S4n)$xZSe86M}m-}n2@yGa_INTU;3IhVj) zZ!$4Evae18K;2tUjJNX1HHGz7C#~?z)qDmNolHX#kUsYg4`)Rue@BP21Axg`Zw8g7 z6hO@@x~IG=Zn|~Zt)bVP?e-NiCgd&6p*_#wiVVY)C9HX;Uc*as=^<85^yB>0 z@Y=@15|7$=I1p#3hE?;5;+cMYM8r4d8;Q63BIh-{v&^V46P6hjW|AgeJ|@oh))^CL z$~t3#_P97dSZ7?Ee9L?xd_nY@JTjE4G<-C(k9>=AqlQoB1&M=~=PC_fEHow1ENKi) z1?ritVZ-eEnUo{(yE*dM$8<&fVVSNl4_KxfU~-d&UuN1L&xyEc<2i|I8_xyed2w!; z&lDegAKav2+x+n}3nK1XWjRHu5GUd+N-^|xxa`v(!3`9Gob_& zQ6{^DJ7!`D&#pejYfj=AzyuvPPEUR{H