Added estimators
This commit is contained in:
parent
b767b94fbe
commit
e06ec76450
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,122 @@
|
||||||
|
import spread
|
||||||
|
import math
|
||||||
|
import random
|
||||||
|
|
||||||
|
# Load the CSV and clean every cell (see spread.load / spread.declutter).
o = spread.declutter(spread.load('diff_and_price.csv'))

# Both series are taken in reverse file order: column 2 feeds ``diffs`` and
# column 1 feeds ``prices``.
# NOTE(review): presumably the file is stored newest-first, so reversing
# yields chronological order -- confirm against the data file.
diffs = [float(q[2]) for q in reversed(o)]
prices = [float(q[1]) for q in reversed(o)]
|
||||||
|
|
||||||
|
|
||||||
|
def simple_estimator(fac):
    """Build a running estimate from successive ratios of ``diffs``.

    The series starts at 1.  Every later entry is the previous entry
    multiplied by ``diffs[i] / diffs[i-1]`` and divided by ``fac``.

    fac -- constant divisor applied at every step.

    Returns a list with one entry per element of the module-level ``diffs``
    (``[1]`` when ``diffs`` has fewer than two elements).
    """
    estimates = [1]
    for prev, cur in zip(diffs, diffs[1:]):
        # ``* 1.0`` forces float division under Python 2.
        estimates.append(estimates[-1] * cur * 1.0 / prev / fac)
    return estimates
|
||||||
|
|
||||||
|
|
||||||
|
def minimax_estimator(fac):
    """Running estimate that only rises when ``diffs`` grows faster than ``fac``.

    For each consecutive pair of ``diffs`` values:
      * growth ratio above ``fac``: scale by the ratio divided by ``fac``;
      * growth at or below ``fac``: keep the previous estimate;
      * no growth (equal or falling): scale by the full ratio.

    fac -- per-step growth threshold.

    Returns a list starting at 1 with one entry per element of ``diffs``.
    """
    out = [1]
    for prev, cur in zip(diffs, diffs[1:]):
        if cur * 1.0 / prev > fac:
            out.append(out[-1] * cur * 1.0 / prev / fac)
        elif cur > prev:
            out.append(out[-1])
        else:
            out.append(out[-1] * cur * 1.0 / prev)
    return out
|
||||||
|
|
||||||
|
|
||||||
|
def diff_estimator(fac, dw, mf):
    """Minimax-style estimate driven by ``diffs`` plus a weighted derivative.

    A 14-step difference of ``diffs`` is used as the derivative; the first 14
    entries are back-filled with the first real derivative.  The adjusted
    value at each step is ``max(diffs[i] + derivs[i] * dw, diffs[i] * mf)``,
    and the same three-way rule as ``minimax_estimator`` is applied to those
    adjusted values.

    fac -- per-step growth threshold on the adjusted values.
    dw  -- weight of the derivative term.
    mf  -- floor on the adjusted value, as a multiple of ``diffs[i]``.

    Returns a list starting at 1 with one entry per element of ``diffs``.
    """
    lag = 14
    count = len(diffs)
    derivs = [0] * lag
    for i in range(lag, count):
        derivs.append(diffs[i] - diffs[i - lag])
    # Back-fill only when a real derivative exists; with ``lag`` or fewer
    # data points the original indexed past the end of ``derivs``.
    if count > lag:
        for i in range(lag):
            derivs[i] = derivs[lag]
    vals = [max(diffs[i] + derivs[i] * dw, diffs[i] * mf) for i in range(count)]
    o = [1]
    for i in range(1, count):
        if vals[i] * 1.0 / vals[i-1] > fac:
            o.append(o[-1] * vals[i] * 1.0 / vals[i-1] / fac)
        elif vals[i] > vals[i-1]:
            o.append(o[-1])
        else:
            o.append(o[-1] * vals[i] * 1.0 / vals[i-1])
    return o
|
||||||
|
|
||||||
|
|
||||||
|
def ndiff_estimator(*args):
    """Like ``diff_estimator`` but with any number of higher-order derivatives.

    Positional arguments are ``(fac, dw_1, ..., dw_n, mf)``:

    fac  -- per-step growth threshold on the adjusted values.
    dw_k -- weight of the k-th derivative; each derivative is the 14-step
            difference of the previous series (``diffs`` for k = 1).
    mf   -- floor on the adjusted value, as a multiple of ``diffs[i]``.

    Returns a list starting at 1 with one entry per element of ``diffs``.
    """
    fac, dws, mf = args[0], args[1:-1], args[-1]
    lag = 14
    count = len(diffs)
    ds = [diffs]
    for dw in dws:
        derivs = [0] * lag
        for i in range(lag, count):
            derivs.append(ds[-1][i] - ds[-1][i - lag])
        # Back-fill only when a real derivative exists; with ``lag`` or
        # fewer data points the original indexed past the end of ``derivs``.
        if count > lag:
            for i in range(lag):
                derivs[i] = derivs[lag]
        ds.append(derivs)
    vals = []
    for i in range(count):
        q = ds[0][i] + sum([ds[j+1][i] * dws[j] for j in range(len(dws))])
        vals.append(max(q, ds[0][i] * mf))
    o = [1]
    for i in range(1, count):
        if vals[i] * 1.0 / vals[i-1] > fac:
            o.append(o[-1] * vals[i] * 1.0 / vals[i-1] / fac)
        elif vals[i] > vals[i-1]:
            o.append(o[-1])
        else:
            o.append(o[-1] * vals[i] * 1.0 / vals[i-1])
    return o
|
||||||
|
|
||||||
|
|
||||||
|
def dual_threshold_estimator(fac1, fac2, dmul):
    """Estimate that rises only when value AND derivative growth are both high.

    The derivative is the 14-step difference of ``diffs`` (first 14 entries
    back-filled with the first real derivative).  At each step:
      * if ``diffs`` grew by more than ``fac1`` and the derivative grew by
        more than ``fac2``, scale by the ratio divided by ``fac1`` and by
        ``1 + (derivative ratio - fac2) * dmul``;
      * otherwise, if ``diffs`` rose, keep the previous estimate;
      * otherwise scale by the full ``diffs`` ratio.

    fac1 -- growth threshold on ``diffs``.
    fac2 -- growth threshold on the derivative.
    dmul -- weight of the derivative excess in the upward adjustment.

    Returns a list starting at 1 with one entry per element of ``diffs``.
    """
    lag = 14
    count = len(diffs)
    derivs = [0] * lag
    for i in range(lag, count):
        derivs.append(diffs[i] - diffs[i - lag])
    # Back-fill only when a real derivative exists (the original raised
    # IndexError for series of ``lag`` or fewer points).
    if count > lag:
        for i in range(lag):
            derivs[i] = derivs[lag]
    o = [1]
    for i in range(1, count):
        # A zero previous derivative has no defined growth ratio; treat it as
        # "second threshold not met" instead of raising ZeroDivisionError.
        if derivs[i-1] != 0:
            dratio = derivs[i] * 1.0 / derivs[i-1]
        else:
            dratio = None
        if diffs[i] * 1.0 / diffs[i-1] > fac1 and dratio is not None and dratio > fac2:
            o.append(o[-1] * diffs[i] * 1.0 / diffs[i-1] / fac1 * (1 + (dratio - fac2) * dmul))
        elif diffs[i] > diffs[i-1]:
            o.append(o[-1])
        else:
            o.append(o[-1] * diffs[i] * 1.0 / diffs[i-1])
    return o
|
||||||
|
|
||||||
|
|
||||||
|
def evaluate_estimates(estimates, crossvalidate=False):
    """Score estimates against ``prices`` by the spread of their log ratio.

    Computes ``log(prices[i] / estimates[i])`` over the evaluation window,
    subtracts the window mean and returns the sum of squared deviations
    (lower is better; a constant-factor offset costs nothing).

    estimates     -- sequence aligned with the module-level ``prices``.
    crossvalidate -- when true use every price; otherwise only the first
                     780 (or fewer when less data is available).

    Returns the sum of squared deviations; 0 for an empty window.
    Raises IndexError when ``estimates`` is shorter than the window.
    """
    # Clamp the training window so short data sets no longer overrun.
    sz = len(prices) if crossvalidate else min(780, len(prices))
    if sz == 0:
        return 0
    log_ratios = [math.log(prices[i] / estimates[i]) for i in range(sz)]
    # Work with the mean log directly rather than exponentiating with a
    # truncated value of e and taking the log again.
    mean_log = sum(log_ratios) * 1.0 / sz
    # NOTE(review): the original skips index 0 here although the mean
    # includes it; preserved as-is -- confirm whether that is intentional.
    sqdiffsum = 0
    for i in range(1, sz):
        sqdiffsum += (log_ratios[i] - mean_log) ** 2
    return sqdiffsum
|
||||||
|
|
||||||
|
|
||||||
|
# Random-search optimizer with a shrinking step size (only improving moves are accepted)
|
||||||
|
def optimize(producer, floors, ceilings, rate=0.7, iterations=5000):
    """Search for the parameter vector that minimises ``evaluate_estimates``.

    Starts at the midpoint of every [floor, ceiling] interval and repeatedly
    proposes a random step whose maximum size shrinks as ``1 / i**rate``.
    A proposal is kept only when it lowers the score, so this is a greedy
    random search rather than true simulated annealing.

    producer   -- callable taking one positional argument per parameter and
                  returning an estimate series.
    floors     -- lower bound for each parameter.
    ceilings   -- upper bound for each parameter.
    rate       -- exponent controlling how fast the step size decays.
    iterations -- loop bound; the loop runs for i = 1 .. iterations - 1
                  (default 5000, as originally hard-coded).

    Returns the best parameter list found.  Progress is printed every
    1000 iterations.
    """
    vals = [f*0.5+c*0.5 for f, c in zip(floors, ceilings)]
    y = evaluate_estimates(producer(*vals))
    for i in range(1, iterations):
        # The sign of the half-range is irrelevant: it is multiplied by a
        # random factor that is symmetric around zero.
        stepsizes = [(f*0.5-c*0.5) / i**rate for f, c in zip(floors, ceilings)]
        steps = [(random.random() * 2 - 1) * s for s in stepsizes]
        # Clamp every proposed value back into its [floor, ceiling] range.
        newvals = [max(mi, min(ma, v+s)) for v, s, mi, ma in zip(vals, steps, floors, ceilings)]
        newy = evaluate_estimates(producer(*newvals))
        if newy < y:
            vals = newvals
            y = newy
        # NOTE(review): source indentation was lost; the progress report is
        # assumed to sit at loop level, not inside the improvement branch.
        if not i % 1000:
            # Single-argument form prints identically on Python 2 and 3.
            print("%s %s %s" % (i, vals, y))

    return vals
|
||||||
|
|
||||||
|
|
||||||
|
def score(producer, *vals):
    """Return the cross-validated score of ``producer`` called with ``vals``.

    producer -- callable returning an estimate series.
    vals     -- positional parameters forwarded to ``producer``.

    The estimates are scored over every price, not just the training window.
    """
    estimates = producer(*vals)
    return evaluate_estimates(estimates, True)
|
|
@ -0,0 +1,460 @@
|
||||||
|
import copy, re
|
||||||
|
from sets import Set
|
||||||
|
# NOTE THAT ALL FUNCTIONS HERE WORK BY RETURNING A NEW ARRAY, THERE ARE NO IN-PLACE MODIFICATION METHODS
|
||||||
|
# ALSO KEEP IN MIND THAT COLUMNS AND ROW INDICES START FROM ZERO (ie. column A -> 0, E -> 4, Z -> 25, etc)
|
||||||
|
# Scans a CSV line, keeping track of separators and quotes
|
||||||
|
def scanline(ln,sp=','):
    """Split one CSV line into fields, honouring quotes and backslash escapes.

    ln -- the line to scan.
    sp -- single-character field separator.

    Double quotes toggle "inside quotes" and are dropped from the output;
    separators inside quotes are kept as data.  A backslash copies the next
    character verbatim.  Any trailing newline stays in the last field.

    Returns the list of field strings (always at least one element).
    """
    fields = []
    current = []
    in_quotes = False
    pos = 0
    length = len(ln)
    while pos < length:
        ch = ln[pos]
        if ch == '"':
            in_quotes = not in_quotes
            pos += 1
        elif ch == '\\':
            if pos + 1 < length:
                current.append(ln[pos + 1])
                pos += 2
            else:
                # Lone backslash at end of line: keep it literally (the
                # original raised IndexError here).
                current.append(ch)
                pos += 1
        elif ch == sp and not in_quotes:
            fields.append(''.join(current))
            current = []
            pos += 1
        else:
            current.append(ch)
            pos += 1
    fields.append(''.join(current))
    return fields
|
||||||
|
# Load a CSV into a 2D array, automatically padding shorter rows with empty cells so that the array is rectangular
|
||||||
|
def load(f,sp=','):
    """Read CSV file ``f`` into a list of rows of equal width.

    f  -- path of the file to read.
    sp -- field separator passed to ``scanline``.

    Rows shorter than the widest row are padded with empty strings.
    Returns the list of rows (empty for an empty file).
    """
    # Context manager closes the handle even if scanning fails; the original
    # never closed it.
    with open(f,'r') as handle:
        array = [scanline(line,sp) for line in handle]
    maxlen = 0
    for row in array:
        if len(row) > maxlen: maxlen = len(row)
    for row in array:
        if len(row) < maxlen: row += [''] * (maxlen - len(row))
    return array
|
||||||
|
# Apply a Porter stemmer to every cell in a given range of cols in an array (calling stem with just a list and no cols argument stems _every_ cell)
|
||||||
|
# Example outputs: manage, management, manager, managing -> manag; pony, ponies -> poni; reincarnate, reincarnated, reincarnation -> reincarn
|
||||||
|
def stem(li,cols=0):
    """Return a copy of ``li`` with the chosen columns Porter-stemmed.

    li   -- 2D array (list of rows).
    cols -- iterable of column indices; the default 0 means every column
            of the first row.

    In each selected cell the characters ' " + [ ] ? ! and newline are
    removed, then every space-separated word is stripped, lower-cased and
    stemmed.  The input array is not modified.
    """
    if cols == 0:
        cols = range(len(li[0]))
    import porter
    stemmer = porter.PorterStemmer()
    unwanted = "'"+'"+[]?!\n'
    stemmed = copy.deepcopy(li)
    for rownum, row in enumerate(li):
        for col in cols:
            text = str(row[col])
            for ch in unwanted:
                text = text.replace(ch,'')
            pieces = []
            for word in text.split(' '):
                token = word.strip()
                # The stemmer takes the word plus first and last index.
                pieces.append(stemmer.stem(token.lower(),0,len(token)-1))
            stemmed[rownum][col] = ' '.join(pieces)
    return stemmed
|
||||||
|
# Is a string a number?
|
||||||
|
def isnumber(s):
    """Return True when string ``s`` looks like a plain decimal number.

    Accepted: an optional leading minus, digits with optional comma
    grouping, and an optional decimal point, e.g. "42", "-3.5", "1,000.25".
    At least one digit is required, so "", "." and "-" are rejected.

    The original pattern required a decimal point, so integers such as
    "42" were reported as non-numeric while "." was accepted.
    """
    if re.match(r'^-?[0-9,]*\.?[0-9,]*$', s) is None:
        return False
    return any(ch.isdigit() for ch in s)
|
||||||
|
|
||||||
|
# Declutters (removes special characters, numerifies numbers) every cell, rules same as those for stem(li,cols=0)
|
||||||
|
def declutter(li,cols=0):
    """Return a cleaned copy of ``li`` with numeric cells converted to float.

    li   -- 2D array (list of rows).
    cols -- iterable of column indices; the default 0 means every column
            of the first row.

    In each selected cell the characters ' " + [ ] ? ! and newline are
    removed and every space-separated word is stripped and lower-cased.
    Cells that ``isnumber`` accepts become floats (commas are dropped
    first); anything ``float`` still rejects stays a string.  The input
    array is not modified, and an empty array yields an empty list.
    """
    if cols == 0:
        cols = range(len(li[0])) if li else []
    cleaned = copy.deepcopy(li)
    for rownum, row in enumerate(li):
        for col in cols:
            text = str(row[col])
            for ch in "'"+'"+[]?!\n':
                text = text.replace(ch,'')
            text = ' '.join([w.strip().lower() for w in text.split(' ')])
            cleaned[rownum][col] = text
            if isnumber(text):
                try:
                    # float() does not understand thousands separators.
                    cleaned[rownum][col] = float(text.replace(',',''))
                except ValueError:
                    # e.g. a bare "." -- leave the cleaned text in place.
                    pass
    return cleaned
|
||||||
|
# Generate a list of individual words occurring in a given column in a given array; useful for generating source lists to do n-grams from
|
||||||
|
def wordlist(li,col):
    """Return the distinct space-separated words found in column ``col``.

    li  -- 2D array whose ``col`` cells are strings.
    col -- column index to scan.

    Words are returned in order of first appearance.
    """
    seen = set()    # O(1) membership; the original scanned the list each time
    wlist = []
    for row in li:
        for w in row[col].split(' '):
            if w not in seen:
                seen.add(w)
                wlist.append(w)
    return wlist
|
||||||
|
# Generates a list of phrases (complete cell entries)
|
||||||
|
def phraselist(li,col):
    """Return the distinct complete cell values found in column ``col``.

    li  -- 2D array.
    col -- column index to scan.

    Values are returned in order of first appearance.  Cells must be
    hashable (strings or numbers).
    """
    seen = set()    # O(1) membership; the original scanned the list each time
    wlist = []
    for row in li:
        phrase = row[col]
        if phrase not in seen:
            seen.add(phrase)
            wlist.append(phrase)
    return wlist
|
||||||
|
# Retrieve just a few columns from a given array to make a smaller (narrower) array
|
||||||
|
def cols(li,cols):
    """Build a narrower array holding only the requested columns.

    li   -- 2D array.
    cols -- column indices, in output order; a negative index produces the
            constant 1 instead of a cell value.

    Returns a new list of new rows.
    """
    return [[row[c] if c >= 0 else 1 for c in cols] for row in li]
|
||||||
|
# Combine two possibly unsorted arrays matching rows by heading in headingcol1 in li1 and headingcol2 in li2
|
||||||
|
# setting linclusive = True makes sure every row in li1 makes it into the output, same with rinclusive and li2
|
||||||
|
# Recommended to do some kind of sort after splice is done
|
||||||
|
def splice(li1,li2,headingcol1,headingcol2,linclusive=False,rinclusive=False):
    """Join two arrays on a key column (merge join over sorted copies).

    li1, li2     -- the arrays to combine; they are not modified.
    headingcol1  -- key column index in ``li1``.
    headingcol2  -- key column index in ``li2``.
    linclusive   -- also emit ``li1`` rows that have no partner, padded
                    with empty strings on the right.
    rinclusive   -- also emit ``li2`` rows that have no partner, padded
                    with empty strings on the left.

    Each output row is ``[key] + rest of li1 row + rest of li2 row``.  Rows
    come out in descending key order, and equal keys are paired one-to-one.
    Row widths are taken from the first row of each array.

    Returns the list of combined rows (empty when nothing matches).
    """
    s1 = sorted(li1,key=lambda x:x[headingcol1],reverse=True)
    s2 = sorted(li2,key=lambda x:x[headingcol2],reverse=True)
    # Padding replaces the non-key cells of the missing side, i.e. width - 1.
    # The original used the full width in the two tail loops, making those
    # rows one cell wider than the rest.
    pad1 = [''] * ((len(s1[0]) if s1 else 1) - 1)
    pad2 = [''] * ((len(s2[0]) if s2 else 1) - 1)

    def rest1(row):
        return row[:headingcol1] + row[headingcol1 + 1:]

    def rest2(row):
        return row[:headingcol2] + row[headingcol2 + 1:]

    ind1 = 0
    ind2 = 0
    output = []
    while ind1 < len(s1) and ind2 < len(s2):
        key1 = s1[ind1][headingcol1]
        key2 = s2[ind2][headingcol2]
        # Plain comparisons replace cmp(), which exists only on Python 2.
        if key2 > key1:
            if rinclusive: output.append([key2] + pad1 + rest2(s2[ind2]))
            ind2 += 1
        elif key2 < key1:
            if linclusive: output.append([key1] + rest1(s1[ind1]) + pad2)
            ind1 += 1
        else:
            output.append([key1] + rest1(s1[ind1]) + rest2(s2[ind2]))
            ind1, ind2 = ind1 + 1, ind2 + 1
    if linclusive:
        for row in s1[ind1:]:
            output.append([row[headingcol1]] + rest1(row) + pad2)
    if rinclusive:
        for row in s2[ind2:]:
            output.append([row[headingcol2]] + pad1 + rest2(row))
    return output
|
||||||
|
# Creates a wordlist sorted according to function f taken of an array with the results in the addcols in order
|
||||||
|
# eg. sorted_wordlist with addcols = [2,4,6], row is 1 2 4 8 16 32 64, f=lambda x:x[2]+x[1]+1.01*x[0] returns sorting key 84.04
|
||||||
|
def sorted_wordlist(li,wcol,addcols,f=lambda x:x[1],rev=True):
    """Return the words of column ``wcol`` ordered by their onegram totals.

    li      -- 2D array.
    wcol    -- column holding the keywords.
    addcols -- columns to total per word (passed to ``onegrams``).
    f       -- sort key applied to each ``[word, total, ...]`` row; the
               default sorts by the first total.
    rev     -- sort descending when true (the default).
    """
    ranked = sorted(onegrams(li,wcol,addcols),key=f,reverse=rev)
    return [entry[0] for entry in ranked]
|
||||||
|
# Utility function, used by twograms, threegrams and fourgrams
|
||||||
|
def compose(arg):
    """Return a canonical key for a group of words.

    arg -- iterable of words.

    Duplicates are removed and the remaining words are sorted and joined
    with single spaces, so any ordering of the same words gives one key.
    """
    # Built-in set replaces sets.Set (deprecated since Python 2.6 and
    # removed in Python 3); the result is identical.
    return ' '.join(sorted(set(arg)))
|
||||||
|
# Calculate a total sum for every desired column for different exact matches in wcol, column -1 is implied to be 1 for every row
|
||||||
|
# for example, consider the array
|
||||||
|
# dog 20 3
|
||||||
|
# dog house 15 28
|
||||||
|
# cat 25 31
|
||||||
|
# cat 10 7
|
||||||
|
# dog 40 0
|
||||||
|
# house 10 14
|
||||||
|
# Doing pivot(li,0,[1,-1]) gives you the list:
|
||||||
|
# dog 60 2
|
||||||
|
# dog house 15 1
|
||||||
|
# cat 35 2
|
||||||
|
# house 10 1
|
||||||
|
# wlist allows you to restrict the table to a given wordlist
|
||||||
|
def pivot(li, wcol, addcols,wlist=0,sortkey=lambda x:1):
    """Total the ``addcols`` columns for each exact phrase in column ``wcol``.

    li      -- 2D array.
    wcol    -- column holding the phrase to group by.
    addcols -- columns to total; a negative index counts 1 per row.
    wlist   -- phrases to report; the default 0 means every phrase found.
    sortkey -- key used to sort the result rows, highest first.

    Cell values are parsed after removing commas and spaces: empty means 0,
    a trailing '%' means a percentage (divided by 100), anything else must
    be accepted by ``float``.

    Returns a list of ``[phrase, total, ...]`` rows.
    Raises ValueError when a cell cannot be parsed as a number.
    """
    if wlist == 0: wlist = phraselist(li,wcol)
    result = {}
    for phrase in wlist:
        result[phrase] = [0] * len(addcols)
    for row in li:
        nums = []
        for ac in addcols:
            if ac >= 0:
                num = str(row[ac]).replace(',','').replace(' ','')
                if num == '': num = 0
                # Convert first, then scale: the original multiplied the
                # string by 0.01, which raises TypeError.
                elif num[-1] == '%': num = float(num[:-1]) * 0.01
                else: num = float(num)
            else: num = 1
            nums.append(num)
        key = row[wcol]
        if key in result:
            result[key] = [a + b for a, b in zip(result[key],nums)]
    array = [[phrase] + result[phrase] for phrase in result]
    return sorted(array,key=sortkey,reverse=True)
|
||||||
|
# Similar to a pivot table but looks at individual keywords. For the example list above, onegrams(li,0,[1,-1]) returns:
|
||||||
|
# dog 75 3
|
||||||
|
# cat 35 2
|
||||||
|
# house 25 2
|
||||||
|
def onegrams(li, wcol, addcols,wlist=0,sortkey=lambda x: 1):
    """Total the ``addcols`` columns for each single word in column ``wcol``.

    li      -- 2D array whose ``wcol`` cells are strings.
    wcol    -- column holding space-separated keywords.
    addcols -- columns to total; a negative index counts 1 per row.
    wlist   -- words to report; the default 0 means every word found.
    sortkey -- key used to sort the result rows, highest first.

    A row contributes to each of its words; a word repeated within one cell
    is counted once per occurrence.  Cell values are parsed after removing
    commas and spaces: empty means 0, a trailing '%' means a percentage,
    anything else must be accepted by ``float``.

    Returns a list of ``[word, total, ...]`` rows.
    Raises ValueError when a cell cannot be parsed as a number.
    """
    if wlist == 0: wlist = wordlist(li,wcol)
    result = {}
    for word in wlist:
        result[word] = [0] * len(addcols)
    for row in li:
        words = [x.strip() for x in row[wcol].split(' ')]
        nums = []
        for ac in addcols:
            if ac >= 0:
                num = str(row[ac]).replace(',','').replace(' ','')
                if num == '': num = 0
                # Convert first, then scale: the original multiplied the
                # string by 0.01, which raises TypeError.
                elif num[-1] == '%': num = float(num[:-1]) * 0.01
                else: num = float(num)
            else: num = 1
            nums.append(num)
        for word in words:
            if word in result:
                result[word] = [a + b for a, b in zip(result[word],nums)]
    array = [[word] + result[word] for word in result]
    return sorted(array,key=sortkey,reverse=True)
|
||||||
|
# Calculate a total sum for every column in addcols and for every word pair in wcol
|
||||||
|
# words do not need to be beside each other or in any particular order, so "buy a dog house", "good house for dog owners", "dog in my house" all go under "dog house"
|
||||||
|
def twograms(li,wcol,addcols,wlist=0,sortkey=lambda x:1,allindices=False):
    """Total the ``addcols`` columns for every word pair in column ``wcol``.

    li         -- 2D array whose ``wcol`` cells are strings.
    wcol       -- column holding space-separated keywords.
    addcols    -- columns to total; a negative index counts 1 per row.
    wlist      -- words allowed in a pair; default 0 means every word found.
    sortkey    -- key used to sort the result rows, highest first.
    allindices -- when true, report every possible pair from ``wlist``
                  (zero totals included) and nothing else.

    Words need not be adjacent or ordered; pairs are keyed by ``compose``.
    Prints a progress line roughly every tenth of the input.

    Returns a list of ``[pair, total, ...]`` rows.
    Raises ValueError when a cell cannot be parsed as a number.
    """
    if wlist == 0: wlist = wordlist(li,wcol)
    known = set(wlist)    # O(1) membership tests in the inner loops
    result = {}
    if allindices:
        for i in range(len(wlist)):
            for j in range(len(wlist)):
                if i != j: result[compose([wlist[i],wlist[j]])] = [0] * len(addcols)
    # Fewer than 10 rows gives a step of 0; the original then failed with a
    # modulo-by-zero on the first row.
    step = len(li) // 10
    for rownum, row in enumerate(li):
        if step and rownum % step == step - 1:
            print("Two grams: " + str(rownum) + " / " + str(len(li)))
        words = [x.strip() for x in row[wcol].split(' ')]
        nums = []
        for ac in addcols:
            if ac >= 0:
                num = str(row[ac]).replace(',','').replace(' ','')
                if num == '': num = 0
                elif num[-1] == '%': num = float(num[:-1]) * 0.01
                else: num = float(num)
            else: num = 1
            nums.append(num)
        for i in range(len(words)):
            if words[i] not in known: continue
            for j in range(i+1,len(words)):
                if words[j] not in known: continue
                comb = compose([words[i],words[j]])
                if comb in result:
                    result[comb] = [a + b for a, b in zip(result[comb],nums)]
                elif not allindices:
                    result[comb] = nums
    array = [[pair] + result[pair] for pair in result]
    return sorted(array,key=sortkey,reverse=True)
|
||||||
|
# Calculate a total sum for every column in addcols and for every word triplet in wcol (do not need to be beside each other or in any particular order)
|
||||||
|
# setting allindices to True slows down the calculation a lot but gives you a CSV with all possible combinations of words, making it convenient for
|
||||||
|
# working with the same word list on different data
|
||||||
|
def threegrams(li,wcol,addcols,wlist=0,sortkey=lambda x:1,allindices=False):
    """Total the ``addcols`` columns for every word triplet in column ``wcol``.

    li         -- 2D array whose ``wcol`` cells are strings.
    wcol       -- column holding space-separated keywords.
    addcols    -- columns to total; a negative index counts 1 per row.
    wlist      -- words allowed in a triplet; default 0 means every word.
    sortkey    -- key used to sort the result rows, highest first.
    allindices -- when true, report every possible triplet from ``wlist``
                  (zero totals included) and nothing else; much slower.

    Words need not be adjacent or ordered; triplets are keyed by
    ``compose``.  Prints a progress line roughly every tenth of the input.

    Returns a list of ``[triplet, total, ...]`` rows.
    Raises ValueError when a cell cannot be parsed as a number.
    """
    if wlist == 0: wlist = wordlist(li,wcol)
    known = set(wlist)    # O(1) membership tests in the inner loops
    result = {}
    if allindices:
        for i in range(len(wlist)):
            for j in range(len(wlist)):
                for k in range(len(wlist)):
                    if i != j and i != k and j != k: result[compose([wlist[i],wlist[j],wlist[k]])] = [0] * len(addcols)
    # Fewer than 10 rows gives a step of 0; the original then failed with a
    # modulo-by-zero on the first row.
    step = len(li) // 10
    for rownum, row in enumerate(li):
        if step and rownum % step == step - 1:
            print("Three grams: " + str(rownum) + " / " + str(len(li)))
        words = [x.strip() for x in row[wcol].split(' ')]
        nums = []
        for ac in addcols:
            if ac >= 0:
                num = str(row[ac]).replace(',','').replace(' ','')
                if num == '': num = 0
                elif num[-1] == '%': num = float(num[:-1]) * 0.01
                else: num = float(num)
            else: num = 1
            nums.append(num)
        for i in range(len(words)):
            if words[i] not in known: continue
            for j in range(i+1,len(words)):
                if words[j] not in known: continue
                for k in range(j+1,len(words)):
                    if words[k] not in known: continue
                    comb = compose([words[i],words[j],words[k]])
                    if comb in result:
                        result[comb] = [a + b for a, b in zip(result[comb],nums)]
                    elif not allindices:
                        result[comb] = nums
    array = [[triple] + result[triple] for triple in result]
    return sorted(array,key=sortkey,reverse=True)
|
||||||
|
# Calculate a total sum for every column in addcols and for every word quadruplet in wcol
|
||||||
|
def fourgrams(li,wcol,addcols,wlist=0,sortkey=lambda x:1):
    """Total the ``addcols`` columns for every word quadruplet in ``wcol``.

    li      -- 2D array whose ``wcol`` cells are strings.
    wcol    -- column holding space-separated keywords.
    addcols -- columns to total; a negative index counts 1 per row.
    wlist   -- words allowed in a quadruplet; default 0 means every word.
    sortkey -- key used to sort the result rows, highest first.

    Words need not be adjacent or ordered; quadruplets are keyed by
    ``compose``.  Prints a progress line roughly every tenth of the input.

    Returns a list of ``[quadruplet, total, ...]`` rows.
    Raises ValueError when a cell cannot be parsed as a number.
    """
    if wlist == 0: wlist = wordlist(li,wcol)
    known = set(wlist)    # O(1) membership tests in the inner loops
    result = {}
    # Fewer than 10 rows gives a step of 0; the original then failed with a
    # modulo-by-zero on the first row.
    step = len(li) // 10
    for rownum, row in enumerate(li):
        if step and rownum % step == step - 1:
            print("Four grams: " + str(rownum) + " / " + str(len(li)))
        words = [x.strip() for x in row[wcol].split(' ')]
        nums = []
        for ac in addcols:
            if ac >= 0:
                num = str(row[ac]).replace(',','').replace(' ','')
                if num == '': num = 0
                elif num[-1] == '%': num = float(num[:-1]) * 0.01
                else: num = float(num)
            else: num = 1
            nums.append(num)
        for i in range(len(words)):
            if words[i] not in known: continue
            for j in range(i+1,len(words)):
                if words[j] not in known: continue
                for k in range(j+1,len(words)):
                    # The original re-tested words[j] here, so the third
                    # word was never checked against the word list.
                    if words[k] not in known: continue
                    for l in range(k+1,len(words)):
                        if words[l] not in known: continue
                        comb = compose([words[i],words[j],words[k],words[l]])
                        if comb in result:
                            result[comb] = [a + b for a, b in zip(result[comb],nums)]
                        else:
                            result[comb] = nums
    array = [[quad] + result[quad] for quad in result]
    return sorted(array,key=sortkey,reverse=True)
|
||||||
|
# Filters array, returning only the rows where column wcol of that row contains the query keywords (keywords can appear in any order)
|
||||||
|
# This and the other filters are useful for taking a list of entries and creating a list of only valid entries according to some validity characteristic
|
||||||
|
# eg:
|
||||||
|
# dog house, 15
|
||||||
|
# cat, 18
|
||||||
|
# dog, 33
|
||||||
|
# filter(li,0,'dog'):
|
||||||
|
# dog house, 15
|
||||||
|
# dog, 33
|
||||||
|
def filter(li,wcol,query):
    """Keep the rows whose ``wcol`` cell contains every word of ``query``.

    li    -- 2D array whose ``wcol`` cells are strings.
    wcol  -- column holding space-separated keywords.
    query -- space-separated words, matched in any order.  An empty query
             keeps every row; the query '*' keeps rows with a non-empty cell.

    Returns a new list containing the matching row objects.
    """
    wanted = query.split(' ')
    if wanted == ['']:
        wanted = []
    wildcard = wanted == ['*']
    kept = []
    for row in li:
        present = [x.strip() for x in row[wcol].split(' ')]
        if wildcard:
            keep = len(row[wcol]) > 0
        else:
            keep = all(w in present for w in wanted)
        if keep:
            kept.append(row)
    return kept
|
||||||
|
# Filters array, requiring column wcol to exactly match query
|
||||||
|
def phrasefilter(li,wcol,query):
    """Keep the rows whose ``wcol`` cell equals ``query`` exactly.

    li    -- 2D array.
    wcol  -- column to compare.
    query -- value the cell must equal.

    Returns a new list containing the matching row objects.
    """
    return [row for row in li if row[wcol] == query]
|
||||||
|
# Filters array, requiring function func taken of the row to return True (or 1)
|
||||||
|
def funcfilter(li,func):
    """Keep the rows for which ``func(row)`` is truthy.

    li   -- 2D array.
    func -- callable taking one row.

    Returns a new list containing the accepted row objects.
    """
    return [row for row in li if func(row)]
|
||||||
|
# Adds up columns in addcols over the rows that filter(li,wcol,query) would keep; can also be thought of as doing a single n-keyword match
|
||||||
|
# eg:
|
||||||
|
# dog, 25
|
||||||
|
# cat, 15
|
||||||
|
# dog, 75
|
||||||
|
# dog, 10
|
||||||
|
# horse, 55
|
||||||
|
# cat, 7
|
||||||
|
# search(li,0,[1],'dog') gives ['dog',110]
|
||||||
|
def search(li,wcol,addcols,query):
    """Total the ``addcols`` columns over the rows matching ``query``.

    li      -- 2D array whose ``wcol`` cells are strings.
    wcol    -- column holding space-separated keywords.
    addcols -- columns to total; a negative index counts 1 per row.
    query   -- space-separated words that must all occur in the cell, in
               any order.  An empty query matches every row; the query '*'
               matches rows with a non-empty cell.

    Cell values are parsed after removing commas and spaces: empty means 0,
    a trailing '%' means a percentage, anything else must be accepted by
    ``float``.  Values are parsed for every row, matching or not.

    Returns ``[query, total, ...]``.
    Raises ValueError when a cell cannot be parsed as a number.
    """
    wanted = query.split(' ')
    if wanted == ['']: wanted = []
    wildcard = wanted == ['*']
    result = [0] * len(addcols)
    for row in li:
        words = [x.strip() for x in row[wcol].split(' ')]
        nums = []
        for ac in addcols:
            if ac >= 0:
                num = str(row[ac]).replace(',','').replace(' ','')
                if num == '': num = 0
                # Convert first, then scale: the original multiplied the
                # string by 0.01, which raises TypeError.
                elif num[-1] == '%': num = float(num[:-1]) * 0.01
                else: num = float(num)
            else: num = 1
            nums.append(num)
        if wildcard:
            inlist = len(row[wcol]) > 0
        else:
            inlist = all(w in words for w in wanted)
        if inlist:
            result = [a + b for a, b in zip(result,nums)]
    return [query] + result
|
||||||
|
# Render an array as CSV text (the string is returned; nothing is printed)
|
||||||
|
def tochars(array,sp=','):
    """Render a 2D array as delimited text and return it.

    array -- iterable of rows; every cell is converted with ``str``.
    sp    -- separator placed between cells.

    Rows are joined with newlines and there is no trailing newline.
    Cells are not quoted or escaped.
    """
    return '\n'.join([sp.join([str(cell) for cell in row]) for row in array])
|
||||||
|
# Save an array to CSV
|
||||||
|
def save(f,array,sp=','):
    """Write a 2D array to file ``f`` as delimited text.

    f     -- path of the file to create or overwrite.
    array -- 2D array to write (rendered by ``tochars``).
    sp    -- separator placed between cells.
    """
    # Context manager closes the file even when rendering or writing fails.
    with open(f,'w') as writeto:
        writeto.write(tochars(array,sp))
|
||||||
|
# Compares keywords by two different parameters from two different lists. For example, li1 can be a list of how much money is spent (on addcol1) on a particular combination of keywords (on keycol1) and li2 can be a list of upgraded accounts with the search query they came from on keycol2, and addcol 2 can be left blank to default to -1 (each row is worth one point). Fourth column is statistical significance.
|
||||||
|
# Remember that you may have to filter the list yourself first
|
||||||
|
# Arguments:
|
||||||
|
# grams = 1 for single keywords, 2 for pairs, 3 for triplets and 4 for quadruplets
|
||||||
|
# li1, li2 = your two lists
|
||||||
|
# keycol1, keycol2 = where the keywords are located in those two lists
|
||||||
|
# addcol1, addcol2 = the columns of what you want to add up, eg. cost (set to -1 or leave blank to make it add 1 for each row)
|
||||||
|
# sortkey = function to sort results by (highest first)
|
||||||
|
# usestem = stem keywords
|
||||||
|
# sigtable = add ratio and significance to table
|
||||||
|
# invertratio = set ratio column to col1/col2 instead of col2/col1
|
||||||
|
# preformatted = li1 and li2 are already properly formatted
|
||||||
|
# justpreformat = convert li1 and li2 into twocolumns for comparison but don't go all the way
|
||||||
|
# wordlimit = limit search to some more common keywords for speedup purposes
|
||||||
|
# Example: list of customers, some upgraded, with originating keywords, and a list of how much you're paying for each search phrase
|
||||||
|
#
|
||||||
|
# customers.csv:
|
||||||
|
# Name, Keyword, Status
|
||||||
|
# Bob Jones, spreadsheet csv software, upgraded
|
||||||
|
# Matt Bones, csv python utils, free
|
||||||
|
# Army Drones, free spreadsheet, free
|
||||||
|
# Glenn Mitt, csv software, upgraded
|
||||||
|
# Pat Submitt, python utils software, upgraded
|
||||||
|
# Shawn Wit, python spreadsheet program, upgraded
|
||||||
|
#
|
||||||
|
# costs.csv:
|
||||||
|
# csv software, useless, and, irrelevant, data, 5.00, blah, blah
|
||||||
|
# python spreadsheet, useless, and, irrelevant, data, 2.50, blah, blah
|
||||||
|
# spreadsheet utils, useless, and, irrelevant, data, 10.00, blah, blah
|
||||||
|
# csv utils, useless, and, irrelevant, data, 1.50, blah, blah
|
||||||
|
#
|
||||||
|
# Steps:
|
||||||
|
# 1. import spread (if not imported already)
|
||||||
|
# 2. upgrades = spread.filter(spread.load('customers.csv'),2,'upgraded')
|
||||||
|
# 3. costs = spread.load('costs.csv')
|
||||||
|
# 4. res = compare(1,costs,upgrades,0,1,5,invertratio=True)
|
||||||
|
# 5. spread.save('saved.csv',res)
|
||||||
|
#
|
||||||
|
# Res should look like:
|
||||||
|
#
|
||||||
|
# Keyword, Column 1, Column 2, Ratio, Significance
|
||||||
|
# spreadsheet, 12.50, 2, 6.25, -0.389
|
||||||
|
# utils, 11.50, 1, 11.50, -0.913
|
||||||
|
# csv, 7.50, 2, 3.75, 0.335
|
||||||
|
# python, 2.50, 2, 1.25, 2.031
|
||||||
|
#
|
||||||
|
# Or, if desired, you can:
|
||||||
|
# i1,i2 = compare(1,costs,upgrades,0,1,5,justpreformat=True)
|
||||||
|
# res1 = compare(1,i1,i2,0,1,5,invertratio=True,preformatted=True)
|
||||||
|
# res2 = compare(2,i1,i2,0,1,5,invertratio=True,preformatted=True)
|
||||||
|
# res3 = compare(3,i1,i2,0,1,5,invertratio=True,preformatted=True)
|
||||||
|
# res4 = compare(4,i1,i2,0,1,5,invertratio=True,preformatted=True)
|
||||||
|
#
|
||||||
|
# Note that significance is calculated based on col2/col1 regardless of invertratio, since getting 0 upgrades when you should have gotten 2 is not that unlikely, but calculating significance based on col1/col2 would give you infinity as infinity is infinitely far away from 0.5.
|
||||||
|
def compare(grams,li1,li2,keycol1,keycol2,addcol1=-1,addcol2=-1,sortkey=lambda x:x[1],usestem=True,sigtable=True,invertratio=False,preformatted=False,justpreformat=False,wordlimit=0):
    """Compare keyword totals from two lists, with ratio and significance.

    grams            -- 1..4: single words, pairs, triplets or quadruplets.
    li1, li2         -- the two arrays to compare; they are not modified.
    keycol1, keycol2 -- columns holding the keywords in each array.
    addcol1, addcol2 -- columns to total; -1 counts 1 per row.
    sortkey          -- key used to sort the result rows, highest first.
    usestem          -- stem keywords (otherwise only declutter them).
    sigtable         -- append Ratio and Significance columns.
    invertratio      -- Ratio is col1/col2 instead of col2/col1.
    preformatted     -- ``li1``/``li2`` are already two-column
                        ``[keyword, value]`` arrays.
    justpreformat    -- return the two preformatted arrays and stop.
    wordlimit        -- keep only this many top words (0 = no limit).

    Returns the result table with a header row, or ``(s1, s2)`` when
    ``justpreformat`` is true.  Progress messages are printed throughout.
    Raises ZeroDivisionError when ``sigtable`` is true and list 1 totals 0.
    """
    gramfuncs = [0,onegrams,twograms,threegrams,fourgrams]
    if not preformatted:
        s1 = declutter(cols(li1,[keycol1,addcol1]),[1])
        print("Done decluttering/stemming: 1/4")
        s2 = declutter(cols(li2,[keycol2,addcol2]),[1])
        print("Done decluttering/stemming: 2/4")
        s1 = stem(s1,[0]) if usestem else declutter(s1,[0])
        print("Done decluttering/stemming: 3/4")
        s2 = stem(s2,[0]) if usestem else declutter(s2,[0])
        print("Done decluttering/stemming: 4/4")
    else: s1,s2 = li1,li2
    print("Printing sample of list 1")
    print(s1[:10])
    print("Printing sample of list 2")
    print(s2[:10])
    if justpreformat: return s1,s2

    def drop_leading_text_rows(rows):
        # Skip leading rows whose value cell is still text (e.g. a header).
        # Slicing leaves the caller's list intact and copes with lists that
        # contain nothing but text rows; the original popped in place and
        # raised IndexError once the list was exhausted.
        start = 0
        while start < len(rows) and isinstance(rows[start][1], str):
            start += 1
        return rows[start:]

    s1 = drop_leading_text_rows(s1)
    s2 = drop_leading_text_rows(s2)
    print("Cleaned invalid rows")
    wl = sorted_wordlist(s1,0,[1])
    if '' in wl: wl.remove('')
    print("Base wordlist length: " + str(len(wl)) + " ; Top ten: " + str(wl[:10]))
    if wordlimit > 0 and wordlimit < len(wl):
        print("Shortening to " + str(wordlimit))
        wl = wl[:wordlimit]
    res1 = gramfuncs[grams](s1,0,[1],wl)
    print("Done search: 1/2")
    res2 = gramfuncs[grams](s2,0,[1],wl)
    print("Done search: 2/2")
    comb = sorted(splice(res1,res2,0,0),key=sortkey,reverse=True)
    if sigtable:
        tot1 = search(s1,0,[1],'')
        tot2 = search(s2,0,[1],'')
        # Expected col2 per unit of col1 over the whole data set.
        ev = tot2[1]*1.0/tot1[1]
        print("Totals: " + str(tot1[1]) + ", " + str(tot2[1]))
        if invertratio: numer, denom = 1, 2
        else: numer, denom = 2, 1
        for row in comb:
            # The small epsilon avoids division by zero.
            row.append(row[numer]*1.0/(row[denom] + 0.000001))
            # Deviation of col2 from its expectation, divided by the square
            # root of that expectation; always based on col2/col1.
            row.append((row[2] - ev * row[1])*1.0/(ev * row[1] + 0.000001) ** 0.5)
        comb = [['Keyword','Column 1','Column 2','Ratio','Significance']] + comb
    else: comb = [['Keyword','Column 1','Column 2']] + comb
    print("Done")
    return comb
|
Loading…
Reference in New Issue