research/stability/spread.py

import copy, re
from sets import Set
# NOTE THAT ALL FUNCTIONS HERE WORK BY RETURNING A NEW ARRAY, THERE ARE NO IN-PLACE MODIFICATION METHODS
# ALSO KEEP IN MIND THAT COLUMNS AND ROW INDICES START FROM ZERO (ie. column A -> 0, E -> 4, Z -> 25, etc)
# Scans a CSV line, keeping track of separators and quotes
def scanline(ln,sp=','):
    """Split one line of CSV text into a list of field strings.

    A double quote toggles a quoted state in which the separator is treated
    as ordinary text; the quote characters themselves are dropped.  A
    backslash copies the character that follows it verbatim, so it can be
    used to escape a quote or a separator.  Line terminators are not
    stripped: the last field keeps any trailing newline present in ln.

    ln -- the raw line
    sp -- the single-character field separator
    """
    fields = []
    current = []        # characters of the field currently being built
    in_quotes = False
    pos = 0
    length = len(ln)
    while pos < length:
        ch = ln[pos]
        if ch == '"':
            in_quotes = not in_quotes
            pos += 1
        elif ch == '\\':
            # A backslash at the very end of the line has nothing to escape;
            # keep it literally instead of indexing past the end.
            current.append(ln[pos + 1] if pos + 1 < length else ch)
            pos += 2
        elif ch == sp and not in_quotes:
            fields.append(''.join(current))
            current = []
            pos += 1
        else:
            current.append(ch)
            pos += 1
    fields.append(''.join(current))
    return fields
# Load a CSV into a 2D array, automatically padding shorter rows with empty strings so that every row has the same width (the array is rectangular)
def load(f,sp=','):
    """Read the CSV file at path f into a list of rows (lists of strings).

    Each line is split with scanline(line, sp).  Rows shorter than the widest
    row are padded on the right with empty strings so that all rows end up
    with the same number of cells.  An empty file yields an empty list.
    """
    # The with-block guarantees the handle is closed even if parsing fails.
    with open(f,'r') as handle:
        array = [scanline(line,sp) for line in handle]
    width = max([len(row) for row in array] or [0])
    for row in array:
        if len(row) < width:
            row += [''] * (width - len(row))
    return array
# Apply a Porter stemmer to every cell in a given range of cols in an array (calling stem with just a list and no cols argument stems _every_ cell)
# Example outputs: manage, management, manager, managing -> manag; pony, ponies -> poni; reincarnate, reincarnated, reincarnation -> reincarn
def stem(li,cols=0):
    """Return a deep copy of li with the cells in columns cols stemmed.

    Each selected cell is converted to a string, stripped of the characters
    ' " + [ ] ? ! and newline, split on single spaces, and every word is
    stripped, lower-cased and passed through porter.PorterStemmer.stem; the
    stemmed words are re-joined with single spaces.

    li   -- 2D list of cells; it is not modified
    cols -- iterable of column indices; the default 0 means every column of
            the first row
    """
    if not li:
        # Nothing to stem; returning here also avoids indexing li[0] below.
        return copy.deepcopy(li)
    if cols == 0: cols = range(len(li[0]))
    import porter
    pstemmer = porter.PorterStemmer()
    newlist = copy.deepcopy(li)
    for i, row in enumerate(li):
        for j in cols:
            text = str(row[j])
            for ch in "'"+'"+[]?!\n': text = text.replace(ch,'')
            stemmed = []
            for word in text.split(' '):
                word = word.strip().lower()
                # The stemmer is given the inclusive start and end indices of the word.
                stemmed.append(pstemmer.stem(word,0,len(word)-1))
            newlist[i][j] = ' '.join(stemmed)
    return newlist
# Is a string a number?
def isnumber(s):
    """Return True if the string s looks like a decimal number.

    Accepted: an optional leading minus sign, digits with optional commas
    (thousands separators), and at most one decimal point.  At least one
    digit is required, so '', '-', '.' and ',' are not numbers.  Integers
    such as '5' count as numbers.
    """
    if not re.match(r'^-?[0-9,]*\.?[0-9,]*$', s):
        return False
    # The pattern alone would also accept strings made only of punctuation.
    return any(ch in '0123456789' for ch in s)
            
# Declutters (removes special characters, numerifies numbers) every cell, rules same as those for stem(li,cols=0)
def declutter(li,cols=0):
    """Return a deep copy of li with the cells in columns cols cleaned up.

    Each selected cell is converted to a string, stripped of the characters
    ' " + [ ] ? ! and newline, split on single spaces, and every word is
    stripped and lower-cased before being re-joined with single spaces.
    Cells that isnumber() recognises are then converted to float, with
    comma thousands separators removed first.

    li   -- 2D list of cells; it is not modified
    cols -- iterable of column indices; the default 0 means every column of
            the first row
    """
    if cols == 0: cols = range(len(li[0])) if li else []
    newlist = copy.deepcopy(li)
    for i, row in enumerate(li):
        for j in cols:
            text = str(row[j])
            for ch in "'"+'"+[]?!\n': text = text.replace(ch,'')
            cleaned = ' '.join([word.strip().lower() for word in text.split(' ')])
            if isnumber(cleaned):
                try:
                    # float() rejects commas, so drop the separators first.
                    cleaned = float(cleaned.replace(',',''))
                except ValueError:
                    # Number-like but not convertible: keep the cleaned text.
                    pass
            newlist[i][j] = cleaned
    return newlist
# Generate a list of individual words occurring in a given column in a given array; useful for generating source lists to do n-grams from
def wordlist(li,col):
    """Return the distinct words found in column col of li, in first-seen order.

    Each cell is split on single spaces, so consecutive spaces produce an
    empty-string word.
    """
    seen = set()      # constant-time membership test; ordered keeps the order
    ordered = []
    for row in li:
        for word in row[col].split(' '):
            if word not in seen:
                seen.add(word)
                ordered.append(word)
    return ordered
# Generates a list of phrases (complete cell entries)
def phraselist(li,col):
    """Return the distinct complete cell values of column col, in first-seen order."""
    unique = []
    for row in li:
        cell = row[col]
        if cell in unique:
            continue
        unique.append(cell)
    return unique
# Retrieve just a few columns from a given array to make a smaller (narrower) array
def cols(li,cols):
    """Build a narrower array holding only the requested columns of li.

    cols lists the column indices to keep, in output order.  A negative
    index does not select a column; it yields the constant 1 in every row.
    """
    return [[row[c] if c >= 0 else 1 for c in cols] for row in li]
# Combine two possibly unsorted arrays matching rows by heading in headingcol1 in li1 and headingcol2 in li2
# setting linclusive = True makes sure every row in li1 makes it into the output, same with rinclusive and li2
# Recommended to do some kind of sort after splice is done
def splice(li1,li2,headingcol1,headingcol2,linclusive=False,rinclusive=False):
    """Join two arrays on a heading column and return the combined rows.

    Both inputs are sorted by their heading column in descending order and
    merged.  Every output row has the form
        [heading] + (li1 row without its heading) + (li2 row without its heading)

    linclusive -- also emit li1 rows with no partner in li2; the li2 part is
                  filled with empty strings
    rinclusive -- also emit li2 rows with no partner in li1; the li1 part is
                  filled with empty strings

    The row widths are taken from the first row of each sorted input.  Empty
    inputs are allowed.  The output follows the descending merge order.
    """
    s1 = sorted(li1,key=lambda row: row[headingcol1],reverse=True)
    s2 = sorted(li2,key=lambda row: row[headingcol2],reverse=True)
    # Padding for a missing side: that side's row width minus the heading cell.
    blanks1 = [''] * (len(s1[0]) - 1 if s1 else 0)
    blanks2 = [''] * (len(s2[0]) - 1 if s2 else 0)

    def left_part(row):
        return row[:headingcol1] + row[headingcol1 + 1:]

    def right_part(row):
        return row[:headingcol2] + row[headingcol2 + 1:]

    ind1 = ind2 = 0
    output = []
    while ind1 < len(s1) and ind2 < len(s2):
        row1, row2 = s1[ind1], s2[ind2]
        head1, head2 = row1[headingcol1], row2[headingcol2]
        if head2 > head1:
            # li2 heading has no partner in li1.
            if rinclusive: output.append([head2] + blanks1 + right_part(row2))
            ind2 += 1
        elif head2 < head1:
            # li1 heading has no partner in li2.
            if linclusive: output.append([head1] + left_part(row1) + blanks2)
            ind1 += 1
        else:
            output.append([head1] + left_part(row1) + right_part(row2))
            ind1, ind2 = ind1 + 1, ind2 + 1
    # At most one input still has rows left; they have no partners.
    if linclusive:
        for row1 in s1[ind1:]:
            output.append([row1[headingcol1]] + left_part(row1) + blanks2)
    if rinclusive:
        for row2 in s2[ind2:]:
            output.append([row2[headingcol2]] + blanks1 + right_part(row2))
    return output
# Creates a wordlist sorted according to function f taken of an array with the results in the addcols in order
# eg. sorted_wordlist with addcols = [2,4,6], row is 1 2 4 8 16 32 64: f receives [word, 4, 16, 64] (x[0] is the word itself), so f=lambda x:x[3]+x[2]+1.01*x[1] returns sorting key 84.04
def sorted_wordlist(li,wcol,addcols,f=lambda x:x[1],rev=True):
    """Return the words of column wcol ordered by key function f.

    f is applied to each row produced by onegrams(li, wcol, addcols), i.e.
    [word, total, ...]; the default sorts on the first total.  rev=True puts
    the largest key first.
    """
    totals = onegrams(li,wcol,addcols)
    totals.sort(key=f,reverse=rev)
    return [entry[0] for entry in totals]
# Utility function, used by twograms, threegrams and fourgrams
def compose(arg):
    """Return the canonical key for a group of words.

    Duplicates are removed and the remaining words are sorted and joined with
    single spaces, so the same words in any order give the same key.
    """
    # The builtin set replaces the deprecated sets.Set with the same result.
    return ' '.join(sorted(set(arg)))
# Calculate a total sum for every desired column for different exact matches in wcol, column -1 is implied to be 1 for every row
# for example, consider the array
# dog        20   3
# dog house  15   28
# cat        25   31
# cat        10   7
# dog        40   0
# house      10   14
# Doing pivot(li,0,[1,-1]) gives you the list:
# dog        60   2
# dog house  15   1
# cat        35   2
# house      10   1
# wlist allows you to restrict the table to a given wordlist
def pivot(li, wcol, addcols,wlist=0,sortkey=lambda x:1):
    """Sum the columns addcols for every distinct exact value of column wcol.

    li      -- 2D list of cells
    wcol    -- index of the column holding the phrase to group by
    addcols -- column indices to total; a negative index counts 1 per row
    wlist   -- phrases to report; the default 0 means every phrase in wcol
    sortkey -- key function applied to the output rows (largest first)

    Numeric cells may contain commas and spaces, which are ignored; an empty
    cell counts as 0 and a cell ending in '%' is read as a percentage (so
    '50%' adds 0.5).  Returns rows of the form [phrase, total, ...].
    """
    if wlist == 0: wlist = phraselist(li,wcol)
    result = {}
    for phrase in wlist:
        result[phrase] = [0] * len(addcols)
    for row in li:
        nums = []
        for ac in addcols:
            if ac < 0:
                nums.append(1)
                continue
            text = str(row[ac]).replace(',','').replace(' ','')
            if text == '':
                nums.append(0)
            elif text[-1] == '%':
                # Convert to a number first, then scale.
                nums.append(float(text[:-1]) * 0.01)
            else:
                nums.append(float(text))
        phrase = row[wcol]
        if phrase in result:
            result[phrase] = [a + b for a, b in zip(result[phrase],nums)]
    array = [[phrase] + totals for phrase, totals in result.items()]
    return sorted(array,key=sortkey,reverse=True)
# Similar to a pivot table but looks at individual keywords. The example list above will return with onegrams(li,0,[1,-1]):
# dog        75   3
# cat        35   2
# house      25   2
def onegrams(li, wcol, addcols,wlist=0,sortkey=lambda x: 1):
    """Sum the columns addcols for every individual word in column wcol.

    li      -- 2D list of cells
    wcol    -- index of the column holding space-separated keywords
    addcols -- column indices to total; a negative index counts 1 per row
    wlist   -- words to report; the default 0 means every word in wcol
    sortkey -- key function applied to the output rows (largest first)

    A row contributes its numbers once per occurrence of a word, so a word
    repeated within one cell is counted repeatedly.  Numeric cells may
    contain commas and spaces, which are ignored; an empty cell counts as 0
    and a cell ending in '%' is read as a percentage.  Returns rows of the
    form [word, total, ...].
    """
    if wlist == 0: wlist = wordlist(li,wcol)
    result = {}
    for word in wlist:
        result[word] = [0] * len(addcols)
    for row in li:
        words = [x.strip() for x in row[wcol].split(' ')]
        nums = []
        for ac in addcols:
            if ac < 0:
                nums.append(1)
                continue
            text = str(row[ac]).replace(',','').replace(' ','')
            if text == '':
                nums.append(0)
            elif text[-1] == '%':
                # Convert to a number first, then scale.
                nums.append(float(text[:-1]) * 0.01)
            else:
                nums.append(float(text))
        for word in words:
            if word in result:
                result[word] = [a + b for a, b in zip(result[word],nums)]
    array = [[word] + totals for word, totals in result.items()]
    return sorted(array,key=sortkey,reverse=True)
# Calculate a total sum for every column in addcols and for every word pair in wcol
# words do not need to be beside each other or in any particular order, so "buy a dog house", "good house for dog owners", "dog in my house" all go under "dog house"
def twograms(li,wcol,addcols,wlist=0,sortkey=lambda x:1,allindices=False):
    """Sum the columns addcols for every pair of words sharing a cell of wcol.

    The two words need not be adjacent or in any particular order; each pair
    is keyed by compose(), which sorts and de-duplicates the words.

    li         -- 2D list of cells
    wcol       -- index of the column holding space-separated keywords
    addcols    -- column indices to total; a negative index counts 1 per row
    wlist      -- words allowed in a pair; the default 0 means every word
    sortkey    -- key function applied to the output rows (largest first)
    allindices -- if true, report every pair that can be formed from wlist,
                  including pairs that never occur (totals of 0)

    Numeric cells may contain commas and spaces, which are ignored; an empty
    cell counts as 0 and a cell ending in '%' is read as a percentage.
    Progress is printed roughly every tenth of the input when li has at
    least ten rows.  Returns rows of the form [key, total, ...].
    """
    from itertools import combinations
    if wlist == 0: wlist = wordlist(li,wcol)
    allowed = set(wlist)    # constant-time membership test inside the loops
    result = {}
    if allindices:
        for pair in combinations(wlist,2):
            result[compose(pair)] = [0] * len(addcols)
    # step is 0 for fewer than ten rows; progress output is then skipped
    # rather than taking a modulo by zero.
    step = len(li) // 10
    for rownum, row in enumerate(li):
        if step and rownum % step == step - 1:
            # Single-argument print() behaves the same on Python 2 and 3.
            print("Two grams: " + str(rownum) + " / " + str(len(li)))
        words = [x.strip() for x in row[wcol].split(' ')]
        nums = []
        for ac in addcols:
            if ac < 0:
                nums.append(1)
                continue
            text = str(row[ac]).replace(',','').replace(' ','')
            if text == '':
                nums.append(0)
            elif text[-1] == '%':
                nums.append(float(text[:-1]) * 0.01)
            else:
                nums.append(float(text))
        candidates = [w for w in words if w in allowed]
        # combinations() yields each position pair once, in cell order.
        for pair in combinations(candidates,2):
            comb = compose(pair)
            if comb in result:
                result[comb] = [a + b for a, b in zip(result[comb],nums)]
            elif not allindices:
                result[comb] = list(nums)
    array = [[key] + totals for key, totals in result.items()]
    return sorted(array,key=sortkey,reverse=True)
# Calculate a total sum for every column in addcols and for every word triplet in wcol (do not need to be beside each other or in any particular order)
# setting allindices to True slows down the calculation a lot but gives you a CSV with all possible combinations of words, making it convenient for
# working with the same word list on different data
def threegrams(li,wcol,addcols,wlist=0,sortkey=lambda x:1,allindices=False):
    """Sum the columns addcols for every word triplet sharing a cell of wcol.

    The three words need not be adjacent or in any particular order; each
    triplet is keyed by compose(), which sorts and de-duplicates the words.

    li         -- 2D list of cells
    wcol       -- index of the column holding space-separated keywords
    addcols    -- column indices to total; a negative index counts 1 per row
    wlist      -- words allowed in a triplet; the default 0 means every word
    sortkey    -- key function applied to the output rows (largest first)
    allindices -- if true, report every triplet that can be formed from
                  wlist, including triplets that never occur (totals of 0);
                  this is slow for long word lists

    Numeric cells may contain commas and spaces, which are ignored; an empty
    cell counts as 0 and a cell ending in '%' is read as a percentage.
    Progress is printed roughly every tenth of the input when li has at
    least ten rows.  Returns rows of the form [key, total, ...].
    """
    from itertools import combinations
    if wlist == 0: wlist = wordlist(li,wcol)
    allowed = set(wlist)    # constant-time membership test inside the loops
    result = {}
    if allindices:
        for triple in combinations(wlist,3):
            result[compose(triple)] = [0] * len(addcols)
    # step is 0 for fewer than ten rows; progress output is then skipped
    # rather than taking a modulo by zero.
    step = len(li) // 10
    for rownum, row in enumerate(li):
        if step and rownum % step == step - 1:
            # Single-argument print() behaves the same on Python 2 and 3.
            print("Three grams: " + str(rownum) + " / " + str(len(li)))
        words = [x.strip() for x in row[wcol].split(' ')]
        nums = []
        for ac in addcols:
            if ac < 0:
                nums.append(1)
                continue
            text = str(row[ac]).replace(',','').replace(' ','')
            if text == '':
                nums.append(0)
            elif text[-1] == '%':
                nums.append(float(text[:-1]) * 0.01)
            else:
                nums.append(float(text))
        candidates = [w for w in words if w in allowed]
        # combinations() yields each position triple once, in cell order.
        for triple in combinations(candidates,3):
            comb = compose(triple)
            if comb in result:
                result[comb] = [a + b for a, b in zip(result[comb],nums)]
            elif not allindices:
                result[comb] = list(nums)
    array = [[key] + totals for key, totals in result.items()]
    return sorted(array,key=sortkey,reverse=True)
# Calculate a total sum for every column in addcols and for every word quadruplet in wcol
def fourgrams(li,wcol,addcols,wlist=0,sortkey=lambda x:1):
    """Sum the columns addcols for every word quadruplet sharing a cell of wcol.

    The four words need not be adjacent or in any particular order; each
    quadruplet is keyed by compose(), which sorts and de-duplicates the
    words.  All four words must be in wlist.

    li      -- 2D list of cells
    wcol    -- index of the column holding space-separated keywords
    addcols -- column indices to total; a negative index counts 1 per row
    wlist   -- words allowed in a quadruplet; the default 0 means every word
    sortkey -- key function applied to the output rows (largest first)

    Numeric cells may contain commas and spaces, which are ignored; an empty
    cell counts as 0 and a cell ending in '%' is read as a percentage.
    Progress is printed roughly every tenth of the input when li has at
    least ten rows.  Returns rows of the form [key, total, ...].
    """
    from itertools import combinations
    if wlist == 0: wlist = wordlist(li,wcol)
    allowed = set(wlist)    # constant-time membership test inside the loops
    result = {}
    # step is 0 for fewer than ten rows; progress output is then skipped
    # rather than taking a modulo by zero.
    step = len(li) // 10
    for rownum, row in enumerate(li):
        if step and rownum % step == step - 1:
            # Single-argument print() behaves the same on Python 2 and 3.
            print("Four grams: " + str(rownum) + " / " + str(len(li)))
        words = [x.strip() for x in row[wcol].split(' ')]
        nums = []
        for ac in addcols:
            if ac < 0:
                nums.append(1)
                continue
            text = str(row[ac]).replace(',','').replace(' ','')
            if text == '':
                nums.append(0)
            elif text[-1] == '%':
                nums.append(float(text[:-1]) * 0.01)
            else:
                nums.append(float(text))
        # Filtering up front checks every member of the quadruplet against
        # the word list, including the third word.
        candidates = [w for w in words if w in allowed]
        for quad in combinations(candidates,4):
            comb = compose(quad)
            if comb in result:
                result[comb] = [a + b for a, b in zip(result[comb],nums)]
            else:
                result[comb] = list(nums)
    array = [[key] + totals for key, totals in result.items()]
    return sorted(array,key=sortkey,reverse=True)
# Filters array, returning only the rows where column wcol of that row contains the query keywords (keywords can appear in any order)
# This and the other filters are useful for taking a list of entries and creating a list of only valid entries according to some validity characteristic
# eg:
# dog house, 15
# cat, 18
# dog, 33
# filter(li,0,'dog'):
# dog house, 15
# dog, 33
def filter(li,wcol,query):
    """Return the rows of li whose wcol cell contains every word of query.

    query is split on single spaces and the words may appear in the cell in
    any order.  An empty query keeps every row; the query '*' keeps the rows
    whose wcol cell is non-empty.
    """
    terms = query.split(' ')
    if terms == ['']:
        terms = []
    wildcard = terms == ['*']
    kept = []
    for row in li:
        tokens = [piece.strip() for piece in row[wcol].split(' ')]
        if wildcard:
            matches = len(row[wcol]) > 0
        else:
            matches = all(term in tokens for term in terms)
        if matches:
            kept.append(row)
    return kept
# Filters array, requiring column wcol to exactly match query
def phrasefilter(li,wcol,query):
    """Return the rows of li whose wcol cell is exactly equal to query."""
    return [row for row in li if row[wcol] == query]
# Filters array, requiring function func taken of the row to return True (or 1)
def funcfilter(li,func):
    """Return the rows of li for which func(row) is truthy."""
    return [row for row in li if func(row)]
# Adds up columns in addcols for the rows that filter(li,wcol,query) would keep; can also be thought of as doing a single n-keyword match
# eg:
# dog, 25
# cat, 15
# dog, 75
# dog, 10
# horse, 55
# cat, 7
# search(li,0,[1],'dog') gives ['dog',110]
def search(li,wcol,addcols,query):
    """Total the columns addcols over the rows whose wcol cell matches query.

    A row matches when its cell contains every space-separated word of query
    (in any order).  An empty query matches every row; the query '*' matches
    the rows whose wcol cell is non-empty.

    addcols -- column indices to total; a negative index counts 1 per row

    Numeric cells may contain commas and spaces, which are ignored; an empty
    cell counts as 0 and a cell ending in '%' is read as a percentage.
    Returns [query, total, ...].
    """
    queryarray = query.split(' ')
    if queryarray == ['']: queryarray = []
    wildcard = queryarray == ['*']
    result = [0] * len(addcols)
    for row in li:
        words = [x.strip() for x in row[wcol].split(' ')]
        nums = []
        for ac in addcols:
            if ac < 0:
                nums.append(1)
                continue
            text = str(row[ac]).replace(',','').replace(' ','')
            if text == '':
                nums.append(0)
            elif text[-1] == '%':
                # Convert to a number first, then scale.
                nums.append(float(text[:-1]) * 0.01)
            else:
                nums.append(float(text))
        if wildcard:
            inlist = len(row[wcol]) > 0
        else:
            inlist = all(w in words for w in queryarray)
        if inlist:
            result = [a + b for a, b in zip(result,nums)]
    return [query] + result
# Render an array as CSV text and return it as a string (nothing is printed; the last line has no trailing newline)
def tochars(array,sp=','):
    """Render array as delimited text, one line per row, and return the string.

    Cells are converted with str() and joined with sp; rows are separated by
    newlines with none after the last row.  Cells are written as-is, without
    quoting or escaping.
    """
    rendered = []
    for line in array:
        rendered.append(sp.join(map(str,line)))
    return '\n'.join(rendered)
# Save an array to CSV
def save(f,array,sp=','):
    """Write array to the file at path f as delimited text (see tochars).

    An existing file at f is overwritten.
    """
    # Serialize before opening, so a failure while rendering the array does
    # not leave an existing file truncated.
    text = tochars(array,sp)
    # The with-block closes the handle even if the write fails.
    with open(f,'w') as writeto:
        writeto.write(text)
# Compares keywords by two different parameters from two different lists. For example, li1 can be a list of how much money is spent (on addcol1) on a particular combination of keywords (on keycol1) and li2 can be a list of upgraded accounts with the search query they came from on keycol2, and addcol 2 can be left blank to default to -1 (each row is worth one point). Fourth column is statistical significance.
# Remember that you may have to filter the list yourself first
# Arguments:
# grams = 1 for single keywords, 2 for pairs, 3 for triplets and 4 for quadruplets
# li1, li2 = your two lists
# keycol1, keycol2 = where the keywords are located in those two lists
# addcol1, addcol2 = the columns of what you want to add up, eg. cost (set to -1 or leave blank to make it add 1 for each row)
# sortkey = function to sort results by (highest first)
# usestem = stem keywords
# sigtable = add ratio and significance to table
# invertratio = set ratio column to col1/col2 instead of col2/col1
# preformatted = li1 and li2 are already properly formatted
# justpreformat = convert li1 and li2 into twocolumns for comparison but don't go all the way
# wordlimit = limit search to some more common keywords for speedup purposes
# Example: list of customers, some upgraded, with originating keywords, and a list of how much you're paying for each search phrase
#
# customers.csv:
# Name, Keyword, Status
# Bob Jones, spreadsheet csv software, upgraded
# Matt Bones, csv python utils, free
# Army Drones, free spreadsheet, free
# Glenn Mitt, csv software, upgraded
# Pat Submitt, python utils software, upgraded
# Shawn Wit, python spreadsheet program, upgraded
#
# costs.csv:
# csv software, useless, and, irrelevant, data, 5.00, blah, blah
# python spreadsheet, useless, and, irrelevant, data, 2.50, blah, blah
# spreadsheet utils, useless, and, irrelevant, data, 10.00, blah, blah
# csv utils, useless, and, irrelevant, data, 1.50, blah, blah
# 
# Steps:
# 1. import spread (if not imported already)
# 2. upgrades = spread.filter(spread.load('customers.csv'),2,'upgraded')
# 3. costs = spread.load('costs.csv')
# 4. res = compare(1,costs,upgrades,0,1,5,invertratio=True)
# 5. spread.save('saved.csv',res)
#
# Res should look like:
#
# Keyword, Column 1, Column 2, Ratio, Significance
# spreadsheet, 12.50, 2, 6.25, -0.389
# utils, 11.50, 1, 11.50, -0.913
# csv, 7.50, 2, 3.75, 0.335
# python, 2.50, 2, 1.25, 2.031
#
# Or, if desired, you can:
# i1,i2 = compare(1,costs,upgrades,0,1,5,justpreformat=True)
# res1 = compare(1,i1,i2,0,1,5,invertratio=True,preformatted=True)
# res2 = compare(2,i1,i2,0,1,5,invertratio=True,preformatted=True)
# res3 = compare(3,i1,i2,0,1,5,invertratio=True,preformatted=True)
# res4 = compare(4,i1,i2,0,1,5,invertratio=True,preformatted=True)
#
# Note that significance is calculated based on col2/col1 regardless of invertratio, since getting 0 upgrades when you should have gotten 2 is not that unlikely, but calculating significance based on col1/col2 would give you infinity as infinity is infinitely far away from 0.5.
def compare(grams,li1,li2,keycol1,keycol2,addcol1=-1,addcol2=-1,sortkey=lambda x:x[1],usestem=True,sigtable=True,invertratio=False,preformatted=False,justpreformat=False,wordlimit=0):
    """Compare keyword totals from two lists and return a combined table.

    grams            -- 1 to 4: single keywords, pairs, triplets, quadruplets
    li1, li2         -- the two input arrays
    keycol1, keycol2 -- index of the keyword column in each array
    addcol1, addcol2 -- index of the column to total in each array; -1 counts
                        1 per row
    sortkey          -- key function for the combined rows (largest first)
    usestem          -- stem the keywords instead of only decluttering them
    sigtable         -- append 'Ratio' and 'Significance' columns
    invertratio      -- make the ratio column 1 / column 2 instead of
                        column 2 / column 1
    preformatted     -- li1 and li2 are already two-column [keyword, number]
                        arrays, so the declutter/stem step is skipped
    justpreformat    -- return the two preformatted arrays (s1, s2) and stop
    wordlimit        -- if positive, keep only that many top-ranked words

    Leading rows whose number cell is still a string (for example header
    rows) are skipped; the input lists themselves are not modified.  Only
    keywords present in both lists appear in the result.  Significance is
    always based on column 2 / column 1; if the column 1 grand total is 0 the
    expected ratio is taken as 0.  Progress messages are printed.

    Returns a list of rows whose first row is the header.
    """
    if not preformatted:
        s1 = declutter(cols(li1,[keycol1,addcol1]),[1])
        print("Done decluttering/stemming: 1/4")
        s2 = declutter(cols(li2,[keycol2,addcol2]),[1])
        print("Done decluttering/stemming: 2/4")
        s1 = stem(s1,[0]) if usestem else declutter(s1,[0])
        print("Done decluttering/stemming: 3/4")
        s2 = stem(s2,[0]) if usestem else declutter(s2,[0])
        print("Done decluttering/stemming: 4/4")
    else: s1,s2 = li1,li2
    print("Printing sample of list 1")
    print(s1[:10])
    print("Printing sample of list 2")
    print(s2[:10])
    if justpreformat: return s1,s2

    def without_leading_text_rows(rows):
        # Slice rather than pop so the caller's list is left untouched, and
        # stop at the end of the list if every row is text.
        start = 0
        while start < len(rows) and isinstance(rows[start][1],str):
            start += 1
        return rows[start:]

    s1 = without_leading_text_rows(s1)
    s2 = without_leading_text_rows(s2)
    print("Cleaned invalid rows")
    gramfuncs = [0,onegrams,twograms,threegrams,fourgrams]
    # The blank "word" comes from empty cells and doubled spaces.
    wl = [word for word in sorted_wordlist(s1,0,[1]) if word != '']
    print("Base wordlist length: " + str(len(wl)) + " ; Top ten: " + str(wl[:10]))
    if wordlimit > 0 and wordlimit < len(wl):
        print("Shortening to " + str(wordlimit))
        wl = wl[:wordlimit]
    res1 = gramfuncs[grams](s1,0,[1],wl)
    print("Done search: 1/2")
    res2 = gramfuncs[grams](s2,0,[1],wl)
    print("Done search: 2/2")
    # With nothing on one side there is nothing to join.
    if res1 and res2:
        comb = sorted(splice(res1,res2,0,0),key=sortkey,reverse=True)
    else:
        comb = []
    if sigtable:
        tot1 = search(s1,0,[1],'')
        tot2 = search(s2,0,[1],'')
        # Expected column 2 amount per unit of column 1 across the whole data.
        ev = tot2[1]*1.0/tot1[1] if tot1[1] else 0.0
        print("Totals: " + str(tot1[1]) + ", " + str(tot2[1]))
        if invertratio: numerator, denominator = 1, 2
        else: numerator, denominator = 2, 1
        for entry in comb:
            # The small constant keeps both divisions finite for zero totals.
            entry.append(entry[numerator]*1.0/(entry[denominator] + 0.000001))
            entry.append((entry[2] - ev * entry[1])*1.0/(ev * entry[1] + 0.000001) ** 0.5)
        comb = [['Keyword','Column 1','Column 2','Ratio','Significance']] + comb
    else: comb = [['Keyword','Column 1','Column 2']] + comb
    print("Done")
    return comb
Added estimators 2014-11-09 18:18:48 +00:00			`import copy, re`
			`from sets import Set`
			`# NOTE THAT ALL FUNCTIONS HERE WORK BY RETURNING A NEW ARRAY, THERE ARE NO IN-PLACE MODIFICATION METHODS`
			`# ALSO KEEP IN MIND THAT COLUMNS AND ROW INDICES START FROM ZERO (ie. column A -> 0, E -> 4, Z -> 25, etc)`
			`# Scans a CSV line, keeping track of separators and quotes`
			`def scanline(ln,sp=','):`
			`arr = []`
			`inq = False`
			`ind = 0`
			`buff = ""`
			`while ind < len(ln):`
			`if ln[ind] == '"':`
			`inq = not inq`
			`ind += 1`
			`elif ln[ind] == '\\':`
			`buff += ln[ind+1]`
			`ind += 2`
			`elif ln[ind] == sp and not inq:`
			`arr.append(buff)`
			`buff = ""`
			`ind += 1`
			`else:`
			`buff += ln[ind]`
			`ind += 1`
			`arr.append(buff)`
			`return arr`
			`# Load a CSV into a 2D array, automatically filling in unevenly wide rows to make the array square`
			`def load(f,sp=','):`
			`array = [scanline(x,sp) for x in open(f,'r').readlines()]`
			`maxlen = 0`
			`for i in range(len(array)):`
			`if len(array[i]) > maxlen: maxlen = len(array[i])`
			`for i in range(len(array)):`
			`if len(array[i]) < maxlen: array[i] += [''] * (maxlen - len(array[i]))`
			`return array`
			`# Apply a Porter stemmer to every cell in a given range of cols in an array (calling stem with just a list and no cols argument stems _every_ cell)`
			`# Example outputs: manage, management, manager, managing -> manag; pony, ponies -> poni; reincarnate, reincarnated, reincarnation -> reincarn`
			`def stem(li,cols=0):`
			`if cols == 0: cols = range(len(li[0]))`
			`import porter`
			`pstemmer = porter.PorterStemmer()`
			`newlist = copy.deepcopy(li)`
			`for i in range(len(li)):`
			`for j in cols:`
			`string = str(li[i][j])`
			`for ch in "'"+'"+[]?!\n': string = string.replace(ch,'')`
			`words = string.split(' ')`
			`newlist[i][j] = ' '.join([pstemmer.stem(x.strip().lower(),0,len(x.strip())-1) for x in words])`
			`return newlist`
			`# Is a string a number?`
			`def isnumber(s):`
			`t = re.findall('^-?[0-9,]\.[0-9,]$',s)`
			`return len(t) > 0`

			`# Declutters (removes special characters, numerifies numbers) every cell, rules same as those for stem(li,cols=0)`
			`def declutter(li,cols=0):`
			`if cols == 0: cols = range(len(li[0]))`
			`newlist = copy.deepcopy(li)`
			`for i in range(len(li)):`
			`for j in cols:`
			`string = str(li[i][j])`
			`for ch in "'"+'"+[]?!\n': string = string.replace(ch,'')`
			`words = string.split(' ')`
			`newlist[i][j] = ' '.join([x.strip().lower() for x in words])`
			`if isnumber(newlist[i][j]):`
			`newlist[i][j] = float(newlist[i][j])`
			`return newlist`
			`# Generate a list of individual words occurring in a given column in a given array; useful for generating source lists to do n-grams from`
			`def wordlist(li,col):`
			`wlist = []`
			`for i in range(len(li)):`
			`words = li[i][col].split(' ')`
			`for w in words:`
			`if w not in wlist: wlist.append(w)`
			`return wlist`
			`# Generates a list of phrases (complete cell entries)`
			`def phraselist(li,col):`
			`wlist = []`
			`for i in range(len(li)):`
			`phrase= li[i][col]`
			`if phrase not in wlist: wlist.append(phrase)`
			`return wlist`
			`# Retrieve just a few columns from a given array to make a smaller (narrower) array`
			`def cols(li,cols):`
			`result = []`
			`for i in range(len(li)):`
			`newline = []`
			`for c in cols:`
			`if c >= 0: newline.append(li[i][c])`
			`else: newline.append(1)`
			`result.append(newline)`
			`return result`
			`# Combine two possibly unsorted arrays matching rows by heading in headingcol1 in li1 and headingcol2 in li2`
			`# setting linclusive = True makes sure every row in li1 makes it into the output, same with rinclusive and li2`
			`# Recommended to do some kind of sort after splice is done`
			`def splice(li1,li2,headingcol1,headingcol2,linclusive=False,rinclusive=False):`
			`s1 = sorted(li1,key=lambda x:x[headingcol1],reverse=True)`
			`s2 = sorted(li2,key=lambda x:x[headingcol2],reverse=True)`
			`l1 = len(s1[0])`
			`l2 = len(s2[0])`
			`ind1 = 0`
			`ind2 = 0`
			`output = []`
			`while ind1 < len(s1) and ind2 < len(s2):`
			`if cmp(s2[ind2][headingcol2],s1[ind1][headingcol1]) == 1:`
			`if rinclusive: output.append([s2[ind2][headingcol2]] + [''] * (l1-1) + s2[ind2][:headingcol2] + s2[ind2][headingcol2 + 1:])`
			`ind2 += 1`
			`elif cmp(s2[ind2][headingcol2],s1[ind1][headingcol1]) == -1:`
			`if linclusive: output.append([s1[ind1][headingcol1]] + s1[ind1][:headingcol1] + s1[ind1][headingcol1 + 1:] + [''] * (l2-1))`
			`ind1 += 1`
			`else:`
			`output.append([s1[ind1][headingcol1]] + s1[ind1][:headingcol1] + s1[ind1][headingcol1 + 1:] + s2[ind2][:headingcol2] + s2[ind2][headingcol2 + 1:])`
			`ind1, ind2 = ind1 + 1, ind2 + 1`
			`while ind1 < len(s1) and linclusive:`
			`output.append([s1[ind1][headingcol1]] + s1[ind1][:headingcol1] + s1[ind1][headingcol1 + 1:] + [''] * l2)`
			`ind1 += 1`
			`while ind2 < len(s2) and rinclusive:`
			`output.append([s2[ind2][headingcol2]] + [''] * l1 + s2[ind2][:headingcol2] + s2[ind2][headingcol2 + 1:])`
			`ind2 += 1`
			`return output`
			`# Creates a wordlist sorted according to function f taken of an array with the results in the addcols in order`
			`# eg. sorted_wordlist with addcols = [2,4,6], row is 1 2 4 8 16 32 64, f=lambda x:x[2]+x[1]+1.01*x[0] returns sorting key 84.04`
			`def sorted_wordlist(li,wcol,addcols,f=lambda x:x[1],rev=True):`
			`return [x[0] for x in sorted(onegrams(li,wcol,addcols),key=f,reverse=rev)]`
			`# Utility function, used by twograms, threegrams and fourgrams`
			`def compose(arg):`
			`return ' '.join(sorted(list(Set(arg))))`
			`# Calculate a total sum for every desired column for different exact matches in wcol, column -1 is implied to be 1 for every row`
			`# for example, consider the array`
			`# dog 20 3`
			`# dog house 15 28`
			`# cat 25 31`
			`# cat 10 7`
			`# dog 40 0`
			`# house 10 14`
			`# Doing pivot(li,0,[1,-1]) gives you the list:`
			`# dog 60 2`
			`# dog house 15 1`
			`# cat 35 2`
			`# house 10 1`
			`# wlist allows you to restrict the table to a given wordlist`
			`def pivot(li, wcol, addcols,wlist=0,sortkey=lambda x:1):`
			`if wlist == 0: wlist = phraselist(li,wcol)`
			`result = {}`
			`for i in range(len(wlist)):`
			`result[wlist[i]] = [0] * len(addcols)`
			`for i in range(len(li)):`
			`nums = []`
			`for ac in addcols:`
			`if ac >= 0:`
			`num = str(li[i][ac]).replace(',','').replace(' ','')`
			`if num == '': num = 0`
			`elif num[-1] == '%': num = float(num[:-1] * 0.01)`
			`else: num = float(num)`
			`else: num = 1`
			`nums.append(num)`
			`if li[i][wcol] in result: result[li[i][wcol]] = [pair[0] + pair[1] for pair in zip(result[li[i][wcol]],nums)]`
			`array = []`
			`for word in result.keys():`
			`array.append([word] + result[word])`
			`return sorted(array,key=sortkey,reverse=True)`
			`# Similar to a pivot table but looks at individual keywords. The example list above will return with onegrams(li,0,[1,2]):`
			`# dog 75 3`
			`# cat 35 2`
			`# house 25 2`
			`def onegrams(li, wcol, addcols,wlist=0,sortkey=lambda x: 1):`
			`if wlist == 0: wlist = wordlist(li,wcol)`
			`result = {}`
			`for i in range(len(wlist)):`
			`result[wlist[i]] = [0] * len(addcols)`
			`for i in range(len(li)):`
			`words = [x.strip() for x in li[i][wcol].split(' ')]`
			`nums = []`
			`for ac in addcols:`
			`if ac >= 0:`
			`num = str(li[i][ac]).replace(',','').replace(' ','')`
			`if num == '': num = 0`
			`elif num[-1] == '%': num = float(num[:-1] * 0.01)`
			`else: num = float(num)`
			`else: num = 1`
			`nums.append(num)`
			`for i in range(len(words)):`
			`if words[i] in result: result[words[i]] = [pair[0] + pair[1] for pair in zip(result[words[i]],nums)]`
			`array = []`
			`for word in result.keys():`
			`array.append([word] + result[word])`
			`return sorted(array,key=sortkey,reverse=True)`
			`# Calculate a total sum for every column in addcols and for every word pair in wcol`
			`# words do not need to be beside each other or in any particular order, so "buy a dog house", "good house for dog owners", "dog in my house" all go under "dog house"`
			`def twograms(li,wcol,addcols,wlist=0,sortkey=lambda x:1,allindices=False):`
			`if wlist == 0: wlist = wordlist(li,wcol)`
			`result = {}`
			`if allindices:`
			`for i in range(len(wlist)):`
			`for j in range(len(wlist)):`
			`if i != j: result[compose([wlist[i],wlist[j]])] = [0] * len(addcols)`
			`for i in range(len(li)):`
			`if i % int(len(li)/10) == (int(len(li)/10) - 1): print "Two grams: " + str(i) + " / " + str(len(li))`
			`words = [x.strip() for x in li[i][wcol].split(' ')]`
			`nums = []`
			`for ac in addcols:`
			`if ac >= 0:`
			`num = str(li[i][ac]).replace(',','').replace(' ','')`
			`if num == '': num = 0`
			`elif num[-1] == '%': num = float(num[:-1]) * 0.01`
			`else: num = float(num)`
			`else: num = 1`
			`nums.append(num)`
			`for i in range(len(words)):`
			`if words[i] in wlist:`
			`for j in range(i+1,len(words)):`
			`if words[j] in wlist:`
			`comb = compose([words[i],words[j]])`
			`if comb in result: result[comb] = [pair[0] + pair[1] for pair in zip(result[comb],nums)]`
			`elif allindices == False: result[comb] = nums`
			`array = []`
			`for words in result.keys():`
			`array.append([words] + result[words])`
			`return sorted(array,key=sortkey,reverse=True)`
			`# Calculate a total sum for every column in addcols and for every word triplet in wcol (do not need to be beside each other or in any particular order)`
			`# setting allindices to True slows down the calculation a lot but gives you a CSV with all possible combinations of words, making it convenient for`
			`# working with the same word list on different data`
			`def threegrams(li,wcol,addcols,wlist=0,sortkey=lambda x:1,allindices=False):`
			`if wlist == 0: wlist = wordlist(li,wcol)`
			`result = {}`
			`if allindices:`
			`for i in range(len(wlist)):`
			`for j in range(len(wlist)):`
			`for k in range(len(wlist)):`
			`if i != j and i != k and j != k: result[compose([wlist[i],wlist[j],wlist[k]])] = [0] * len(addcols)`
			`for i in range(len(li)):`
			`if i % int(len(li)/10) == (int(len(li)/10) - 1): print "Three grams: " + str(i) + " / " + str(len(li))`
			`words = [x.strip() for x in li[i][wcol].split(' ')]`
			`nums = []`
			`for ac in addcols:`
			`if ac >= 0:`
			`num = str(li[i][ac]).replace(',','').replace(' ','')`
			`if num == '': num = 0`
			`elif num[-1] == '%': num = float(num[:-1]) * 0.01`
			`else: num = float(num)`
			`else: num = 1`
			`nums.append(num)`
			`for i in range(len(words)):`
			`if words[i] in wlist:`
			`for j in range(i+1,len(words)):`
			`if words[j] in wlist:`
			`for k in range(j+1,len(words)):`
			`if words[k] in wlist:`
			`comb = compose([words[i],words[j],words[k]])`
			`if comb in result:`
			`result[comb] = [pair[0] + pair[1] for pair in zip(result[comb],nums)]`
			`elif allindices == False: result[comb] = nums`
			`array = []`
			`for words in result.keys():`
			`array.append([words] + result[words])`
			`return sorted(array,key=sortkey,reverse=True)`
			# Calculate a total sum for every column in addcols and for every word quadruplet in wcol
			# Arguments:
			# li = the 2D array to scan (it is not modified)
			# wcol = index of the column holding the space-separated words
			# addcols = indices of the columns to total; a negative index adds 1 per row instead of reading a cell
			# wlist = words allowed to take part in a quadruplet (0 or omitted = wordlist(li,wcol))
			# sortkey = function to sort the result rows by (highest first)
			# Returns a new list of rows [compose(quadruplet), total for addcols[0], total for addcols[1], ...]
			# Cells are turned into numbers after removing commas and spaces; '' counts as 0 and a trailing '%' multiplies the value by 0.01
			def fourgrams(li,wcol,addcols,wlist=0,sortkey=lambda x:1):
				from itertools import combinations
				if wlist == 0: wlist = wordlist(li,wcol)
				allowed = set(wlist)
				# Progress is reported about every tenth of the input; step is 0 for fewer than 10 rows and must not be used as a modulus
				step = len(li) // 10
				result = {}
				for rownum, row in enumerate(li):
					if step > 0 and rownum % step == step - 1:
						print("Four grams: " + str(rownum) + " / " + str(len(li)))
					words = [x.strip() for x in row[wcol].split(' ')]
					nums = []
					for ac in addcols:
						if ac >= 0:
							num = str(row[ac]).replace(',','').replace(' ','')
							if num == '': num = 0
							elif num[-1] == '%': num = float(num[:-1]) * 0.01
							else: num = float(num)
						else: num = 1
						nums.append(num)
					# Every one of the four words has to be in wlist; combinations() visits the survivors in the same
					# i < j < k < l position order as four nested index loops would
					candidates = [w for w in words if w in allowed]
					for quad in combinations(candidates, 4):
						comb = compose(list(quad))
						if comb in result:
							result[comb] = [pair[0] + pair[1] for pair in zip(result[comb],nums)]
						else: result[comb] = nums
				array = [[key] + result[key] for key in result]
				return sorted(array,key=sortkey,reverse=True)
			# Keeps the rows of li whose cell in column wcol contains every keyword of query (in any order) and returns them in a new list
			# query is split on single spaces; each keyword has to equal one of the cell's space-separated words
			# An empty query keeps every row, and the query '*' keeps the rows whose cell is not empty
			# This and the other filters are handy for cutting a list of entries down to the valid ones by some validity characteristic
			# eg:
			# dog house, 15
			# cat, 18
			# dog, 33
			# filter(li,0,'dog'):
			# dog house, 15
			# dog, 33
			def filter(li,wcol,query):
				terms = query.split(' ')
				if terms == ['']:
					terms = []
				wildcard = terms == ['*']
				kept = []
				for row in li:
					cell = row[wcol]
					tokens = [piece.strip() for piece in cell.split(' ')]
					if wildcard:
						matches = len(cell) > 0
					else:
						matches = all(term in tokens for term in terms)
					if matches:
						kept.append(row)
				return kept
			# Keeps only the rows whose cell in column wcol is exactly equal to query; the rows are returned in a new list
			def phrasefilter(li,wcol,query):
				return [row for row in li if row[wcol] == query]
			# Keeps only the rows for which func(row) is true (True, 1 or any other truthy value); the rows are returned in a new list
			def funcfilter(li,func):
				return [row for row in li if func(row)]
			# Adds up columns in addcols over the rows selected by filter(li,wcol,query); can also be thought of as doing a single n-keyword match
			# Arguments:
			# li = the 2D array to scan (it is not modified)
			# wcol = index of the column holding the space-separated words
			# addcols = indices of the columns to total; a negative index adds 1 per matching row instead of reading a cell
			# query = space-separated keywords that must all appear in the cell; '' matches every row and '*' matches every row with a non-empty cell
			# Returns [query, total for addcols[0], total for addcols[1], ...]
			# Cells are turned into numbers after removing commas and spaces; '' counts as 0 and a trailing '%' multiplies the value by 0.01
			# Only matching rows are parsed, so a non-numeric cell in a row the query does not select (eg. a header) is ignored
			# eg:
			# dog, 25
			# cat, 15
			# dog, 75
			# dog, 10
			# horse, 55
			# cat, 7
			# search(li,0,[1],'dog') gives ['dog',110.0]
			def search(li,wcol,addcols,query):
				queryarray = query.split(' ')
				if queryarray == ['']: queryarray = []
				wildcard = queryarray == ['*']
				result = [0] * len(addcols)
				for row in li:
					words = [x.strip() for x in row[wcol].split(' ')]
					if wildcard: inlist = len(row[wcol]) > 0
					else: inlist = all(w in words for w in queryarray)
					if not inlist: continue
					nums = []
					for ac in addcols:
						if ac >= 0:
							num = str(row[ac]).replace(',','').replace(' ','')
							if num == '': num = 0
							# Convert to a number first and scale afterwards; scaling the string itself raises TypeError
							elif num[-1] == '%': num = float(num[:-1]) * 0.01
							else: num = float(num)
						else: num = 1
						nums.append(num)
					result = [pair[0] + pair[1] for pair in zip(result,nums)]
				return [query] + result
			# Builds the CSV text for an array and returns it as one string (nothing is printed or written)
			# Every cell goes through str() and the cells of a row are joined with sp; rows are joined with newlines, with no newline after the last row
			# Cells are not quoted or escaped
			def tochars(array,sp=','):
				rendered = [sp.join(str(cell) for cell in row) for row in array]
				return '\n'.join(rendered)
			# Save an array to CSV
			# Arguments:
			# f = path of the file to write (it is opened in 'w' mode, so existing contents are replaced)
			# array = the 2D array to write
			# sp = separator placed between the cells of a row
			# The text is produced by tochars(array,sp) before the file is opened, so a failure while building it leaves the file untouched
			def save(f,array,sp=','):
				text = tochars(array,sp)
				# The with block closes the file even when the write raises
				with open(f,'w') as writeto:
					writeto.write(text)
			# Compares keywords by two different parameters from two different lists. For example, li1 can be a list of how much money is spent (on addcol1) on a particular combination of keywords (on keycol1) and li2 can be a list of upgraded accounts with the search query they came from on keycol2, and addcol2 can be left blank to default to -1 (each row is worth one point). With sigtable, a ratio column and a statistical significance column follow the two totals.
			# Remember that you may have to filter the list yourself first
			# Arguments:
			# grams = 1 for single keywords, 2 for pairs, 3 for triplets and 4 for quadruplets
			# li1, li2 = your two lists (neither is modified)
			# keycol1, keycol2 = where the keywords are located in those two lists
			# addcol1, addcol2 = the columns of what you want to add up, eg. cost (set to -1 or leave blank to make it add 1 for each row)
			# sortkey = function to sort results by (highest first)
			# usestem = stem keywords (otherwise they are only decluttered)
			# sigtable = add ratio and significance to table
			# invertratio = set ratio column to col1/col2 instead of col2/col1
			# preformatted = li1 and li2 are already properly formatted
			# justpreformat = convert li1 and li2 into twocolumns for comparison but don't go all the way
			# wordlimit = limit search to some more common keywords for speedup purposes
			# Returns the result table as a list of rows with a header row first, or the tuple (s1,s2) of formatted lists when justpreformat is set
			# Progress messages and a sample of both lists are printed to stdout
			# Raises ValueError when grams is not 1, 2, 3 or 4 (checked only when the comparison is actually run)
			# Example: list of customers, some upgraded, with originating keywords, and a list of how much you're paying for each search phrase
			#
			# customers.csv:
			# Name, Keyword, Status
			# Bob Jones, spreadsheet csv software, upgraded
			# Matt Bones, csv python utils, free
			# Army Drones, free spreadsheet, free
			# Glenn Mitt, csv software, upgraded
			# Pat Submitt, python utils software, upgraded
			# Shawn Wit, python spreadsheet program, upgraded
			#
			# costs.csv:
			# csv software, useless, and, irrelevant, data, 5.00, blah, blah
			# python spreadsheet, useless, and, irrelevant, data, 2.50, blah, blah
			# spreadsheet utils, useless, and, irrelevant, data, 10.00, blah, blah
			# csv utils, useless, and, irrelevant, data, 1.50, blah, blah
			#
			# Steps:
			# 1. import spread (if not imported already)
			# 2. upgrades = spread.filter(spread.load('customers.csv'),2,'upgraded')
			# 3. costs = spread.load('costs.csv')
			# 4. res = spread.compare(1,costs,upgrades,0,1,5,invertratio=True)
			# 5. spread.save('saved.csv',res)
			#
			# Res should look like:
			#
			# Keyword, Column 1, Column 2, Ratio, Significance
			# spreadsheet, 12.50, 2, 6.25, -0.389
			# utils, 11.50, 1, 11.50, -0.913
			# csv, 7.50, 2, 3.75, 0.335
			# python, 2.50, 2, 1.25, 2.031
			#
			# Or, if desired, you can:
			# i1,i2 = spread.compare(1,costs,upgrades,0,1,5,justpreformat=True)
			# res1 = spread.compare(1,i1,i2,0,1,5,invertratio=True,preformatted=True)
			# res2 = spread.compare(2,i1,i2,0,1,5,invertratio=True,preformatted=True)
			# res3 = spread.compare(3,i1,i2,0,1,5,invertratio=True,preformatted=True)
			# res4 = spread.compare(4,i1,i2,0,1,5,invertratio=True,preformatted=True)
			#
			# Note that significance is calculated based on col2/col1 regardless of invertratio, since getting 0 upgrades when you should have gotten 2 is not that unlikely, but calculating significance based on col1/col2 would give you infinity as infinity is infinitely far away from 0.5.
			def compare(grams,li1,li2,keycol1,keycol2,addcol1=-1,addcol2=-1,sortkey=lambda x:x[1],usestem=True,sigtable=True,invertratio=False,preformatted=False,justpreformat=False,wordlimit=0):
				if preformatted == False:
					s1 = declutter(cols(li1,[keycol1,addcol1]),[1])
					print("Done decluttering/stemming: 1/4")
					s2 = declutter(cols(li2,[keycol2,addcol2]),[1])
					print("Done decluttering/stemming: 2/4")
					s1 = stem(s1,[0]) if usestem else declutter(s1,[0])
					print("Done decluttering/stemming: 3/4")
					s2 = stem(s2,[0]) if usestem else declutter(s2,[0])
					print("Done decluttering/stemming: 4/4")
				else: s1,s2 = li1,li2
				print("Printing sample of list 1")
				print(s1[:10])
				print("Printing sample of list 2")
				print(s2[:10])
				if justpreformat: return s1,s2
				if grams not in (1,2,3,4): raise ValueError("grams must be 1, 2, 3 or 4")
				gramfuncs = [0,onegrams,twograms,threegrams,fourgrams]
				# Skip the leading rows whose value cell is still a string. Slicing (rather than popping) leaves the
				# caller's lists alone when preformatted=True aliases them, and copes with a list that is all text
				def skipleadingtext(rows):
					start = 0
					while start < len(rows) and isinstance(rows[start][1], str): start += 1
					return rows[start:]
				s1 = skipleadingtext(s1)
				s2 = skipleadingtext(s2)
				print("Cleaned invalid rows")
				wl = sorted_wordlist(s1,0,[1])
				if '' in wl: wl.remove('')
				print("Base wordlist length: " + str(len(wl)) + " ; Top ten: " + str(wl[:10]))
				if wordlimit > 0 and wordlimit < len(wl):
					print("Shortening to " + str(wordlimit))
					wl = wl[:wordlimit]
				# Both lists are searched with the word list built from list 1
				res1 = gramfuncs[grams](s1,0,[1],wl)
				print("Done search: 1/2")
				res2 = gramfuncs[grams](s2,0,[1],wl)
				print("Done search: 2/2")
				comb = sorted(splice(res1,res2,0,0),key=sortkey,reverse=True)
				if sigtable:
					tot1 = search(s1,0,[1],'')
					tot2 = search(s2,0,[1],'')
					# ev = overall col2 per unit of col1; a zero total for list 1 gives 0 instead of a ZeroDivisionError
					ev = tot2[1]*1.0/tot1[1] if tot1[1] else 0.0
					print("Totals: " + str(tot1[1]) + ", " + str(tot2[1]))
					numcol, dencol = (1, 2) if invertratio else (2, 1)
					for row in comb:
						expected = ev * row[1]
						# The small constant keeps both divisions defined when the denominator is 0
						row.append(row[numcol]*1.0/(row[dencol] + 0.000001))
						# Significance = (actual col2 - expected col2) / sqrt(expected col2)
						row.append((row[2] - expected)*1.0/(expected + 0.000001) ** 0.5)
					comb = [['Keyword','Column 1','Column 2','Ratio','Significance']] + comb
				else: comb = [['Keyword','Column 1','Column 2']] + comb
				print("Done")
				return comb