Module fish
SYNOPSIS:
less: look around just a little, guess where to search.
(c) 2023 Tim Menzies timm@ieee.org, BSD-2
USAGE:
./fish.py -f csvfile [OPTIONS] [ -g ACTION ]
cat csvfile | ./fish.py [OPTIONS] [ -g ACTION ]
OPTIONS:
-b --bins   max number of bins = 16
-c --cohen  size significant separation = .35
-f --file   data csv file = ../data/auto93.csv
-g --go     start-up action = nothing
-h --help   show help = False
-l --lazy   lazy mode = False
-m --min    min size = .5
-p --p      distance coefficient = 2
-r --rest   ratio best:rest = 4
-s --seed   random number seed = 1234567891
-t --top    explore top ranges = 8
-w --want   goal = mitigate
Expand source code
#!/usr/bin/env python3 -B
#vim: set et sts=2 sw=2 ts=2 :
"""
SYNOPSIS:
less: look around just a little, guess where to search.
(c) 2023 Tim Menzies <timm@ieee.org>, BSD-2
USAGE:
./fish.py -f csvfile [OPTIONS] [ -g ACTION ]
cat csvfile | ./fish.py [OPTIONS] [ -g ACTION ]
OPTIONS:
-b --bins max number of bins = 16
-c --cohen size significant separation = .35
-f --file data csv file = ../data/auto93.csv
-g --go start-up action = nothing
-h --help show help = False
-l --lazy lazy mode = False
-m --min min size = .5
-p --p distance coeffecient = 2
-r --rest ratio best:rest = 4
-s --seed random number seed = 1234567891
-t --top explore top ranges = 8
-w --want goal = mitigate"""
from fileinput import FileInput as file_or_stdin
import traceback,random,math,sys,re
from termcolor import colored
from functools import cmp_to_key
from ast import literal_eval
#---------------------------------------------
class pretty(object):
  "Base class whose instances print as `ClassName{...}`, leaving out slots whose names start with `_`."
  def __repr__(i):
    public = {}
    for name, value in i.__dict__.items():
      if name[0] != "_":   # a leading underscore marks a private slot
        public[name] = value
    return i.__class__.__name__ + str(public)
#---------------------------------------------
class ROW(pretty):
  "Place to store cells and meta-knowledge about those cells."
  def __init__(i, cells=None):
    """Store `cells` (the values of one record) and `klass`, a label that
    starts as None and is filled in later (e.g. by TREE)."""
    # Build a fresh list per instance; a `[]` default would be one list
    # shared (and mutated) by every ROW created without arguments.
    i.cells = [] if cells is None else cells
    i.klass = None
  def at(i, col):
    "Return the cell of this row found at position `col.at`."
    return i.cells[col.at]
#---------------------------------------------
class COL(pretty):
  "Generic column: holds its name `txt`, its position `at` and `n`, the count of non-missing values seen."
  def __init__(i, txt="", at=0):
    i.n = 0
    i.at = at
    i.txt = txt
  def add(i, x):
    "Return `x`; unless `x` is the missing marker `?`, first bump `n` and hand `x` to `add1`."
    if x == "?":
      return x
    i.n += 1
    i.add1(x)   # `add1` is supplied by the subclass
    return x
  def dist(i, x, y):
    "Gap between two values: 1 when both are missing, else whatever `dist1` says."
    if x == "?" and y == "?":
      return 1
    return i.dist1(x, y)
  def sub(i, x):
    "Return `x`; unless `x` is the missing marker `?`, first drop `n` by one and hand `x` to `sub1`."
    if x == "?":
      return x
    i.n -= 1
    i.sub1(x)   # `sub1` is supplied by the subclass
    return x
#---------------------------------------------
class NUM(COL):
  "Summarize stream of numbers. Knows the mean and standard deviation."
  def __init__(i, txt="", at=0):
    "`w` is -1 if the name ends in `-` (else 1); `mu`,`m2` feed the running mean and stdev; `lo`,`hi` track the range."
    COL.__init__(i, txt=txt, at=at)
    i.w = -1 if len(i.txt) > 0 and i.txt[-1] == "-" else 1
    i.mu = i.m2 = 0
    i.lo, i.hi = big, -big
  def add1(i, x):
    "Update `lo,hi` and the variables needed to calculate stdev."
    i.lo = min(x, i.lo)
    i.hi = max(x, i.hi)
    delta = x - i.mu
    i.mu += delta/i.n           # `i.n` was already incremented by `COL.add`
    i.m2 += delta*(x - i.mu)
  def dist1(i, x, y):
    "Distance between two values (normalized 0..1). If one is missing, assume the value that maximizes the gap."
    if   x == "?": y = i.norm(y); x = 0 if y > .5 else 1
    elif y == "?": x = i.norm(x); y = 0 if x > .5 else 1
    else: x, y = i.norm(x), i.norm(y)
    return abs(x - y)
  def div(i, decimals=None):
    "Return diversity around the central tendency (sample standard deviation; 0 if fewer than 2 values)."
    return rnd((i.m2/(i.n - 1))**.5 if i.m2 > 0 and i.n > 1 else 0, decimals)
  def mid(i, decimals=None):
    "Return central tendency (the mean)."
    return rnd(i.mu, decimals)
  def norm(i, x):
    "Map `x` to 0..1 for `lo` to `hi`. Missing values (`?`) are returned unchanged."
    # the `1/big` term keeps the denominator non-zero when `lo == hi`
    return x if x == "?" else (x - i.lo)/(i.hi - i.lo + 1/big)
  def sub1(i, x):
    """Remove `x` from the running mean/stdev (reverse of `add1`).
    `lo` and `hi` are not shrunk since that would need the raw values."""
    # `COL.sub` has already decremented `i.n`, so `i.n` is the number of values
    # that remain. Only wipe the summary when nothing remains; with one value
    # left the update below is exact (mu becomes that value, m2 becomes 0).
    if i.n < 1:
      i.n, i.mu, i.m2 = 0, 0, 0
    else:
      d = x - i.mu
      i.mu -= d / i.n
      i.m2 -= d * (x - i.mu)
#---------------------------------------------
class SYM(COL):
  "Summarize a stream of symbols. Knows mode and entropy."
  def __init__(i, txt="", at=0):
    "`counts` maps symbol to frequency; `mode` is the most frequent symbol and `most` its frequency."
    COL.__init__(i, txt=txt, at=at)
    i.counts, i.mode, i.most = {}, None, 0
  def add1(i, x):
    "Increment counts and mode."
    now = i.counts[x] = 1 + i.counts.get(x, 0)
    if now > i.most: i.most, i.mode = now, x
  def dist1(i, x, y):
    "Distance between two values: 0 if identical, else 1."
    return 0 if x == y else 1
  def div(i, decimals=None):
    "Return diversity around the central tendency (entropy, in bits)."
    a = i.counts
    return rnd(-sum(a[k]/i.n * math.log(a[k]/i.n, 2) for k in a if a[k] > 0), decimals)
  def mid(i, decimals=None):
    "Return central tendency (the mode). `decimals` is accepted for parity with NUM.mid and ignored."
    return i.mode
  def sub1(i, x):
    "Decrement the count of `x`, then refresh `mode`/`most` if `x` was the mode."
    i.counts[x] -= 1
    assert 0 <= i.counts[x]
    if x == i.mode:
      # The old mode just lost a vote, so `most` is stale and another symbol
      # may now lead: rescan the counts.
      i.mode, i.most = None, 0
      for k, n in i.counts.items():
        if n > i.most: i.mode, i.most = k, n
#---------------------------------------------
class COLS(pretty):
  "Convert a list of names into NUMs and SYMs (keeping different kinds of cols in different lists)."
  def __init__(i, names):
    """Names starting in upper case become NUMs, all others SYMs.
    The last character of a name sets its role: `X` means skip the column,
    `!` marks the klass column, and `-`, `+`, `!` mark goals (stored in `y`);
    anything else is an independent column (stored in `x`)."""
    i.x, i.y, i.names = [], [], names
    # slicing (`s[:1]`) rather than indexing so an empty name cannot raise IndexError
    i.all = [(NUM if s[:1].isupper() else SYM)(at=n, txt=s) for n, s in enumerate(names)]
    i.klass = None   # always defined, even when no column name ends in `!`
    for col in i.all:
      z = col.txt[-1:]
      if z != "X":
        if z == "!": i.klass = col
        # `z and ...` because the empty string is "in" every string
        (i.y if z and z in "-+!" else i.x).append(col)
  def add(i, row):
    "Add a row's data to all the non-skipped columns. Returns `row`."
    for cols in [i.x, i.y]:
      for col in cols: col.add(row.at(col))
    return row
#---------------------------------------------
class DATA(pretty):
  "Keep `rows` of data, summarized into columns."
  def __init__(i, src=None):
    "Load rows from `src`, any iterable of ROWs whose first item holds the column names."
    i.cols, i.rows = None, []
    if src is not None:
      for row in src: i.add(row)
  def add(i, row):
    "For first row, build the `cols`. Otherwise, update summaries and the rows."
    if i.cols: i.rows.append(i.cols.add(row))
    else: i.cols = COLS(row.cells)
  def clone(i, rows=None):
    "Replicate structure of self: a new DATA with the same column names, filled with `rows` (if any)."
    return DATA([ROW(i.cols.names)] + list(rows or []))
  def dist(i, row1, row2):
    "Distance between two rows: Minkowski distance (exponent `the.p`) over the `x` columns, normalized by their number."
    cols = i.cols.x
    if not cols: return 0
    # each per-column gap is raised to `p` before summing
    d = sum(c.dist(row1.at(c), row2.at(c))**the.p for c in cols)
    return (d / len(cols))**(1/the.p)
  def sort2(i, row1, row2):
    "Return True if `row1` better than `row2` (comparing the goal columns in `y`)."
    s1, s2, n = 0, 0, len(i.cols.y)
    if n == 0: return False   # no goals, so neither row can be better
    for col in i.cols.y:
      a, b = col.norm(row1.at(col)), col.norm(row2.at(col))
      s1 -= math.exp(col.w * (a - b) / n)
      s2 -= math.exp(col.w * (b - a) / n)
    return s1 / n < s2 / n
  def sorted(i, rows=None):
    "Sort all `rows` (default: `i.rows`) from worst to best, so the best rows come last."
    def cmp(a, b):
      # `cmp_to_key` needs negative/zero/positive. Feeding it the boolean from
      # `sort2` never yields a negative, which leaves the list unsorted.
      if i.sort2(a, b): return 1
      if i.sort2(b, a): return -1
      return 0
    return sorted(rows or i.rows, key=cmp_to_key(cmp))
  def stats(i, cols=None, fun="mid", decimals=2):
    """Report statistics on a set of columns (defaults to `i.cols.y`).
    Returns `BAG(mid=BAG(...))` holding `N+` (the count seen by the first
    column) plus one entry per column: its `mid()` if `fun=="mid"`, else its
    `div()`. The outer key is `mid` in both cases."""
    cols = cols or i.cols.y
    def what(col): return col.mid(decimals) if fun == "mid" else col.div(decimals)
    n = cols[0].n if cols else len(i.rows)   # no columns: fall back to the row count
    return BAG(mid=BAG(**{"N+": n, **{col.txt: what(col) for col in cols}}))
#---------------------------------------------
# operators, used in trees
def _unknown(x, y):
  "True when either operand is the missing-value marker `?`."
  return x == "?" or y == "?"

# Every operator succeeds when an operand is missing; otherwise it applies the usual comparison.
ops = {
  ">"  : lambda x, y: _unknown(x, y) or x > y,
  "<=" : lambda x, y: _unknown(x, y) or x <= y,
  "==" : lambda x, y: _unknown(x, y) or x == y,
  "!=" : lambda x, y: _unknown(x, y) or x != y,
}
"""Operators used in decision tree."""
# Each operator paired with its opposite, listed in the same order as `ops`.
neg = dict([(">", "<="), ("<=", ">"), ("==", "!="), ("!=", "==")])
"""Negation of operators."""
#---------------------------------------------
# tree generation
class TREE(object):
  "Recursively split on the cut (that most distinguishes different klasses)."
  def __init__(i, data):
    """Label the last `n = len**the.min` rows of `data.sorted()` as best
    (klass=True) and up to `n*the.rest` rows from the front as rest
    (klass=False), then grow a tree over those rows."""
    lst = data.sorted()
    n = int(len(lst)**the.min)
    bests = lst[-n:] if n > 0 else []
    # take `rests` only from rows that are not in `bests`, so that on small
    # data no row is labelled twice (and ends up False)
    rests = lst[:max(0, len(lst) - n)][:int(n*the.rest)]
    for row in bests: row.klass = True
    for row in rests: row.klass = False
    i.data = data
    top = i.data.clone(bests+rests)
    i.stop = 2*len(top.rows)**the.min   # do not split node smaller than this
    i.root = i.grow(top.rows, BAG(here=top, at=None))
  def grow(i, rows, t):
    "Attach `left`/`right` sub-trees to node `t` if `rows` are numerous enough and a useful cut exists. Returns `t`."
    t.left, t.right = None, None
    if len(rows) >= i.stop:
      _, at, op, cut, s = i.cut(i.data, i.data.cols.x, rows)
      if cut is not None:   # `if cut:` would throw away legitimate cuts such as 0
        left, right = [], []
        for row in rows:
          (left if ops[op](row.cells[at], cut) else right).append(row)
        # only split if both sides get at least one row
        if len(left) != len(rows) and len(right) != len(rows):
          t.left = i.grow(left,  BAG(here=i.data.clone(left),  at=at, cut=cut, txt=s, op=op))
          t.right= i.grow(right, BAG(here=i.data.clone(right), at=at, cut=cut, txt=s, op=neg[op]))
    return t
  def cut(i, data, cols, rows):
    """Return best `div,at,op,cut,txt` that most divides the klasses in `rows`.
    `cut` is None when no column offers a split. `data` is not used."""
    def sym(col):
      "For syms, just return the one with least diversity of klasses."
      d = {}
      for row in rows:
        x = row.at(col)
        if x != "?":
          if x not in d:
            d[x] = SYM()
            d[x].cut = x
          d[x].add(row.klass)
      if not d:   # every value missing: nothing to split on
        return float("inf"), col.at, "==", None, col.txt
      best = min(d.values(), key=lambda s: s.div())
      return best.div(), col.at, "==", best.cut if best.n < col.n else None, col.txt
    #-----------
    def num(col):
      "For nums, just return the cut that most reduces expected diversity."
      X = lambda row: row.at(col)
      eps = col.div()*the.cohen      # cuts must be at least this far from both ends
      small = len(rows)**the.min     # both sides of a cut must hold more rows than this
      known = sorted([row for row in rows if X(row) != "?"], key=X)
      if not known:   # every value missing: nothing to split on
        return float("inf"), col.at, "<=", None, col.txt
      d1, d2, a, z = SYM(), SYM(), X(known[0]), X(known[-1])
      for row in known: d2.add(row.klass)
      cut, lo = None, d2.div()
      # n1 counts from 1 so that it is the true size of the left side
      # (the 0-based index under-counted it by one)
      for n1, row in enumerate(known, 1):
        x = X(row)
        d2.sub(d1.add(row.klass))    # move this row's klass from right to left
        n2 = len(known) - n1
        # `n2 > small` guarantees `known[n1]` exists
        if n1 > small and n2 > small and x != X(known[n1]) and x-a > eps and z-x > eps:
          xpect = (d1.div()*n1 + d2.div()*n2)/(n1+n2)
          if xpect < lo:
            cut, lo = x, xpect
      return lo, col.at, "<=", cut, col.txt
    #------------------------------------
    found = [(num(col) if isa(col, NUM) else sym(col)) for col in cols]
    return min(found) if found else (float("inf"), None, None, None, None)
  def show(i):
    "Print the tree, one node per line, indented by depth; the root and the leaves also show the stats of their rows."
    for lvl, t, isLeaf in i.nodes():
      if lvl == 0:
        print("")
        print(t.here.stats().mid, end="")
      else:
        print("\n"+("|.. " * lvl)+ f"{t.txt} {t.op} {t.cut} ", end="")
        if isLeaf: print(t.here.stats().mid, end="")
    print("")
  def nodes(i, t=None, lvl=0):
    "Yield `(depth, node, isLeaf)` for `t` (default: the root) and then for everything beneath it, left side first."
    t = t or i.root
    yield lvl, t, t.left is None
    for t1 in [t.left, t.right]:
      if t1:
        yield from i.nodes(t1, lvl+1)
#---------------------------------------------
big = 1E30
"""large numbers"""
isa = isinstance   # alias used for type checks
R = random.random  # alias for the random number generator
class BAG(dict):
  "A dict whose entries can also be read as attributes (`x.slot` as well as `x['slot']`); unknown names read as None."
  def __getattr__(i, name):
    # only reached when normal attribute lookup fails
    return i.get(name)
class SETTINGS(BAG):
  "Parse settings from string."
  def __init__(i, s):
    "For every line of `s` shaped like `-x --key ... = value`, store `key` with `value` coerced to a Python type."
    # `s or ""` because `__doc__` is None when docstrings are stripped (python -OO)
    for m in re.finditer(r"\n\s*-\w+\s*--(\w+)[^=]*=\s*(\S+)", s or ""):
      i[m[1]] = coerce(m[2])
  def cli(i):
    """For k,v in i, update `v` if there is a command-line flag `-k[0]` or `--k`. Note
    that boolean values need no argument (since we will just negate `v`).
    Raises IndexError if a non-boolean flag is the last command-line argument."""
    for k, v in list(i.items()):
      v = str(v)
      for j, x in enumerate(sys.argv):
        if ("-"+k[0]) == x or ("--"+k) == x:
          if   v == "False": v = "True"
          elif v == "True":  v = "False"
          elif j+1 < len(sys.argv): v = sys.argv[j+1]
          else: raise IndexError(f"flag {x} expects a value")
      i[k] = coerce(v)
  def showHelp(i, s):
    "Pretty print help string `s`, then list the start-up actions (the capitalized entries of `Egs.all`)."
    def bold(m): return colored(m[0], attrs=["bold"])
    def bright(m): return colored(m[0], "light_yellow")
    # raw strings: `\S` in a normal string literal is an invalid escape sequence
    def paint(s): return re.sub(r"\n[A-Z][A-Z]+:", bold, re.sub(r" [-][-]?[\S]+", bright, s))
    print(paint(s))
    print(paint("\nACTIONS:"))
    for k, f in Egs.all.items():
      if k[0].isupper(): print(paint(f" -g {k:8} {f.__doc__}"))
def coerce(x):
  "Return `x` converted to a Python literal (int, float, bool, ...) if it reads as one; otherwise return `x` unchanged."
  try: return literal_eval(x)
  # `Exception`, not a bare `except:`, so Ctrl-C and sys.exit are not swallowed
  except Exception: return x
def csv(file, filter=ROW):
  """Iterate over the lines of `file` (or of standard input, if `file` is `-`).
  Whitespace, quotes and `#` comments are deleted, blank lines are skipped,
  and what is left is split on commas, each cell is coerced to a Python
  type, and the list of cells is passed through `filter` (default: ROW)."""
  # Hand `-` straight to FileInput, which reads standard input for that name.
  # Turning it into None makes FileInput open the names in sys.argv[1:].
  with file_or_stdin(file) as src:
    for line in src:
      line = re.sub(r'([\n\t\r"\' ]|#.*)', '', line)
      if line:
        yield filter([coerce(s.strip()) for s in line.split(",")])
def rnd(x, decimals=None):
  "Return `x` rounded to `decimals` places, or `x` untouched when `decimals` is None."
  return x if decimals is None else round(x, decimals)
def yell(s, c):
  "Show `s` in bold using the `light_` variant of color `c`; no newline is added."
  shade = "light_" + c
  text = colored(s, shade, attrs=["bold"])
  print(text, end="")
#----------------------------------------------------
class Egs:
  "Place to store the examples."
  # The namespace of this class body: each function defined below can be
  # looked up here by name. Names starting in upper case are the start-up actions.
  all = locals()
  csv = [ "pom.csv",
          "nasa93dem.csv",
          "healthCloseIsses12mths0011-easy.csv",
          "healthCloseIsses12mths0001-hard.csv",
          "coc10000.csv",
          "coc1000.csv",
          "china.csv",
          "auto93.csv",
          "auto2.csv",
          "SSN.csv",
          "SSM.csv"]
  def ok():
    "Run everything (every capitalized action). Exit with the number of failures as the status code."
    fails, saved = 0, {k:v for k,v in the.items()}
    for what, fun in Egs.all.items():
      if what[0].isupper():
        yell(what + " ", "yellow")
        fail = Egs.failure(saved, fun)
        if fail: yell(" FAIL\n", "red")
        else:    yell(" PASS\n", "green")
        fails += fail
    yell(f"TOTAL FAILURE(s) = {fails}\n", "red" if fails > 0 else "cyan")
    sys.exit(fails)
  def failure(saved, fun):
    """Called by `Egs.ok`. `Fun` fails if it returns `False` or if it crashes.
    If it crashes, print the stack dump but then continue on
    Before running it, reset the system to initial conditions."""
    for k, v in saved.items(): the[k] = v
    random.seed(the.seed)
    fail = False
    try: fail = fun() == False # here, fail might be reset to True
    # `Exception` rather than a bare `except:` so that Ctrl-C still stops the run
    except Exception: fail = True; traceback.print_exc()
    return fail
  def The():
    "print the settings"
    print(the)
  def Rnd():
    "rnd to 2 decimals"
    return 3.14 == rnd(math.pi, 2)
  def Num(txt=""):
    "test NUMs"
    n = NUM(txt)
    for x in range(10**4): n.add(R()**.5)
    return .66 < n.mid() < .67 and .23 < n.div() < .24 and n
  def Sym(txt=""):
    "test SYMs"
    s = SYM(txt)
    for x in "aaaabbc": s.add(x)
    return "a" == s.mid() and 1.37 <= s.div() < 1.38 and s
  def Rows():
    "Check we can load rows from file."
    print(the.file)
    for row in list(csv(the.file))[:5]: print(row)
  def Col():
    "Check we can convert names to NUMs and SYMs."
    for x in COLS(["name","Age","Weight-"]).all: print(x)
  def Data():
    "Can we load data and get its stats?"
    DATA(csv(the.file)).stats()
  def Clone():
    "Can we replicate a DATA's structure?"
    d1 = DATA(csv(the.file))
    d2 = d1.clone(d1.rows)
    print(d1.cols.y[1])
    print(d2.cols.y[1])
  def Sorts():
    "Can we sort rows into `best` and `rest`?"
    d = DATA(csv(the.file))
    lst = d.sorted()
    m = int(len(lst)**.5)
    print("all ", d.stats())
    best = d.clone(lst[-m:]); print("best", best.stats())
    rest = d.clone(lst[:m*the.rest]); print("rest", rest.stats())
  def Nodes():
    "Does the TREE iterator work?"
    t1 = TREE(DATA(csv(the.file)))
    for lvl, t2, isLeaf in t1.nodes():
      print("|.. " * lvl, isLeaf)
  def Tree():
    "Does the TREE pretty print work?"
    TREE( DATA(csv(the.file)) ).show()
  def Trees():
    "Test all tree generation of all csv files."
    for f in Egs.csv:
      print("\n\n-----------", f)
      TREE( DATA(csv(f"../data/{f}")) ).show()
#---------------------------------------------
the = SETTINGS(__doc__)
"""Config options, parsed from `__doc__`"""
random.seed(the.seed) # seed the generator from the settings
if __name__ == "__main__":
  the.cli()   # let command-line flags override the defaults
  if the.help:
    the.showHelp(__doc__)
  else:
    # run the requested start-up action, if it names something callable in Egs
    action = Egs.all.get(the.go)
    if callable(action):
      action()
Global variables
var ops
-
Operators used in decision tree.
var neg
-
Negation of operators.
var big
-
large numbers
var the
-
Config options, parsed from `__doc__`.
Functions
def R()
-
random() -> x in the interval [0, 1).
def coerce(x)
-
Expand source code
def coerce(x): try: return literal_eval(x) except: return x
def csv(file, filter=fish.ROW)
-
Returns an iterator that returns lists from standard input (-) or a file.
Expand source code
def csv(file, filter=ROW): "Returns an iterator that returns lists from standard input (-) or a file." if file=="-": file=None with file_or_stdin(file) as src: for line in src: line = re.sub(r'([\n\t\r"\' ]|#.*)', '', line) if line: yield filter([coerce(s.strip()) for s in line.split(",")])
def rnd(x, decimals=None)
-
Expand source code
def rnd(x,decimals=None): return x if decimals==None else round(x,decimals)
def yell(s, c)
-
Print string `s`, colored by `c`.
Expand source code
def yell(s,c): "Print string `s`, colored by `c`." print(colored(s,"light_"+c,attrs=["bold"]),end="")
Classes
class pretty
-
Objects support pretty printing, hiding private slots (those starting with `_`).
Expand source code
class pretty(object): "Objects support pretty print, hiding privates slots (those starting with `_`)" def __repr__(i): return i.__class__.__name__+str({k:v for k,v in i.__dict__.items() if k[0] != "_"})
Subclasses
class ROW (cells=[])
-
Place to store cells and meta-knowledge about those cells.
Expand source code
class ROW(pretty): "Place to store cells and meta-knowledge about those cells." def __init__(i, cells=[]): i.cells,i.klass = cells,None def at(i,col): return i.cells[col.at]
Ancestors
Methods
def at(i, col)
-
Expand source code
def at(i,col): return i.cells[col.at]
class COL (txt='', at=0)
-
COLumns know the name, position and the count of rows seen.
Expand source code
class COL(pretty): "COLumns know the name, position and the count of rows seen." def __init__(i, txt="",at=0): i.n,i.at,i.txt = 0,at,txt def add(i,x): "Ignoring empty cells, increment `n` then do the adding." if x != "?": i.n += 1 i.add1(x) return x def dist(i,x,y): "distance between two values" return 1 if x=="?" and y=="?" else i.dist1(x,y) def sub(i,x): "Ignoring empty cells, decrement `n` then do the sub-trcting." if x != "?": i.n -= 1 i.sub1(x) return x
Ancestors
Subclasses
Methods
def add(i, x)
-
Ignoring empty cells, increment `n` then do the adding.
Expand source code
def add(i,x): "Ignoring empty cells, increment `n` then do the adding." if x != "?": i.n += 1 i.add1(x) return x
def dist(i, x, y)
-
distance between two values
Expand source code
def dist(i,x,y): "distance between two values" return 1 if x=="?" and y=="?" else i.dist1(x,y)
def sub(i, x)
-
Ignoring empty cells, decrement `n` then do the subtracting.
Expand source code
def sub(i,x): "Ignoring empty cells, decrement `n` then do the sub-trcting." if x != "?": i.n -= 1 i.sub1(x) return x
class NUM (txt='', at=0)
-
Summarize stream of numbers. Knows the mean and standard deviation.
Expand source code
class NUM(COL): "Summarize stream of numbers. Knows the mean and standard deviation." def __init__(i, txt="",at=0): COL.__init__(i,txt=txt,at=at) i.w = -1 if len(i.txt) > 0 and i.txt[-1] == "-" else 1 i.mu = i.m2 = 0 i.lo, i.hi = big, -big def add1(i,x): "Update `lo,hi` and the variables needed to calculate stdev." i.lo = min(x, i.lo) i.hi = max(x, i.hi) delta = x - i.mu i.mu += delta/i.n i.m2 += delta*(x - i.mu) def dist1(i,x,y): "distance between two values" if x=="?": y = i.norm(y); x= 0 if y > .5 else 1 elif y=="?": x = i.norm(x); y= 0 if x > .5 else 1 else: x,y = i.norm(x), i.norm(y) return abs(x - y) def div(i, decimals=None): "Return diversity around the central tendency." return rnd((i.m2/(i.n - 1))**.5 if i.m2>0 and i.n > 1 else 0, decimals) def mid(i, decimals=None): "Return central tendency." return rnd(i.mu, decimals) def norm(i,x): "May `x` to 0..1 for `lo` to `hi`." return x if x=="?" else (x-i.lo)/(i.hi - i.lo + 1/big) def sub1(i,x): "decrement count" if i.n <= 1: i.n, i.mu, i.m2 = 0, 0, 0 else: d = x - i.mu i.mu -= d / i.n i.m2 -= d * (x - i.mu)
Ancestors
Methods
def add1(i, x)
-
Update `lo,hi` and the variables needed to calculate stdev.
Expand source code
def add1(i,x): "Update `lo,hi` and the variables needed to calculate stdev." i.lo = min(x, i.lo) i.hi = max(x, i.hi) delta = x - i.mu i.mu += delta/i.n i.m2 += delta*(x - i.mu)
def dist1(i, x, y)
-
distance between two values
Expand source code
def dist1(i,x,y): "distance between two values" if x=="?": y = i.norm(y); x= 0 if y > .5 else 1 elif y=="?": x = i.norm(x); y= 0 if x > .5 else 1 else: x,y = i.norm(x), i.norm(y) return abs(x - y)
def div(i, decimals=None)
-
Return diversity around the central tendency.
Expand source code
def div(i, decimals=None): "Return diversity around the central tendency." return rnd((i.m2/(i.n - 1))**.5 if i.m2>0 and i.n > 1 else 0, decimals)
def mid(i, decimals=None)
-
Return central tendency.
Expand source code
def mid(i, decimals=None): "Return central tendency." return rnd(i.mu, decimals)
def norm(i, x)
-
Map `x` to 0..1 for `lo` to `hi`.
Expand source code
def norm(i,x): "May `x` to 0..1 for `lo` to `hi`." return x if x=="?" else (x-i.lo)/(i.hi - i.lo + 1/big)
def sub1(i, x)
-
decrement count
Expand source code
def sub1(i,x): "decrement count" if i.n <= 1: i.n, i.mu, i.m2 = 0, 0, 0 else: d = x - i.mu i.mu -= d / i.n i.m2 -= d * (x - i.mu)
Inherited members
class SYM (txt='', at=0)
-
Summarize a stream of symbols. Knows mode and entropy.
Expand source code
class SYM(COL): "Summary a stream of symbols. Knows mode and entropy." def __init__(i,txt="",at=0): COL.__init__(i,txt=txt,at=at) i.counts,i.mode, i.most = {},None,0 def add1(i,x): "Increment counts and mode." now = i.counts[x] = 1 + i.counts.get(x,0) if now > i.most: i.most, i.mode = now, x def dist1(i,x,y): "distance between two values" return 0 if x==y else 1 def div(i, decimals=None): "Return diversity around the central tendency." a = i.counts return rnd( - sum(a[k]/i.n * math.log(a[k]/i.n,2) for k in a if a[k] > 0), decimals) def mid(i,decimals=None): "Return central tendency." return i.mode def sub1(i,x): "Decrements counts." i.counts[x] -= 1 assert 0 <= i.counts[x]
Ancestors
Methods
def add1(i, x)
-
Increment counts and mode.
Expand source code
def add1(i,x): "Increment counts and mode." now = i.counts[x] = 1 + i.counts.get(x,0) if now > i.most: i.most, i.mode = now, x
def dist1(i, x, y)
-
distance between two values
Expand source code
def dist1(i,x,y): "distance between two values" return 0 if x==y else 1
def div(i, decimals=None)
-
Return diversity around the central tendency.
Expand source code
def div(i, decimals=None): "Return diversity around the central tendency." a = i.counts return rnd( - sum(a[k]/i.n * math.log(a[k]/i.n,2) for k in a if a[k] > 0), decimals)
def mid(i, decimals=None)
-
Return central tendency.
Expand source code
def mid(i,decimals=None): "Return central tendency." return i.mode
def sub1(i, x)
-
Decrements counts.
Expand source code
def sub1(i,x): "Decrements counts." i.counts[x] -= 1 assert 0 <= i.counts[x]
Inherited members
class COLS (names)
-
Convert a list of names into NUMs and SYMs (keeping different kinds of cols in different lists).
Expand source code
class COLS(pretty): "Convert a list of names into NUMs and SYMs (kept different binds of cols in different lists)." def __init__(i,names): i.x, i.y, i.names = [],[],names i.all = [(NUM if s[0].isupper() else SYM)(at=n,txt=s) for n,s in enumerate(names)] for col in i.all: z = col.txt[-1] if z != "X": if z=="!": i.klass= col (i.y if z in "-+!" else i.x).append(col) def add(i,row): "Add a row's data to all the non-skipped columns." for cols in [i.x, i.y]: for col in cols: col.add(row.at(col)) return row
Ancestors
Methods
def add(i, row)
-
Add a row's data to all the non-skipped columns.
Expand source code
def add(i,row): "Add a row's data to all the non-skipped columns." for cols in [i.x, i.y]: for col in cols: col.add(row.at(col)) return row
class DATA (src=[])
-
Keep `rows` of data, summarized into columns.
Expand source code
class DATA(pretty): "Keep `rows` of data, summarized into col`umns." def __init__(i,src=[]): i.cols, i.rows = None,[] [i.add(row) for row in src] def add(i,row): "For first row, build the `cols`. Otherwise, update summaries and the rows." if i.cols: i.rows += [i.cols.add(row)] else: i.cols = COLS(row.cells) def clone(i,rows=[]): "Replicate structure of self." return DATA([ROW(i.cols.names)] + rows) def dist(i,row1,row2): "distance between two values" return (sum(c.dist(row1.cells[c.at], row2.cells[c.at]) for c in i.cs.x)**the.p / len(i.cols.x))**(1/the.p) def sort2(i,row1,row2): "Return True if `row1` better than `row2`." s1, s2, n = 0, 0, len(i.cols.y) for col in i.cols.y: a, b = col.norm(row1.at(col)), col.norm(row2.at(col)) s1 -= math.exp(col.w * (a - b) / n) s2 -= math.exp(col.w * (b - a) / n) return s1 / n < s2 / n def sorted(i,rows=[]): "Sort all `rows`." return sorted(rows or i.rows, key=cmp_to_key(lambda a,b: i.sort2(a,b))) def stats(i,cols=None, fun="mid", decimals=2): "Report statistics on a set of `col`umns (defaults to `i.cols.y`." cols = cols or i.cols.y def what(col): return col.mid(decimals) if fun=="mid" else col.div(decimals) return BAG(mid=BAG(**{"N+":cols[0].n, **{col.txt:what(col) for col in cols}}))
Ancestors
Methods
def add(i, row)
-
For first row, build the `cols`. Otherwise, update summaries and the rows.
Expand source code
def add(i,row): "For first row, build the `cols`. Otherwise, update summaries and the rows." if i.cols: i.rows += [i.cols.add(row)] else: i.cols = COLS(row.cells)
def clone(i, rows=[])
-
Replicate structure of self.
Expand source code
def clone(i,rows=[]): "Replicate structure of self." return DATA([ROW(i.cols.names)] + rows)
def dist(i, row1, row2)
-
distance between two values
Expand source code
def dist(i,row1,row2): "distance between two values" return (sum(c.dist(row1.cells[c.at], row2.cells[c.at]) for c in i.cs.x)**the.p / len(i.cols.x))**(1/the.p)
def sort2(i, row1, row2)
-
Return True if `row1` better than `row2`.
Expand source code
def sort2(i,row1,row2): "Return True if `row1` better than `row2`." s1, s2, n = 0, 0, len(i.cols.y) for col in i.cols.y: a, b = col.norm(row1.at(col)), col.norm(row2.at(col)) s1 -= math.exp(col.w * (a - b) / n) s2 -= math.exp(col.w * (b - a) / n) return s1 / n < s2 / n
def sorted(i, rows=[])
-
Sort all `rows`.
Expand source code
def sorted(i,rows=[]): "Sort all `rows`." return sorted(rows or i.rows, key=cmp_to_key(lambda a,b: i.sort2(a,b)))
def stats(i, cols=None, fun='mid', decimals=2)
-
Report statistics on a set of columns (defaults to `i.cols.y`).
Expand source code
def stats(i,cols=None, fun="mid", decimals=2): "Report statistics on a set of `col`umns (defaults to `i.cols.y`." cols = cols or i.cols.y def what(col): return col.mid(decimals) if fun=="mid" else col.div(decimals) return BAG(mid=BAG(**{"N+":cols[0].n, **{col.txt:what(col) for col in cols}}))
class TREE (data)
-
Recursively split on the cut (that most distinguishes different klasses).
Expand source code
class TREE(object): "Recursively split on the cut (that most distinguishes different klasses)." def __init__(i,data): lst = data.sorted() n = int(len(lst)**the.min) bests = lst[-n:] rests = lst[:n*the.rest] for row in bests: row.klass = True for row in rests: row.klass = False i.data = data top = i.data.clone(bests+rests) i.stop = 2*len(top.rows)**the.min i.root = i.grow(top.rows, BAG(here=top,at=None)) def grow(i,rows, t): t.left,t.right = None,None if len(rows) >= i.stop: _,at,op,cut,s = i.cut(i.data, i.data.cols.x, rows) if cut: left,right = [],[] [(left if ops[op](row.cells[at], cut) else right).append(row) for row in rows] if len(left) != len(rows) and len(right) != len(rows): t.left = i.grow(left, BAG(here=i.data.clone(left), at=at,cut=cut,txt=s,op=op)) t.right= i.grow(right, BAG(here=i.data.clone(right),at=at,cut=cut,txt=s,op=neg[op])) return t def cut(i,data,cols,rows): "Return best `div,at,op,cut,txt` that most divides the klasses in `rows`." def sym(col): "For syms, just return the one with least diversity of klasses." d = {} for row in rows: x = row.at(col) if x != "?": d[x] = d.get(x, None) or SYM() d[x].cut = x d[x].add(row.klass) best = sorted(d.values(), key=lambda s:s.div())[0] return best.div(), col.at,"==",best.cut if best.n < col.n else None,col.txt #----------- def num(col): "For nums, just return the cut that most reduces expected diversity." 
X = lambda row: row.at(col) eps = col.div()*the.cohen small = len(rows)**the.min all = sorted([row for row in rows if X(row) != "?"], key=X) d1,d2,a,z = SYM(), SYM(), X(all[0]), X(all[-1]) [d2.add(row.klass) for row in all] cut,n2,lo = None,len(all),d2.div() for n1,row in enumerate(all): n2, x, y = n2-1, X(row), row.klass d2.sub( d1.add(y) ) if n1 > small and n2 > small and x != X(all[n1+1]) and x-a > eps and z-x > eps: xpect = (d1.div()*n1 + d2.div()*n2)/(n1+n2) if xpect < lo: cut,lo = x,xpect return lo,col.at,"<=",cut,col.txt #------------------------------------ return sorted([(num(col) if isa(col,NUM) else sym(col)) for col in cols])[0] def show(i): for lvl,t,isLeaf in i.nodes(): if lvl==0: print("") print(t.here.stats().mid,end="") else: print("\n"+("|.. " * lvl)+ f"{t.txt} {t.op} {t.cut} ",end="") if isLeaf: print(t.here.stats().mid,end="") print("") def nodes(i,t=None,lvl=0): t = t or i.root yield lvl,t,t.left==None for t1 in [t.left,t.right]: if t1: for lvl2,t2,isLeaf in i.nodes(t1,lvl+1): yield lvl2,t2, isLeaf
Methods
def grow(i, rows, t)
-
Expand source code
def grow(i,rows, t): t.left,t.right = None,None if len(rows) >= i.stop: _,at,op,cut,s = i.cut(i.data, i.data.cols.x, rows) if cut: left,right = [],[] [(left if ops[op](row.cells[at], cut) else right).append(row) for row in rows] if len(left) != len(rows) and len(right) != len(rows): t.left = i.grow(left, BAG(here=i.data.clone(left), at=at,cut=cut,txt=s,op=op)) t.right= i.grow(right, BAG(here=i.data.clone(right),at=at,cut=cut,txt=s,op=neg[op])) return t
def cut(i, data, cols, rows)
-
Return best `div,at,op,cut,txt` that most divides the klasses in `rows`.
Expand source code
def cut(i,data,cols,rows): "Return best `div,at,op,cut,txt` that most divides the klasses in `rows`." def sym(col): "For syms, just return the one with least diversity of klasses." d = {} for row in rows: x = row.at(col) if x != "?": d[x] = d.get(x, None) or SYM() d[x].cut = x d[x].add(row.klass) best = sorted(d.values(), key=lambda s:s.div())[0] return best.div(), col.at,"==",best.cut if best.n < col.n else None,col.txt #----------- def num(col): "For nums, just return the cut that most reduces expected diversity." X = lambda row: row.at(col) eps = col.div()*the.cohen small = len(rows)**the.min all = sorted([row for row in rows if X(row) != "?"], key=X) d1,d2,a,z = SYM(), SYM(), X(all[0]), X(all[-1]) [d2.add(row.klass) for row in all] cut,n2,lo = None,len(all),d2.div() for n1,row in enumerate(all): n2, x, y = n2-1, X(row), row.klass d2.sub( d1.add(y) ) if n1 > small and n2 > small and x != X(all[n1+1]) and x-a > eps and z-x > eps: xpect = (d1.div()*n1 + d2.div()*n2)/(n1+n2) if xpect < lo: cut,lo = x,xpect return lo,col.at,"<=",cut,col.txt #------------------------------------ return sorted([(num(col) if isa(col,NUM) else sym(col)) for col in cols])[0]
def show(i)
-
Expand source code
def show(i): for lvl,t,isLeaf in i.nodes(): if lvl==0: print("") print(t.here.stats().mid,end="") else: print("\n"+("|.. " * lvl)+ f"{t.txt} {t.op} {t.cut} ",end="") if isLeaf: print(t.here.stats().mid,end="") print("")
def nodes(i, t=None, lvl=0)
-
Expand source code
def nodes(i,t=None,lvl=0): t = t or i.root yield lvl,t,t.left==None for t1 in [t.left,t.right]: if t1: for lvl2,t2,isLeaf in i.nodes(t1,lvl+1): yield lvl2,t2, isLeaf
class BAG (*args, **kwargs)
-
Dictionaries that can be accessed via `x["slot"]` or `x.slot`.
Expand source code
class BAG(dict): "Dictionaries that can be accessed via `x[\"slot\"]` or `x.slot`." __getattr__ = dict.get
Ancestors
- builtins.dict
Subclasses
class SETTINGS (s)
-
Parse settings from string.
Expand source code
class SETTINGS(BAG): "Parse settings from string." def __init__(i,s): for m in re.finditer(r"\n\s*-\w+\s*--(\w+)[^=]*=\s*(\S+)",s): i[m[1]] = coerce(m[2]) def cli(i): """For k,v in i, update `v` if there is a command-line flag `-k[0]` or `--k`. Note that bookean values need no argument (since we will just negate `v`).""" for k,v in i.items(): v = str(v) for j,x in enumerate(sys.argv): if ("-"+k[0]) == x or ("--"+k) == x: v = "True" if v=="False" else ("False" if v=="True" else sys.argv[j+1]) i[k] = coerce(v) def showHelp(i,s): "pretty print help string" def bold(m): return colored(m[0], attrs=["bold"]) def bright(m): return colored(m[0], "light_yellow") def pretty(s): return re.sub("\n[A-Z][A-Z]+:", bold, re.sub(" [-][-]?[\S]+", bright, s)) print(pretty(s)) print(pretty("\nACTIONS:")) [print(pretty(f" -g {k:8} {f.__doc__}")) for k,f in Egs.all.items() if k[0].isupper()]
Ancestors
- BAG
- builtins.dict
Methods
def cli(i)
-
For k,v in i, update `v` if there is a command-line flag `-k[0]` or `--k`. Note that boolean values need no argument (since we will just negate `v`).
Expand source code
def cli(i): """For k,v in i, update `v` if there is a command-line flag `-k[0]` or `--k`. Note that bookean values need no argument (since we will just negate `v`).""" for k,v in i.items(): v = str(v) for j,x in enumerate(sys.argv): if ("-"+k[0]) == x or ("--"+k) == x: v = "True" if v=="False" else ("False" if v=="True" else sys.argv[j+1]) i[k] = coerce(v)
def showHelp(i, s)
-
pretty print help string
Expand source code
def showHelp(i,s): "pretty print help string" def bold(m): return colored(m[0], attrs=["bold"]) def bright(m): return colored(m[0], "light_yellow") def pretty(s): return re.sub("\n[A-Z][A-Z]+:", bold, re.sub(" [-][-]?[\S]+", bright, s)) print(pretty(s)) print(pretty("\nACTIONS:")) [print(pretty(f" -g {k:8} {f.__doc__}")) for k,f in Egs.all.items() if k[0].isupper()]
class Egs
-
Place to store the examples.
Expand source code
class Egs:
  "Place to store the examples."
  # Inside a class body `locals()` is the class namespace, so `all` also
  # sees every name defined further down. `ok` walks it and runs the
  # entries whose name starts with a capital letter, in definition order.
  all = locals()
  csv = [
    "pom.csv",
    "nasa93dem.csv",
    "healthCloseIsses12mths0011-easy.csv",
    "healthCloseIsses12mths0001-hard.csv",
    "coc10000.csv",
    "coc1000.csv",
    "china.csv",
    "auto93.csv",
    "auto2.csv",
    "SSN.csv",
    "SSM.csv"]

  # NOTE: the one-line docstrings of the capitalised examples below are
  # printed via `f.__doc__` when help is shown, so they are kept short.

  def ok():
    """Run every capitalised example, reporting PASS/FAIL for each.
    Exits the process with the number of failures as its status."""
    saved = dict(the.items())
    fails = 0
    for what, fun in Egs.all.items():
      if not what[0].isupper(): continue
      yell(what + " ", "yellow")
      fail = Egs.failure(saved, fun)
      if fail: yell(" FAIL\n", "red")
      else:    yell(" PASS\n", "green")
      fails += fail
    yell(f"TOTAL FAILURE(s) = {fails}\n", "red" if fails > 0 else "cyan")
    sys.exit(fails)

  def failure(saved, fun):
    """Called by `Egs.ok`. `fun` fails if it returns `False` or if it
    crashes. If it crashes, print the stack dump but then continue on.
    Before running it, reset the settings to `saved` and re-seed `random`.
    Returns True on failure, else False."""
    for k, v in saved.items(): the[k] = v
    random.seed(the.seed)
    try:
      # `==` (not `is`) is kept on purpose: anything equal to False fails.
      return fun() == False
    except Exception:
      # Not a bare `except:`, so Ctrl-C and sys.exit() still get through.
      traceback.print_exc()
      return True

  def The():
    "print the settings"
    print(the)

  def Rnd():
    "rnd to 2 decimals"
    return 3.14 == rnd(math.pi, 2)

  def Num(txt=""):
    "test NUMs"
    n = NUM(txt)
    for _ in range(10**4):
      n.add(R()**.5)
    # Returns False if either check fails, else the NUM itself.
    return .66 < n.mid() < .67 and .23 < n.div() < .24 and n

  def Sym(txt=""):
    "test SYMs"
    s = SYM(txt)
    for ch in "aaaabbc":
      s.add(ch)
    # Returns False if either check fails, else the SYM itself.
    return "a" == s.mid() and 1.37 <= s.div() < 1.38 and s

  def Rows():
    "Check we can load rows from file."
    print(the.file)
    # Read the whole file (so every row gets parsed) but only show five.
    for row in list(csv(the.file))[:5]:
      print(row)

  def Col():
    "Check we can convert names to NUMs and SYMs."
    for col in COLS(["name","Age","Weight-"]).all:
      print(col)

  def Data():
    "Can we load data and get its stats?"
    DATA(csv(the.file)).stats()

  def Clone():
    "Can we replicate a DATA's structure?"
    d1 = DATA(csv(the.file))
    d2 = d1.clone(d1.rows)
    print(d1.cols.y[1])
    print(d2.cols.y[1])

  def Sorts():
    "Can we sort rows into `best` and `rest`?"
    d = DATA(csv(the.file))
    lst = d.sorted()
    m = int(len(lst)**.5)
    # `best` is cloned once (it used to be built twice in a row).
    best = d.clone(lst[-m:])
    print("all ", d.stats())
    print("best", best.stats())
    rest = d.clone(lst[:m*the.rest])
    print("rest", rest.stats())

  def Nodes():
    "Does the TREE iterator work?"
    t1 = TREE(DATA(csv(the.file)))
    for lvl, _, isLeaf in t1.nodes():
      print("|.. " * lvl, isLeaf)

  def Tree():
    "Does the TREE pretty print work?"
    TREE(DATA(csv(the.file))).show()

  def Trees():
    "Test all tree generation of all csv files."
    for f in Egs.csv:
      print("\n\n-----------", f)
      TREE(DATA(csv(f"../data/{f}"))).show()
Class variables
var all
var csv
Methods
def ok()
-
Run everything (except ok,h). Return how often something fails.
Expand source code
def ok():
  """Run every capitalised example, reporting PASS/FAIL for each.
  Exits the process with the number of failures as its status."""
  saved = dict(the.items())   # snapshot handed to `Egs.failure` for each run
  fails = 0
  for what, fun in Egs.all.items():
    if not what[0].isupper():
      continue
    yell(what + " ", "yellow")
    fail = Egs.failure(saved, fun)
    if fail:
      yell(" FAIL\n", "red")
    else:
      yell(" PASS\n", "green")
    fails += fail
  colour = "red" if fails > 0 else "cyan"
  yell(f"TOTAL FAILURE(s) = {fails}\n", colour)
  sys.exit(fails)
def failure(saved, fun)
-
Called by `Egs.ok()`. `Fun` fails if it returns `False` or if it crashes. If it crashes, print the stack dump but then continue on. Before running it, reset the system to initial conditions.
Expand source code
def failure(saved, fun):
  """Called by `Egs.ok`. `fun` fails if it returns `False` or if it
  crashes. If it crashes, print the stack dump but then continue on.
  Before running it, reset the settings to `saved` and re-seed `random`.
  Returns True on failure, else False."""
  for k, v in saved.items(): the[k] = v
  random.seed(the.seed)
  try:
    # `==` (not `is`) is kept on purpose: anything equal to False fails.
    return fun() == False
  except Exception:
    # Not a bare `except:`, so Ctrl-C and sys.exit() still get through.
    traceback.print_exc()
    return True
def The()
-
print the settings
Expand source code
def The():
  "print the settings"
  # Nothing is returned, so this example can only fail by raising.
  print(the)
def Rnd()
-
rnd to 2 decimals
Expand source code
def Rnd():
  "rnd to 2 decimals"
  # Passes when pi rounded to two places equals 3.14.
  rounded = rnd(math.pi, 2)
  return 3.14 == rounded
def Num(txt='')
-
test NUMs
Expand source code
def Num(txt=""):
  "test NUMs"
  n = NUM(txt)
  # Feed in 10,000 square roots of whatever `R()` yields.
  for _ in range(10**4):
    n.add(R()**.5)
  # Returns False if either range check fails, else the NUM itself;
  # `div` is only consulted when the `mid` check passed.
  mid_ok = .66 < n.mid() < .67
  return mid_ok and .23 < n.div() < .24 and n
def Sym(txt='')
-
test SYMs
Expand source code
def Sym(txt=""):
  "test SYMs"
  s = SYM(txt)
  for ch in "aaaabbc":
    s.add(ch)
  # Returns False if either check fails, else the SYM itself;
  # `div` is only consulted when the `mid` check passed.
  mid_ok = "a" == s.mid()
  return mid_ok and 1.37 <= s.div() < 1.38 and s
def Rows()
-
Check we can load rows from file.
Expand source code
def Rows():
  "Check we can load rows from file."
  print(the.file)
  # Read the whole file (so every row gets parsed) but only show five.
  rows = list(csv(the.file))
  for row in rows[:5]:
    print(row)
def Col()
-
Check we can convert names to NUMs and SYMs.
Expand source code
def Col():
  "Check we can convert names to NUMs and SYMs."
  cols = COLS(["name","Age","Weight-"])
  # Print one line per column held in `cols.all`.
  for col in cols.all:
    print(col)
def Data()
-
Can we load data and get its stats?
Expand source code
def Data():
  "Can we load data and get its stats?"
  # The stats are computed but neither printed nor returned, so this
  # example can only fail by raising.
  data = DATA(csv(the.file))
  data.stats()
def Clone()
-
Can we replicate a DATA's structure?
Expand source code
def Clone():
  "Can we replicate a DATA's structure?"
  d1 = DATA(csv(the.file))
  d2 = d1.clone(d1.rows)
  # Show the same `y` column (index 1) of the original and of the copy.
  for d in (d1, d2):
    print(d.cols.y[1])
def Sorts()
-
Can we sort rows into `best` and `rest`?
Expand source code
def Sorts():
  "Can we sort rows into `best` and `rest`?"
  d = DATA(csv(the.file))
  lst = d.sorted()
  m = int(len(lst)**.5)            # size of `best`: sqrt of the row count
  # `best` is cloned once (it used to be built twice in a row); the clone
  # still happens before the "all" report, as before.
  best = d.clone(lst[-m:])         # the last `m` rows of the sorted list
  print("all ", d.stats())
  print("best", best.stats())
  rest = d.clone(lst[:m*the.rest]) # the first `m*the.rest` rows
  print("rest", rest.stats())
def Nodes()
-
Does the TREE iterator work?
Expand source code
def Nodes():
  "Does the TREE iterator work?"
  tree = TREE(DATA(csv(the.file)))
  # Each item from `nodes()` is a triple; the middle element is not used here.
  for lvl, _, isLeaf in tree.nodes():
    indent = "|.. " * lvl
    print(indent, isLeaf)
def Tree()
-
Does the TREE pretty print work?
Expand source code
def Tree():
  "Does the TREE pretty print work?"
  data = DATA(csv(the.file))
  tree = TREE(data)
  tree.show()
def Trees()
-
Test all tree generation of all csv files.
Expand source code
def Trees():
  "Test all tree generation of all csv files."
  # Every file named in `Egs.csv` is looked up under `../data/`.
  for f in Egs.csv:
    print("\n\n-----------", f)
    data = DATA(csv(f"../data/{f}"))
    TREE(data).show()