def union(R, S):
return R + S
def difference(R, S):
return [t for t in R if t not in S]
def intersect(R, S):
return [t for t in R if t in S]
def project(R, p):
return [p(t) for t in R]
def select(R, s):
return [t for t in R if s(t)]
def product(R, S):
return [(t,u) for t in R for u in S]
def aggregate(R, f): # R must contain (key, value) tuples.
    keys = {r[0] for r in R}
    return [(key, f([v for (k,v) in R if k == key])) for key in keys]
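As a quick illustration of the set-like operations (the inputs below are arbitrary; note that this union implementation simply concatenates, so duplicates are retained):
>>> union([1, 2], [2, 3])
[1, 2, 2, 3]
>>> difference([1, 2], [2, 3])
[1]
>>> intersect([1, 2], [2, 3])
[2]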
The select operation can be used with a predicate to filter a data set.
>>> def red(t): return t == 'tomato'
>>> select(['banana', 'tomato'], red)
['tomato']
>>> X = [('Alice', 22), ('Bob', 19)]
>>> Y = [('Alice', 'F'), ('Bob', 'M')]
>>> product(Y,X)
[(('Alice', 'F'), ('Alice', 22)), (('Alice', 'F'), ('Bob', 19)), (('Bob', 'M'), ('Alice', 22)), (('Bob', 'M'), ('Bob', 19))]
>>> select(product(Y,X), lambda t: t[0][0] == t[1][0])
[(('Alice', 'F'), ('Alice', 22)), (('Bob', 'M'), ('Bob', 19))]
>>> project(select(product(Y,X), lambda t: t[0][0] == t[1][0]), lambda t: (t[0][0], t[0][1], t[1][1]))
[('Alice', 'F', 22), ('Bob', 'M', 19)]
>>> X = [('Alice', 'F', 22), ('Bob', 'M', 19), ('Carl', 'M', 25), ('Eve', 'F', 27)]
>>> project(X, lambda t: (t[1], t[2]))
[('F', 22), ('M', 19), ('M', 25), ('F', 27)]
>>> aggregate(project(X, lambda t: (t[1], t[2])), sum)
[('F', 49), ('M', 44)]
The steps below illustrate how the aggregate operation collects all of the values associated with a particular key and then applies the aggregation function for that key.
>>> Y = project(X, lambda t: (t[1], t[2]))
>>> keys = {t[0] for t in Y}
>>> keys
{'F', 'M'}
>>> [v for (k,v) in Y if k == 'F']
[22, 27]
>>> sum([v for (k,v) in Y if k == 'F'])
49
>>> ('F', sum([v for (k,v) in Y if k == 'F']))
('F', 49)
Suppose we have a data set D containing some voting results broken down by voter, the state where the voter participated in voting, and the candidate they chose:
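For concreteness, the entries below are a small, made-up sample of what D might contain:

D = [('Alice', 'Massachusetts', 'Trump'),
     ('Bob', 'Massachusetts', 'Clinton'),
     ('Carl', 'Massachusetts', 'Trump'),
     ('Eve', 'New Hampshire', 'Trump')]

On this sample, both formulations below count the two matching entries (the second one produces the pair ('Trump', 2) rather than a bare count).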
R = sum([1 for (person, state, candidate) in D if state == "Massachusetts" and candidate == "Trump"])
R = aggregate(project(select(D, lambda psc: psc[1] == "Massachusetts" and psc[2] == "Trump"), lambda psc: (psc[2], 1)), sum)
def map(f, R):
return [t for (k,v) in R for t in f(k,v)]
def reduce(f, R):
keys = {k for (k,v) in R}
return [f(k1, [v for (k2,v) in R if k1 == k2]) for k1 in keys]
A map operation applies some function f to every key-value tuple and produces zero or more new key-value tuples. A reduce operation collects all of the values under the same key and performs an aggregate operation f on those values. Notice that the reduce operation can be applied to any subset of the tuples in any order, so it is often necessary to use an operation that is associative and commutative.
R = [('Alice', ('F', 23)), ('Bob', ('M', 19)), ('Carl', ('M', 22))]
# Projection keeps only gender and age.
# The original key (the name) is discarded.
X = map(lambda k,v: [(v[0], v[1])], R)
# Aggregation by the new key (i.e., gender).
Y = reduce(lambda k,vs: (k, sum(vs)), X)
R = [('Alice', 23), ('Bob', 19), ('Carl', 22)]
X = map(lambda k,v: [((k,v), (k,v))] if v > 20 else [], R) # Selection.
Y = reduce(lambda k,vs: k, X) # Keep same tuples (use tuples as unique keys).
R = [('Alice', 23), ('Bob', 19), ('Carl', 22)]
S = [('Alice', 'F'), ('Bob', 'M'), ('Carl', 'M')]
X = map(lambda k,v: [(k, ('Age', v))], R)\
+ map(lambda k,v: [(k, ('Gender', v))], S)
Y = reduce(\
lambda k,vs:\
(k,(vs[0][1], vs[1][1]) if vs[0][0] == 'Age' else (vs[1][1],vs[0][1])),\
X\
)
INPUT = [('Alice', ('F', 19)),\
('Bob', ('M', 23)),\
('Carl', ('M', 20)),\
('Eve', ('F', 27))]
TEMP = map(lambda k,v: [(v[0], v[1])], INPUT)
OUTPUT = reduce(lambda k,vs: (k, sum(vs)), TEMP)
db.INPUT.insert({_id:"Alice", gender:"F", age:19});
db.INPUT.insert({_id:"Bob", gender:"M", age:23});
db.INPUT.insert({_id:"Carl", gender:"M", age:20});
db.INPUT.insert({_id:"Eve", gender:"F", age:27});
db.INPUT.mapReduce(
function() {
emit(this.gender, {age:this.age});
},
function(k, vs) {
var total = 0;
for (var i = 0; i < vs.length; i++)
total += vs[i].age;
return {age:total};
},
{out: "OUTPUT"}
);
NCA = [('Alice', 'Boston', 23), ('Bob', 'Boston', 19), ('Carl', 'Seattle', 25)]
CA = [(c,a) for (n,c,a) in NCA] # Keep only the city and the age.
MIN = aggregate(CA, min) # Minimum age per city.
MIN_NEG = [(c,-1*a) for (c,a) in MIN] # Negate the minimums.
MAX = aggregate(CA, max) # Maximum age per city.
RESULT = aggregate(union(MIN_NEG, MAX), sum) # Age range (max minus min) per city.
For example, for an input entry of the form ('Alice', ('Boston', 23)), in the mapping stage we might estimate the range as ('Boston', (23, 23, 0)), where the second and third entries are the minimum and maximum "known so far" given the limited information (a single data point). Then, in the reduce stage, we would combine these estimates.
NCA = [('Alice', ('Boston', 23)), ('Bob', ('Boston', 19)), ('Carl', ('Seattle', 25))]
I = map(lambda k, v: [(v[0], (v[1], v[1], 0))], NCA)
def reducer(k, vs):
age_lo = min([l for (l,h,r) in vs])
age_hi = max([h for (l,h,r) in vs])
age_ran = age_hi - age_lo
return (k, (age_lo, age_hi, age_ran))
RESULT = reduce(reducer, I)
D = [('Commonwealth Ave./Mass Ave.', '2016-11-02', '11:34:00', 3), ...]
M = project(D, lambda idtc: (idtc[0], idtc[3])) # Keep only the intersection and the count.
R = aggregate(M, max) # Maximum count per intersection.
H = [(i,d,t) for ((i,m), (j,d,t,c)) in product(R, D) if i==j and m==c] # Date and time at which each maximum occurred.
def dist(p, q):
(x1,y1) = p
(x2,y2) = q
return (x1-x2)**2 + (y1-y2)**2
def plus(args):
p = [0,0]
for (x,y) in args:
p[0] += x
p[1] += y
return tuple(p)
def scale(p, c):
(x,y) = p
return (x/c, y/c)
M = [(13,1), (2,12)] # Initial means.
P = [(1,2),(4,5),(1,3),(10,12),(13,14),(13,9),(11,11)]
OLD = []
while OLD != M:
    OLD = M
    MPD = [(m, p, dist(m,p)) for (m, p) in product(M, P)] # Distance from every mean to every point.
    PDs = [(p, dist(m,p)) for (m, p, d) in MPD]
    PD = aggregate(PDs, min) # Distance from each point to its closest mean.
    MP = [(m, p) for ((m,p,d), (p2,d2)) in product(MPD, PD) if p==p2 and d==d2] # Assign each point to its closest mean.
    MT = aggregate(MP, plus) # Vector sum of the points assigned to each mean.
    M1 = [(m, 1) for (m, _) in MP]
    MC = aggregate(M1, sum) # Number of points assigned to each mean.
    M = [scale(t,c) for ((m,t),(m2,c)) in product(MT, MC) if m == m2] # New mean: average of its assigned points.
print(sorted(M))
The comprehension [(m, p) for ((m,p,d), (p2,d2)) in product(MPD, PD) if p==p2 and d==d2] first filters product(MPD, PD) using the selection criterion p==p2 and d==d2, and then performs a projection from tuples of the form ((m,p,d), (p2,d2)) to tuples of the form (m, p) to obtain the result.
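As a sketch, the same step can be written directly in terms of the product, select, and project operations defined earlier (this is only a restatement of the comprehension above, not a change to the algorithm):

MP = project(
    select(product(MPD, PD), lambda t: t[0][1] == t[1][0] and t[0][2] == t[1][1]),
    lambda t: (t[0][0], t[0][1])
)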
db.system.js.save({ _id:"dist" , value:function(u, v) {
return Math.pow(u.x - v.x, 2) + Math.pow(u.y - v.y, 2);
}});
function flatten(A) {
db[A].find().forEach(function(a) { db[A].update({_id: a._id}, a.value); });
}
function prod(A, B, AB) {
db[AB].remove({});
db.createCollection(AB);
db[A].find().forEach(function(a) {
db[B].find().forEach(function(b) {
db[AB].insert({left:a, right:b});
});
});
}
function union(A, B, AB) {
db[AB].remove({});
db.createCollection(AB);
db[A].find().forEach(function(a) {
db[AB].insert(a);
});
db[B].find().forEach(function(b) {
db[AB].insert(b);
});
}
function hash_means(M, HASH) {
db[M].mapReduce(
function() { emit("hash", {hash: this.x + this.y}); },
function(k, vs) {
var hash = 0;
vs.forEach(function(v) {
hash += v.hash;
});
return {hash: hash};
},
{out: HASH}
);
}
// We'll only perform a single product operation. Using map-reduce, we can perform
// argmax and argmin more easily. We can also use map-reduce to compare progress.
db.M.remove({});
db.M.insert([{x:13,y:1},{x:2,y:12}]);
db.P.remove({});
db.P.insert([{x:1,y:2},{x:4,y:5},{x:1,y:3},{x:10,y:12},{x:13,y:14},{x:13,y:9},{x:11,y:11}]);
var iterations = 0;
do {
// Compute an initial hash of the means in order to have a baseline
// against which to compare when deciding whether to loop again.
hash_means("M", "HASHOLD");
prod("M", "P", "MP");
// At this point, entries in MP have the form
// {_id:..., left:{x:13,y:1}, right:{x:4,y:5}}.
// For each point, find the distance to the closest mean. The output after
// flattening has entries of the form {_id:{x:?, y:?}, m:{x:?, y:?}, d:?}
// where the identifier is the point.
db.MPs.remove({});
db.MP.mapReduce(
function() {
var point = {x:this.right.x, y:this.right.y};
var mean = {x:this.left.x, y:this.left.y};
emit(point, {m:mean, d:dist(point, mean)});
},
function(point, vs) {
// Each entry in vs is of the form {m:{x:?, y:?}, d:?}.
// We return the one that is closest to point.
var j = 0;
vs.forEach(function(v, i) {
if (v.d < vs[j].d)
j = i;
});
return vs[j]; // Has form {m:{x:?, y:?}, d:?}.
},
{out: "MPs"}
);
// At this point, entries in MPs have the form
// {_id:{x:2, y:3}, value:{m:{x:4, y:5}, d:1}}.
flatten("MPs");
// At this point, entries in MPs have the form
// {_id:{x:2, y:3}, m:{x:4, y:5}, d:1}.
// For each mean (i.e., key), compute the average of all the points that were
// "assigned" to that mean (because it was the closest mean to that point).
db.MPs.mapReduce(
function() {
// The key is the mean and the value is the point together with its counter.
var point = this._id;
var point_with_count = {x:point.x, y:point.y, c:1};
var mean = this.m;
emit(mean, point_with_count);
},
function(key, vs) {
// Remember that the reduce operations will be applied to the values for each key
// in some arbitrary order, so our aggregation operation must be commutative (in
// this case, it is vector addition).
var x = 0, y = 0, c = 0;
vs.forEach(function(v, i) {
x += v.x;
y += v.y;
c += v.c;
});
return {x:x, y:y, c:c};
},
{ finalize: function(k, v) { return {x: v.x/v.c, y: v.y/v.c}; },
out: "M"
}
);
// At this point, entries in M have the form
// {_id:{x:2, y:3}, value:{x:4, y:5}}.
flatten("M");
// At this point, entries in M have the form
// {_id:{x:2, y:3}, x:4, y:5}. The identifier
// value does not matter as long as it is unique.
// Compute the hash of the new set of means.
hash_means("M", "HASHNEW");
// Extract the two hashes in order to compare them in the loop condition.
var hashold = db.HASHOLD.find({}).limit(1).toArray()[0].value.hash;
var hashnew = db.HASHNEW.find({}).limit(1).toArray()[0].value.hash;
print(hashold);
print(hashnew);
print(iterations);
iterations++;
} while (hashold != hashnew);
The implementation below uses .updateMany() to distribute the means to all the points at the beginning of each iteration. This leads to a much more concise (and, for a small number of means, efficient) implementation of the algorithm than the one presented in the previous example. In particular, it is no longer necessary to encode a product operation within MongoDB.
db.system.js.save({ _id:"dist" , value:function(u, v) {
return Math.pow(u.x - v.x, 2) + Math.pow(u.y - v.y, 2);
}});
db.P.insert([{x:1,y:2},{x:4,y:5},{x:1,y:3},{x:10,y:12},{x:13,y:14},{x:13,y:9},{x:11,y:11}]);
var means = [{x:13,y:1}, {x:2,y:12}];
do {
db.P.updateMany({}, {$set: {means: means}}); // Add a field to every object.
db.P.mapReduce(
function() {
var closest = this.means[0];
for (var i = 0; i < this.means.length; i++)
if (dist(this.means[i], this) < dist(closest, this))
closest = this.means[i];
emit(closest, {x:this.x, y:this.y, c:1});
},
function(key, vs) {
var x = 0, y = 0, c = 0;
vs.forEach(function(v, i) {
x += v.x;
y += v.y;
c += v.c;
});
return {x:x, y:y, c:c};
},
{ finalize: function(k, v) { return {x: v.x/v.c, y: v.y/v.c}; },
out: "M"
}
);
means = db.M.find().toArray().map(function(r) { return {x:r.value.x, y:r.value.y}; });
printjson(means);
} while (true); // Runs indefinitely; in practice, stop once the printed means no longer change.
import pymongo
import bson.code
client = pymongo.MongoClient()
db = client.local
db.system.js.save({'_id':'dist', 'value': bson.code.Code("""
function(u, v) {
return Math.pow(u.x - v.x, 2) + Math.pow(u.y - v.y, 2);
}
""")})
db.P.insert_many([{'x':1,'y':2},{'x':4,'y':5},{'x':1,'y':3},{'x':10,'y':12},\
{'x':13,'y':14},{'x':13,'y':9},{'x':11,'y':11}])
means = [{'x':13,'y':1}, {'x':2,'y':12}]
while True:
db.P.update_many({}, {'$set': {'means': means}})
mapper = bson.code.Code("""
function() {
var closest = this.means[0];
for (var i = 0; i < this.means.length; i++)
if (dist(this.means[i], this) < dist(closest, this))
closest = this.means[i];
emit(closest, {x:this.x, y:this.y, c:1});
}
""")
reducer = bson.code.Code("""
function(key, vs) {
var x = 0, y = 0, c = 0;
vs.forEach(function(v, i) {
x += v.x;
y += v.y;
c += v.c;
});
return {x:x, y:y, c:c};
}
""")
finalizer = bson.code.Code("""
function(k, v) { return {x: v.x/v.c, y: v.y/v.c}; }
""")
db.P.map_reduce(mapper, reducer, "M", finalize = finalizer)
means = [{'x':t['value']['x'], 'y':t['value']['y']} for t in db.M.find()]
print(means)
N = ['a','b','c','d','e','f']
E = [('a','b'),('b','c'),('a','c'),('c','d'),('d','e'),('e','f'),('b','f')]
oo = float('inf') # This represents infinite distance.
P = product(N,N)
I = [((x,y),oo if x != y else 0) for (x,y) in P] # Zero distance to self, infinite distance to others.
D = [((x,y),1) for (x,y) in E] # Edge-connected nodes are one apart.
OUTPUT = aggregate(union(I,D), min)
STEP = []
while sorted(STEP) != sorted(OUTPUT):
STEP = OUTPUT
P = product(STEP, STEP) # All pairs of edges.
NEW = union(STEP,[((x,v),k+m) for (((x,y),k),((u,v),m)) in P if u == y]) # Add distances of connected edge pairs.
OUTPUT = aggregate(NEW, min) # Keep only shortest node-node distance entries.
SHORTEST = OUTPUT
|            | coarse           | fine                                             |
| how        | transformation   | execution path through transformation algorithm |
| from where | source data sets | specific entries in source data sets             |

| transformation f                                     | per-entry provenance reconstruction approach                                                        | complexity for input data set with n entries                        |
| union, intersection, difference, selection, product  | linear search over input data set to find entry                                                    | O(n) entry equality checks                                          |
| projection                                           | application of transformation once to each input data set entry                                    | O(n) executions of f and O(n) entry equality checks                 |
| aggregation                                          | application of transformation once to each input data set entry and construction of key-to-key map | O(n) executions of f and O(n log n) to build and use key-to-key map |
from itertools import combinations, chain
def powerset(iterable):
"powerset([1,2,3]) -> () (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)"
s = list(iterable)
return chain.from_iterable(combinations(s, r) for r in range(len(s)+1))
def general_prov(f, X, y):
    for Y in reversed(list(powerset(X))): # From largest to smallest subset.
        if list(f(list(Y))) == [y]:
            return Y
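As a usage sketch (the data set and transformation below are made up for illustration), we can recover which input entries account for one entry of an aggregated output:

X = [('F', 22), ('M', 19), ('F', 27)]
f = lambda R: aggregate(R, sum)
print(general_prov(f, X, ('F', 49))) # (('F', 22), ('F', 27))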
def context_free_prov(f, X, y):
# Build the partitions of the input data set.
partitions = []
for (x_key, x_val) in X:
found = False
for i in range(0,len(partitions)):
if len(f(partitions[i] + [(x_key, x_val)])) == 1:
partitions[i].append((x_key, x_val))
found = True
break
# Create a new partition if adding to any other
# partition increases the size of the output data set.
if found == False:
partitions.append([(x_key, x_val)])
# Find the corresponding partition.
for partition in partitions:
if y in f(partition):
return partition
def key_preserving_prov(f, X, y):
keymap = {}
# Build up the map from input data set
# keys to output data set keys.
for (x_key, x_val) in X:
(y_key, y_val) = f([(x_key, x_val)])[0]
keymap[x_key] = y_key
(y_key, y_val) = y
# Collect all the tuples that contribute
# to the target result.
pre_image = set()
for (x_key, x_val) in X:
if keymap[x_key] == y_key:
pre_image.add((x_key, x_val))
return pre_image
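The same made-up example, reconstructed via the key-preserving approach (which avoids enumerating subsets of the input):

X = [('F', 22), ('M', 19), ('F', 27)]
f = lambda R: aggregate(R, sum)
print(key_preserving_prov(f, X, ('F', 49))) # {('F', 22), ('F', 27)}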
The example below uses the prov Python package to programmatically assemble a coarse-granularity provenance record that conforms to the PROV standard and describes a particular execution of this script.
import uuid
import datetime
import prov.model

startTime = datetime.datetime.now() # Placeholder timestamps for this example run.
endTime = datetime.datetime.now()
doc = prov.model.ProvDocument()
doc.add_namespace('alg', 'http://datamechanics.io/algorithm/alice_bob/') # The scripts (here under the alice_bob folder).
doc.add_namespace('dat', 'http://datamechanics.io/data/alice_bob/') # The data sets (here under the alice_bob folder).
doc.add_namespace('ont', 'http://datamechanics.io/ontology#')
doc.add_namespace('log', 'http://datamechanics.io/log#') # The event log.
doc.add_namespace('bdp', 'https://data.cityofboston.gov/resource/')
this_script = doc.agent('alg:example', {prov.model.PROV_TYPE:prov.model.PROV['SoftwareAgent'], 'ont:Extension':'py'})
resource = doc.entity('bdp:wc8w-nujj',
{'prov:label':'311, Service Requests',
prov.model.PROV_TYPE:'ont:DataResource', 'ont:Extension':'json'}
)
this_run = doc.activity(
'log:a'+str(uuid.uuid4()), startTime, endTime,
{prov.model.PROV_TYPE:'ont:Retrieval', 'ont:Query':'?type=Animal+Found&$select=type,latitude,longitude,OPEN_DT'}
)
doc.wasAssociatedWith(this_run, this_script)
doc.used(this_run, resource, startTime)
found = doc.entity('dat:found', {prov.model.PROV_LABEL:'Animals Found', prov.model.PROV_TYPE:'ont:DataSet'})
doc.wasAttributedTo(found, this_script)
doc.wasGeneratedBy(found, this_run, endTime)
doc.wasDerivedFrom(found, resource, this_run, this_run, this_run)
doc.serialize()
def argmax(X, f):
    Y = [f(x) for x in X] # Projection.
    y_max = max(Y) # Aggregation (maximum value of f over X).
    XF = [(x, f(x)) for x in X] # Projection.
    xs = [x for (x,y) in XF if y == y_max] # Selection.
    return xs[0]
P = [(1,2),(4,5),(1,3),(10,12),(13,14),(13,9),(11,11)]
def dist(p, q):
(x1,y1) = p
(x2,y2) = q
return (x1-x2)**2 + (y1-y2)**2
The space of possible solutions would be a set S that consists of all possible positions for the mean within a particular bounded region. The metric would be the sum of the distances from all the points in P to the mean m. The solution would be the position that minimizes the metric.
S = list(product(range(0,20), range(0,20))) # Possible locations for one mean.
def metric(m):
return sum([dist(m, p) for p in P])
o = min(S, key=metric)
V = list(product(range(0,20), range(0,20))) # Possible locations for one mean.
S = list(product(V, V)) # Possible pairs of means.
def metric(M):
return sum([min([dist(m, p) for m in M]) for p in P])
o = min(S, key=metric)
import z3
(x1,x2,x3,x4,x5,x6,x7) = [z3.Real('x'+str(i)) for i in range(1,8)]
S = z3.Solver()
# Only allow non-negative flows.
for x in (x1,x2,x3,x4,x5,x6,x7):
S.add(x >= 0)
# Edge capacity constraints.
S.add(x2 <= 7, x3 <= 8, x4 <= 6)
S.add(x5 <= 3, x6 <= 4, x7 <= 5)
# Constraints derived from graph topology.
S.add(x1 == x2+x3, x2 == x4+x5, x3+x4 == x6, x5+x6 == x7)
S.add(x1 > 0) # We want a positive flow.
print(S.check())
print(S.model())
# Find the maximum feasible flow by a doubling search: try to add each
# power of two to the flow, keeping the increment only if the constraints
# remain satisfiable.
flow = 0
for i in range(5, -1, -1):
    S.push()
    S.add(x1 >= (2**i + flow))
    if str(S.check()) != 'unsat':
        flow += 2**i
    S.pop()
S.add(x1 >= flow)
print(S.check())
print(S.model())
def dot(xs, ys):
return sum([x*y for (x,y) in zip(xs, ys)])
x = [x1,x2,x3,x4,x5,x6,x7]
M = [
[ 0,-1, 0, 0, 0, 0, 0],
[ 0, 0,-1, 0, 0, 0, 0],
[ 0, 0, 0,-1, 0, 0, 0],
[ 0, 0, 0, 0,-1, 0, 0],
[ 0, 0, 0, 0, 0,-1, 0],
[ 0, 0, 0, 0, 0, 0,-1],
[ 1, 0, 0, 0, 0, 0, 0],
[ 0, 1, 0, 0, 0, 0, 0],
[ 0, 0, 1, 0, 0, 0, 0],
[ 0, 0, 0, 1, 0, 0, 0],
[ 0, 0, 0, 0, 1, 0, 0],
[ 0, 0, 0, 0, 0, 1, 0],
[ 0, 0, 0, 0, 0, 0, 1],
[-1, 1, 1, 0, 0, 0, 0],
[ 0,-1, 0, 1, 1, 0, 0],
[ 0, 0,-1,-1, 0, 1, 0],
[ 0, 0, 0, 0,-1,-1, 1]
]
b = [-7,-8,-6,-3,-4,-5,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0]
S = z3.Solver()
for i in range(len(M)):
S.add(b[i] <= dot(M[i], x))
print(S.check())
print(S.model())
from scipy.optimize import linprog
M = [
[ 1,-1,-1, 0, 0, 0, 0],
[ 0, 1, 0,-1,-1, 0, 0],
[ 0, 0, 1, 1, 0,-1, 0],
[ 0, 0, 0, 0, 1, 1,-1],
]
b = [0, 0, 0, 0]
c = [-1, 0, 0, 0, 0, 0, 0]
bounds = [(0, None), (0, 7), (0, 8), (0, 6), (0, 3), (0, 4), (0, 5)]
result = linprog(c, A_ub = M, b_ub = b, bounds=bounds, options={"disp": True})
print(result)
from random import shuffle
from math import sqrt
data = [(18, 28), (24, 18), (27, 31), (14, 15), (46, 23),
(36, 19), (27, 10), (34, 25), (19, 15), (13, 13),
(4, 2), (17, 20), (28, 12), (36, 11), (26, 14),
(19, 19), (24, 13), (25, 6), (20, 8), (17, 22),
(18, 8), (25, 12), (28, 27), (31, 28), (35, 22),
(17, 8), (19, 19), (23, 23), (22, 11)]
x = [xi for (xi, yi) in data]
y = [yi for (xi, yi) in data]
def permute(x):
shuffled = [xi for xi in x]
shuffle(shuffled)
return shuffled
def avg(x): # Average
return sum(x)/len(x)
def stddev(x): # Standard deviation.
m = avg(x)
return sqrt(sum([(xi-m)**2 for xi in x])/len(x))
def cov(x, y): # Covariance.
return sum([(xi-avg(x))*(yi-avg(y)) for (xi,yi) in zip(x,y)])/len(x)
def corr(x, y): # Correlation coefficient.
if stddev(x)*stddev(y) != 0:
return cov(x, y)/(stddev(x)*stddev(y))
def p(x, y):
c0 = corr(x, y)
corrs = []
for k in range(0, 2000):
y_permuted = permute(y)
corrs.append(corr(x, y_permuted))
return len([c for c in corrs if abs(c) >= abs(c0)])/len(corrs)
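As a usage sketch, we can report the correlation between the two variables together with the permutation-based p-value (the p-value varies slightly from run to run because the permutations are random):

print(corr(x, y))
print(p(x, y))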
The scipy.stats.pearsonr() function will return both the correlation coefficient and the p-value.
import scipy.stats
print(scipy.stats.pearsonr(x, y))
import jsonschema
from flask import Flask, jsonify, abort, make_response, request
from flask_httpauth import HTTPBasicAuth
app = Flask(__name__)
auth = HTTPBasicAuth()
users = [
{ 'id': 1, 'username': u'alice' },
{ 'id': 2, 'username': u'bob' }
]
schema = {
"type": "object",
"properties": {"username" : {"type": "string"}},
"required": ["username"]
}
@app.route('/client', methods=['OPTIONS'])
def show_api():
return jsonify(schema)
@app.route('/client', methods=['GET'])
@auth.login_required
def show_client():
return open('client.html','r').read()
@app.route('/app/api/v0.1/users', methods=['GET'])
def get_users(): # Server-side reusable name for function.
print("I'm responding.")
return jsonify({'users': users})
@app.route('/app/api/v0.1/users/<int:user_id>', methods=['GET'])
def get_user(user_id):
user = [user for user in users if user['id'] == user_id]
if len(user) == 0:
abort(404)
return jsonify({'user': user[0]})
@app.errorhandler(404)
def not_found(error):
    return make_response(jsonify({'error': 'Not found.'}), 404)
@app.route('/app/api/v0.1/users', methods=['POST'])
def create_user():
print(request.json)
if not request.json:
print('Request not valid JSON.')
abort(400)
try:
jsonschema.validate(request.json, schema)
user = { 'id': users[-1]['id'] + 1, 'username': request.json['username'] }
users.append(user)
print(users)
return jsonify({'user': user}), 201
    except jsonschema.ValidationError:
print('Request does not follow schema.')
abort(400)
@auth.get_password
def foo(username):
if username == 'alice':
return 'ecila'
return None
@auth.error_handler
def unauthorized():
return make_response(jsonify({'error': 'Unauthorized access.'}), 401)
if __name__ == '__main__':
app.run(debug=True)
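For testing, the endpoints above can be exercised from a short client script. The sketch below assumes the server is running locally on Flask's default port and that the requests package is installed; the user 'dana' is only an illustration.

import requests

# List the current users.
print(requests.get('http://localhost:5000/app/api/v0.1/users').json())

# Create a new user; the server validates the body against the JSON schema.
r = requests.post('http://localhost:5000/app/api/v0.1/users', json={'username': 'dana'})
print(r.status_code, r.json())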
<script>
function postSomething() {
var http = new XMLHttpRequest();
var url = "http://localhost:5000/app/api/v0.1/users";
    var json = JSON.stringify({'username':'carl'});
http.open("POST", url, true);
http.setRequestHeader("Content-type", "application/json");
http.onreadystatechange = function() {
        if (http.readyState == 4 && http.status == 201) {
console.log(http.responseText);
}
}
http.send(json);
}
</script>
<button onclick="postSomething();">Post new user</button>
MongoDB issue trackers related to features used in these examples: mapReduce operations (https://jira.mongodb.org/browse/SERVER-2517) and eval() and stored procedure capabilities (https://jira.mongodb.org/browse/SERVER-17453).

SciPy provides the optimize.linprog module for solving linear optimization problems, and the library is relatively straightforward to install: pip install numpy and pip install scipy should be sufficient (you may need to download and install a Fortran compiler first under some configurations).