Practical Computing for Biologists by Cliburn Chan
#To clone or copy a list in python
import copy
class Foo(object):
def __init__(self, val):
self.val = val
def __repr__(self):
return str(self.val)
foo = Foo(1)
a = ['foo', foo]
b = a[:]
c = list(a)
d = copy.copy(a)
e = copy.deepcopy(a)
# edit orignal list and instance
a.append('baz')
foo.val = 5
print('original: %r\n slice: %r\n list(): %r\n copy: %r\n deepcopy: %r'
% (a, b, c, d, e))
#Creating a list of lists (sublists changed)
myList = [[[1] * 4] for n in range(3)]
#lst1 = [1]*4; lst = [lst1]*3
print myList
myList[0][0] = 5
print myList
#check if a word is a palindrome
word = raw_input("Enter a word: ")
if word == word[::-1]:
print "%s is a palindrome!" % word
else:
print "%s is not palindrome" % word
#count the number of vowels in a word
name = raw_input("What’s your name? ")
num_vowels = 0
for vowel in ’aeiou’:
num_vowels += name.count(vowel)
print "Hello %s, there are %d vowels in your name." % (name, num_vowels)
#Substitution of regex
import re
find = r’(\d+)\s+(\w{3})[\w\,\.]*\s+(\d+)\sat\s(\d+):(\d+)\s+([-\d\.]+)\s+([-\d\.]+).*’
replace = r’\3\t\2.\t\1\t\4\t\5\t\6\t\7’
for line in open(’examples/Ch3observations.txt’):
newline = re.sub(find, replace, line)
print newline,
#Fitting the curve (Using 4 parameters, logistic equation). A plot will be generated
import numpy as np
import numpy.random as npr
import matplotlib.pyplot as plt
from scipy.optimize import leastsq
def logistic4(x, A, B, C, D):
"""4PL lgoistic equation."""
return ((A-D)/(1.0+((x/C)**B))) + D
def residuals(p, y, x):
"""Deviations of data from fitted 4PL curve"""
A,B,C,D = p
err = y-logistic4(x, A, B, C, D)
return err
def peval(x, p):
"""Evaluated value at x with current parameters."""
A,B,C,D = p
return logistic4(x, A, B, C, D)
# Make up some data for fitting and add noise
# In practice, y_meas would be read in from a file
x = np.linspace(0,20,20)
A,B,C,D = 0.5,2.5,8,7.3
y_true = logistic4(x, A, B, C, D)
y_meas = y_true + 0.2*npr.randn(len(x))
# Initial guess for parameters
p0 = [0, 1, 1, 1]
# Fit equation using least squares optimization
plsq = leastsq(residuals, p0, args=(y_meas, x))
# Plot results
plt.plot(x,peval(x,plsq[0]),x,y_meas,’o’,x,y_true)
plt.title(’Least-squares 4PL fit to noisy data’)
plt.legend([’Fit’, ’Noisy’, ’True’], loc=’upper left’)
for i, (param, actual, est) in enumerate(zip(’ABCD’, [A,B,C,D], plsq[0])):
plt.text(10, 3-i*0.5, ’%s = %.2f, est(%s) = %.2f’ % (param, actual, param, est))
plt.savefig(’logistic.png’)
#Simulation-based statistics (bootstrap and permuation resampling)
#Sampling without replacement
import numpy.random as npr
npr.random(5)
npr.random((3,4))
npr.normal(5, 1, 4)
npr.randint(1, 7, 10)
npr.uniform(1, 7, 10)
npr.binomial(n=10, p=0.2, size=(4,4))
x = [1,2,3,4,5,6]
npr.shuffle(x)
x
npr.permutation(10)
----------
#Sampling with replacement
import numpy as np
import numpy.random as npr
data = np.array([’tom’, ’jerry’, ’mickey’, ’minnie’, ’pocahontas’])
idx = npr.randint(0, len(data), (4,len(data)))
idx
samples_with_replacement = data[idx]
samples_with_replacement
#Bootstrapping ( higher order function)
import numpy as np
import numpy.random as npr
import pylab
def bootstrap(data, num_samples, statistic, alpha):
"""Returns bootstrap estimate of 100.0*(1-alpha) CI for statistic."""
n = len(data)
idx = npr.randint(0, n, (num_samples, n))
samples = x[idx]
stat = np.sort(statistic(samples, 1))
return (stat[int((alpha/2.0)*num_samples)],
stat[int((1-alpha/2.0)*num_samples)])
if __name__ == ’__main__’:
# data of interest is bimodal and obviously not normal
x = np.concatenate([npr.normal(3, 1, 100), npr.normal(6, 2, 200)])
# find mean 95% CI and 100,000 bootstrap samples
low, high = bootstrap(x, 100000, np.mean, 0.05)
# make plots
pylab.figure(figsize=(8,4))
pylab.subplot(121)
pylab.hist(x, 50, histtype=’step’)
pylab.title(’Historgram of data’)
pylab.subplot(122)
pylab.plot([-0.03,0.03], [np.mean(x), np.mean(x)], ’r’, linewidth=2)
pylab.scatter(0.1*(npr.random(len(x))-0.5), x)
pylab.plot([0.19,0.21], [low, low], ’r’, linewidth=2)
pylab.plot([0.19,0.21], [high, high], ’r’, linewidth=2)
pylab.plot([0.2,0.2], [low, high], ’r’, linewidth=2)
pylab.xlim([-0.2, 0.3])
pylab.title(’Bootstrap 95% CI for mean’)
pylab.savefig(’examples/boostrap.png’)
low, high = bootstrap(x, 100000, np.std, 0.05)
#Permutation sampling (to find p-value)
import numpy as np
import numpy.random as npr
import pylab
def permutation_resampling(case, control, num_samples, statistic):
"""Returns p-value that statistic for case is different
from statistc for control."""
observed_diff = abs(statistic(case) - statistic(control))
num_case = len(case)
combined = np.concatenate([case, control])
diffs = []
for i in range(num_samples):
xs = npr.permutation(combined)
diff = np.mean(xs[:num_case]) - np.mean(xs[num_case:])
diffs.append(diff)
pval = (np.sum(diffs > observed_diff) +
np.sum(diffs < -observed_diff))/float(num_samples)
return pval, observed_diff, diffs
if __name__ == ’__main__’:
# make up some data
case = [94, 38, 23, 197, 99, 16, 141]
control = [52, 10, 40, 104, 51, 27, 146, 30, 46]
# find p-value by permutation resampling
pval, observed_diff, diffs = \
permutation_resampling(case, control, 10000, np.mean)
# make plots
pylab.title(’Empirical null distribution for differences in mean’)
pylab.hist(diffs, bins=100, histtype=’step’, normed=True)
pylab.axvline(observed_diff, c=’red’, label=’diff’)
pylab.axvline(-observed_diff, c=’green’, label=’-diff’)
pylab.text(60, 0.01, ’p = %.3f’ % pval, fontsize=16)
pylab.legend()
pylab.savefig(’examples/permutation.png’)
#Data visualization (for exploratory data analysis)
import numpy as np
import pylab
xs = np.loadtxt(’anscombe.txt’)
for i in range(4):
x = xs[:,i*2]
y = xs[:,i*2+1]
A = np.vstack([x, np.ones(len(x))]).T
m, c = np.linalg.lstsq(A, y)[0]
pylab.subplot(2,2,i+1)
pylab.scatter(x, y)
pylab.plot(x, m*x+c, ’r’)
pylab.axis([2,20,0,14])
pylab.savefig(’anscombe.png’)
#Working with relational databases (connect, execute, iterate)
import sqlite3
con = sqlite3.connect(’pcfb.sqlite’)
r = con.execute(’select * from people’)
for i in r:
print i
r = con.execute(’select p.name, e.name from people as p join experiment as e where e.researcher == p.id
for i in r:
print ’Name: %s\n\tExperiment: %s’ % (i[0],i[1])
#source code available from git repository. Need to self-compile
#easy_install/pip installation
#To clone or copy a list in python
import copy
class Foo(object):
def __init__(self, val):
self.val = val
def __repr__(self):
return str(self.val)
foo = Foo(1)
a = ['foo', foo]
b = a[:]
c = list(a)
d = copy.copy(a)
e = copy.deepcopy(a)
# edit orignal list and instance
a.append('baz')
foo.val = 5
print('original: %r\n slice: %r\n list(): %r\n copy: %r\n deepcopy: %r'
% (a, b, c, d, e))
#Creating a list of lists (sublists changed)
myList = [[[1] * 4] for n in range(3)]
#lst1 = [1]*4; lst = [lst1]*3
print myList
myList[0][0] = 5
print myList
#check if a word is a palindrome
word = raw_input("Enter a word: ")
if word == word[::-1]:
print "%s is a palindrome!" % word
else:
print "%s is not palindrome" % word
#count the number of vowels in a word
name = raw_input("What’s your name? ")
num_vowels = 0
for vowel in ’aeiou’:
num_vowels += name.count(vowel)
print "Hello %s, there are %d vowels in your name." % (name, num_vowels)
#Substitution of regex
import re
find = r’(\d+)\s+(\w{3})[\w\,\.]*\s+(\d+)\sat\s(\d+):(\d+)\s+([-\d\.]+)\s+([-\d\.]+).*’
replace = r’\3\t\2.\t\1\t\4\t\5\t\6\t\7’
for line in open(’examples/Ch3observations.txt’):
newline = re.sub(find, replace, line)
print newline,
#Fitting the curve (Using 4 parameters, logistic equation). A plot will be generated
import numpy as np
import numpy.random as npr
import matplotlib.pyplot as plt
from scipy.optimize import leastsq
def logistic4(x, A, B, C, D):
"""4PL lgoistic equation."""
return ((A-D)/(1.0+((x/C)**B))) + D
def residuals(p, y, x):
"""Deviations of data from fitted 4PL curve"""
A,B,C,D = p
err = y-logistic4(x, A, B, C, D)
return err
def peval(x, p):
"""Evaluated value at x with current parameters."""
A,B,C,D = p
return logistic4(x, A, B, C, D)
# Make up some data for fitting and add noise
# In practice, y_meas would be read in from a file
x = np.linspace(0,20,20)
A,B,C,D = 0.5,2.5,8,7.3
y_true = logistic4(x, A, B, C, D)
y_meas = y_true + 0.2*npr.randn(len(x))
# Initial guess for parameters
p0 = [0, 1, 1, 1]
# Fit equation using least squares optimization
plsq = leastsq(residuals, p0, args=(y_meas, x))
# Plot results
plt.plot(x,peval(x,plsq[0]),x,y_meas,’o’,x,y_true)
plt.title(’Least-squares 4PL fit to noisy data’)
plt.legend([’Fit’, ’Noisy’, ’True’], loc=’upper left’)
for i, (param, actual, est) in enumerate(zip(’ABCD’, [A,B,C,D], plsq[0])):
plt.text(10, 3-i*0.5, ’%s = %.2f, est(%s) = %.2f’ % (param, actual, param, est))
plt.savefig(’logistic.png’)
#Simulation-based statistics (bootstrap and permuation resampling)
#Sampling without replacement
import numpy.random as npr
npr.random(5)
npr.random((3,4))
npr.normal(5, 1, 4)
npr.randint(1, 7, 10)
npr.uniform(1, 7, 10)
npr.binomial(n=10, p=0.2, size=(4,4))
x = [1,2,3,4,5,6]
npr.shuffle(x)
x
npr.permutation(10)
----------
#Sampling with replacement
import numpy as np
import numpy.random as npr
data = np.array([’tom’, ’jerry’, ’mickey’, ’minnie’, ’pocahontas’])
idx = npr.randint(0, len(data), (4,len(data)))
idx
samples_with_replacement = data[idx]
samples_with_replacement
#Bootstrapping ( higher order function)
import numpy as np
import numpy.random as npr
import pylab
def bootstrap(data, num_samples, statistic, alpha):
"""Returns bootstrap estimate of 100.0*(1-alpha) CI for statistic."""
n = len(data)
idx = npr.randint(0, n, (num_samples, n))
samples = x[idx]
stat = np.sort(statistic(samples, 1))
return (stat[int((alpha/2.0)*num_samples)],
stat[int((1-alpha/2.0)*num_samples)])
if __name__ == ’__main__’:
# data of interest is bimodal and obviously not normal
x = np.concatenate([npr.normal(3, 1, 100), npr.normal(6, 2, 200)])
# find mean 95% CI and 100,000 bootstrap samples
low, high = bootstrap(x, 100000, np.mean, 0.05)
# make plots
pylab.figure(figsize=(8,4))
pylab.subplot(121)
pylab.hist(x, 50, histtype=’step’)
pylab.title(’Historgram of data’)
pylab.subplot(122)
pylab.plot([-0.03,0.03], [np.mean(x), np.mean(x)], ’r’, linewidth=2)
pylab.scatter(0.1*(npr.random(len(x))-0.5), x)
pylab.plot([0.19,0.21], [low, low], ’r’, linewidth=2)
pylab.plot([0.19,0.21], [high, high], ’r’, linewidth=2)
pylab.plot([0.2,0.2], [low, high], ’r’, linewidth=2)
pylab.xlim([-0.2, 0.3])
pylab.title(’Bootstrap 95% CI for mean’)
pylab.savefig(’examples/boostrap.png’)
low, high = bootstrap(x, 100000, np.std, 0.05)
#Permutation sampling (to find p-value)
import numpy as np
import numpy.random as npr
import pylab
def permutation_resampling(case, control, num_samples, statistic):
"""Returns p-value that statistic for case is different
from statistc for control."""
observed_diff = abs(statistic(case) - statistic(control))
num_case = len(case)
combined = np.concatenate([case, control])
diffs = []
for i in range(num_samples):
xs = npr.permutation(combined)
diff = np.mean(xs[:num_case]) - np.mean(xs[num_case:])
diffs.append(diff)
pval = (np.sum(diffs > observed_diff) +
np.sum(diffs < -observed_diff))/float(num_samples)
return pval, observed_diff, diffs
if __name__ == ’__main__’:
# make up some data
case = [94, 38, 23, 197, 99, 16, 141]
control = [52, 10, 40, 104, 51, 27, 146, 30, 46]
# find p-value by permutation resampling
pval, observed_diff, diffs = \
permutation_resampling(case, control, 10000, np.mean)
# make plots
pylab.title(’Empirical null distribution for differences in mean’)
pylab.hist(diffs, bins=100, histtype=’step’, normed=True)
pylab.axvline(observed_diff, c=’red’, label=’diff’)
pylab.axvline(-observed_diff, c=’green’, label=’-diff’)
pylab.text(60, 0.01, ’p = %.3f’ % pval, fontsize=16)
pylab.legend()
pylab.savefig(’examples/permutation.png’)
#Data visualization (for exploratory data analysis)
import numpy as np
import pylab
xs = np.loadtxt(’anscombe.txt’)
for i in range(4):
x = xs[:,i*2]
y = xs[:,i*2+1]
A = np.vstack([x, np.ones(len(x))]).T
m, c = np.linalg.lstsq(A, y)[0]
pylab.subplot(2,2,i+1)
pylab.scatter(x, y)
pylab.plot(x, m*x+c, ’r’)
pylab.axis([2,20,0,14])
pylab.savefig(’anscombe.png’)
#Working with relational databases (connect, execute, iterate)
import sqlite3
con = sqlite3.connect(’pcfb.sqlite’)
r = con.execute(’select * from people’)
for i in r:
print i
r = con.execute(’select p.name, e.name from people as p join experiment as e where e.researcher == p.id
for i in r:
print ’Name: %s\n\tExperiment: %s’ % (i[0],i[1])
#source code available from git repository. Need to self-compile
#easy_install/pip installation
No comments:
Post a Comment