Practical Computing for Biologists by Cliburn Chan

#To clone or copy a list in python

import copy

class Foo(object):

def __init__(self, val):

self.val = val

def __repr__(self):

return str(self.val)

foo = Foo(1)

a = ['foo', foo]

b = a[:]

c = list(a)

d = copy.copy(a)

e = copy.deepcopy(a)

# edit orignal list and instance

a.append('baz')

foo.val = 5

print('original: %r\n slice: %r\n list(): %r\n copy: %r\n deepcopy: %r'

% (a, b, c, d, e))

#Creating a list of lists (sublists changed)

myList = [[[1] * 4] for n in range(3)]

#lst1 = [1]*4; lst = [lst1]*3

print myList

myList[0][0] = 5

print myList

#check if a word is a palindrome

word = raw_input("Enter a word: ")

if word == word[::-1]:

print "%s is a palindrome!" % word

else:

print "%s is not palindrome" % word

#count the number of vowels in a word

name = raw_input("What’s your name? ")

num_vowels = 0

for vowel in ’aeiou’:

num_vowels += name.count(vowel)

print "Hello %s, there are %d vowels in your name." % (name, num_vowels)

#Substitution of regex

import re

find = r’(\d+)\s+(\w{3})[\w\,\.]*\s+(\d+)\sat\s(\d+):(\d+)\s+([-\d\.]+)\s+([-\d\.]+).*’

replace = r’\3\t\2.\t\1\t\4\t\5\t\6\t\7’

for line in open(’examples/Ch3observations.txt’):

newline = re.sub(find, replace, line)

print newline,

#Fitting the curve (Using 4 parameters, logistic equation). A plot will be generated

import numpy as np

import numpy.random as npr

import matplotlib.pyplot as plt

from scipy.optimize import leastsq

def logistic4(x, A, B, C, D):

"""4PL lgoistic equation."""

return ((A-D)/(1.0+((x/C)**B))) + D

def residuals(p, y, x):

"""Deviations of data from fitted 4PL curve"""

A,B,C,D = p

err = y-logistic4(x, A, B, C, D)

return err

def peval(x, p):

"""Evaluated value at x with current parameters."""

A,B,C,D = p

return logistic4(x, A, B, C, D)

# Make up some data for fitting and add noise

# In practice, y_meas would be read in from a file

x = np.linspace(0,20,20)

A,B,C,D = 0.5,2.5,8,7.3

y_true = logistic4(x, A, B, C, D)

y_meas = y_true + 0.2*npr.randn(len(x))

# Initial guess for parameters

p0 = [0, 1, 1, 1]

# Fit equation using least squares optimization

plsq = leastsq(residuals, p0, args=(y_meas, x))

# Plot results

plt.plot(x,peval(x,plsq[0]),x,y_meas,’o’,x,y_true)

plt.title(’Least-squares 4PL fit to noisy data’)

plt.legend([’Fit’, ’Noisy’, ’True’], loc=’upper left’)

for i, (param, actual, est) in enumerate(zip(’ABCD’, [A,B,C,D], plsq[0])):

plt.text(10, 3-i*0.5, ’%s = %.2f, est(%s) = %.2f’ % (param, actual, param, est))

plt.savefig(’logistic.png’)

#Simulation-based statistics (bootstrap and permuation resampling)

#Sampling without replacement

import numpy.random as npr

npr.random(5)

npr.random((3,4))

npr.normal(5, 1, 4)

npr.randint(1, 7, 10)

npr.uniform(1, 7, 10)

npr.binomial(n=10, p=0.2, size=(4,4))

x = [1,2,3,4,5,6]

npr.shuffle(x)

x

npr.permutation(10)

----------

#Sampling with replacement

import numpy as np

import numpy.random as npr

data = np.array([’tom’, ’jerry’, ’mickey’, ’minnie’, ’pocahontas’])

idx = npr.randint(0, len(data), (4,len(data)))

idx

samples_with_replacement = data[idx]

samples_with_replacement

#Bootstrapping ( higher order function)

import numpy as np

import numpy.random as npr

import pylab

def bootstrap(data, num_samples, statistic, alpha):

"""Returns bootstrap estimate of 100.0*(1-alpha) CI for statistic."""

n = len(data)

idx = npr.randint(0, n, (num_samples, n))

samples = x[idx]

stat = np.sort(statistic(samples, 1))

return (stat[int((alpha/2.0)*num_samples)],

stat[int((1-alpha/2.0)*num_samples)])

if __name__ == ’__main__’:

# data of interest is bimodal and obviously not normal

x = np.concatenate([npr.normal(3, 1, 100), npr.normal(6, 2, 200)])

# find mean 95% CI and 100,000 bootstrap samples

low, high = bootstrap(x, 100000, np.mean, 0.05)

# make plots

pylab.figure(figsize=(8,4))

pylab.subplot(121)

pylab.hist(x, 50, histtype=’step’)

pylab.title(’Historgram of data’)

pylab.subplot(122)

pylab.plot([-0.03,0.03], [np.mean(x), np.mean(x)], ’r’, linewidth=2)

pylab.scatter(0.1*(npr.random(len(x))-0.5), x)

pylab.plot([0.19,0.21], [low, low], ’r’, linewidth=2)

pylab.plot([0.19,0.21], [high, high], ’r’, linewidth=2)

pylab.plot([0.2,0.2], [low, high], ’r’, linewidth=2)

pylab.xlim([-0.2, 0.3])

pylab.title(’Bootstrap 95% CI for mean’)

pylab.savefig(’examples/boostrap.png’)

low, high = bootstrap(x, 100000, np.std, 0.05)

#Permutation sampling (to find p-value)

import numpy as np

import numpy.random as npr

import pylab

def permutation_resampling(case, control, num_samples, statistic):

"""Returns p-value that statistic for case is different

from statistc for control."""

observed_diff = abs(statistic(case) - statistic(control))

num_case = len(case)

combined = np.concatenate([case, control])

diffs = []

for i in range(num_samples):

xs = npr.permutation(combined)

diff = np.mean(xs[:num_case]) - np.mean(xs[num_case:])

diffs.append(diff)

pval = (np.sum(diffs > observed_diff) +

np.sum(diffs < -observed_diff))/float(num_samples)

return pval, observed_diff, diffs

if __name__ == ’__main__’:

# make up some data

case = [94, 38, 23, 197, 99, 16, 141]

control = [52, 10, 40, 104, 51, 27, 146, 30, 46]

# find p-value by permutation resampling

pval, observed_diff, diffs = \

permutation_resampling(case, control, 10000, np.mean)

# make plots

pylab.title(’Empirical null distribution for differences in mean’)

pylab.hist(diffs, bins=100, histtype=’step’, normed=True)

pylab.axvline(observed_diff, c=’red’, label=’diff’)

pylab.axvline(-observed_diff, c=’green’, label=’-diff’)

pylab.text(60, 0.01, ’p = %.3f’ % pval, fontsize=16)

pylab.legend()

pylab.savefig(’examples/permutation.png’)

#Data visualization (for exploratory data analysis)

import numpy as np

import pylab

xs = np.loadtxt(’anscombe.txt’)

for i in range(4):

x = xs[:,i*2]

y = xs[:,i*2+1]

A = np.vstack([x, np.ones(len(x))]).T

m, c = np.linalg.lstsq(A, y)[0]

pylab.subplot(2,2,i+1)

pylab.scatter(x, y)

pylab.plot(x, m*x+c, ’r’)

pylab.axis([2,20,0,14])

pylab.savefig(’anscombe.png’)

#Working with relational databases (connect, execute, iterate)

import sqlite3

con = sqlite3.connect(’pcfb.sqlite’)

r = con.execute(’select * from people’)

for i in r:

print i

r = con.execute(’select p.name, e.name from people as p join experiment as e where e.researcher == p.id

for i in r:

print ’Name: %s\n\tExperiment: %s’ % (i[0],i[1])

#source code available from git repository. Need to self-compile

#easy_install/pip installation

#To clone or copy a list in python

import copy

class Foo(object):

def __init__(self, val):

self.val = val

def __repr__(self):

return str(self.val)

foo = Foo(1)

a = ['foo', foo]

b = a[:]

c = list(a)

d = copy.copy(a)

e = copy.deepcopy(a)

# edit orignal list and instance

a.append('baz')

foo.val = 5

print('original: %r\n slice: %r\n list(): %r\n copy: %r\n deepcopy: %r'

% (a, b, c, d, e))

#Creating a list of lists (sublists changed)

myList = [[[1] * 4] for n in range(3)]

#lst1 = [1]*4; lst = [lst1]*3

print myList

myList[0][0] = 5

print myList

#check if a word is a palindrome

word = raw_input("Enter a word: ")

if word == word[::-1]:

print "%s is a palindrome!" % word

else:

print "%s is not palindrome" % word

#count the number of vowels in a word

name = raw_input("What’s your name? ")

num_vowels = 0

for vowel in ’aeiou’:

num_vowels += name.count(vowel)

print "Hello %s, there are %d vowels in your name." % (name, num_vowels)

#Substitution of regex

import re

find = r’(\d+)\s+(\w{3})[\w\,\.]*\s+(\d+)\sat\s(\d+):(\d+)\s+([-\d\.]+)\s+([-\d\.]+).*’

replace = r’\3\t\2.\t\1\t\4\t\5\t\6\t\7’

for line in open(’examples/Ch3observations.txt’):

newline = re.sub(find, replace, line)

print newline,

#Fitting the curve (Using 4 parameters, logistic equation). A plot will be generated

import numpy as np

import numpy.random as npr

import matplotlib.pyplot as plt

from scipy.optimize import leastsq

def logistic4(x, A, B, C, D):

"""4PL lgoistic equation."""

return ((A-D)/(1.0+((x/C)**B))) + D

def residuals(p, y, x):

"""Deviations of data from fitted 4PL curve"""

A,B,C,D = p

err = y-logistic4(x, A, B, C, D)

return err

def peval(x, p):

"""Evaluated value at x with current parameters."""

A,B,C,D = p

return logistic4(x, A, B, C, D)

# Make up some data for fitting and add noise

# In practice, y_meas would be read in from a file

x = np.linspace(0,20,20)

A,B,C,D = 0.5,2.5,8,7.3

y_true = logistic4(x, A, B, C, D)

y_meas = y_true + 0.2*npr.randn(len(x))

# Initial guess for parameters

p0 = [0, 1, 1, 1]

# Fit equation using least squares optimization

plsq = leastsq(residuals, p0, args=(y_meas, x))

# Plot results

plt.plot(x,peval(x,plsq[0]),x,y_meas,’o’,x,y_true)

plt.title(’Least-squares 4PL fit to noisy data’)

plt.legend([’Fit’, ’Noisy’, ’True’], loc=’upper left’)

for i, (param, actual, est) in enumerate(zip(’ABCD’, [A,B,C,D], plsq[0])):

plt.text(10, 3-i*0.5, ’%s = %.2f, est(%s) = %.2f’ % (param, actual, param, est))

plt.savefig(’logistic.png’)

#Simulation-based statistics (bootstrap and permuation resampling)

#Sampling without replacement

import numpy.random as npr

npr.random(5)

npr.random((3,4))

npr.normal(5, 1, 4)

npr.randint(1, 7, 10)

npr.uniform(1, 7, 10)

npr.binomial(n=10, p=0.2, size=(4,4))

x = [1,2,3,4,5,6]

npr.shuffle(x)

x

npr.permutation(10)

----------

#Sampling with replacement

import numpy as np

import numpy.random as npr

data = np.array([’tom’, ’jerry’, ’mickey’, ’minnie’, ’pocahontas’])

idx = npr.randint(0, len(data), (4,len(data)))

idx

samples_with_replacement = data[idx]

samples_with_replacement

#Bootstrapping ( higher order function)

import numpy as np

import numpy.random as npr

import pylab

def bootstrap(data, num_samples, statistic, alpha):

"""Returns bootstrap estimate of 100.0*(1-alpha) CI for statistic."""

n = len(data)

idx = npr.randint(0, n, (num_samples, n))

samples = x[idx]

stat = np.sort(statistic(samples, 1))

return (stat[int((alpha/2.0)*num_samples)],

stat[int((1-alpha/2.0)*num_samples)])

if __name__ == ’__main__’:

# data of interest is bimodal and obviously not normal

x = np.concatenate([npr.normal(3, 1, 100), npr.normal(6, 2, 200)])

# find mean 95% CI and 100,000 bootstrap samples

low, high = bootstrap(x, 100000, np.mean, 0.05)

# make plots

pylab.figure(figsize=(8,4))

pylab.subplot(121)

pylab.hist(x, 50, histtype=’step’)

pylab.title(’Historgram of data’)

pylab.subplot(122)

pylab.plot([-0.03,0.03], [np.mean(x), np.mean(x)], ’r’, linewidth=2)

pylab.scatter(0.1*(npr.random(len(x))-0.5), x)

pylab.plot([0.19,0.21], [low, low], ’r’, linewidth=2)

pylab.plot([0.19,0.21], [high, high], ’r’, linewidth=2)

pylab.plot([0.2,0.2], [low, high], ’r’, linewidth=2)

pylab.xlim([-0.2, 0.3])

pylab.title(’Bootstrap 95% CI for mean’)

pylab.savefig(’examples/boostrap.png’)

low, high = bootstrap(x, 100000, np.std, 0.05)

#Permutation sampling (to find p-value)

import numpy as np

import numpy.random as npr

import pylab

def permutation_resampling(case, control, num_samples, statistic):

"""Returns p-value that statistic for case is different

from statistc for control."""

observed_diff = abs(statistic(case) - statistic(control))

num_case = len(case)

combined = np.concatenate([case, control])

diffs = []

for i in range(num_samples):

xs = npr.permutation(combined)

diff = np.mean(xs[:num_case]) - np.mean(xs[num_case:])

diffs.append(diff)

pval = (np.sum(diffs > observed_diff) +

np.sum(diffs < -observed_diff))/float(num_samples)

return pval, observed_diff, diffs

if __name__ == ’__main__’:

# make up some data

case = [94, 38, 23, 197, 99, 16, 141]

control = [52, 10, 40, 104, 51, 27, 146, 30, 46]

# find p-value by permutation resampling

pval, observed_diff, diffs = \

permutation_resampling(case, control, 10000, np.mean)

# make plots

pylab.title(’Empirical null distribution for differences in mean’)

pylab.hist(diffs, bins=100, histtype=’step’, normed=True)

pylab.axvline(observed_diff, c=’red’, label=’diff’)

pylab.axvline(-observed_diff, c=’green’, label=’-diff’)

pylab.text(60, 0.01, ’p = %.3f’ % pval, fontsize=16)

pylab.legend()

pylab.savefig(’examples/permutation.png’)

#Data visualization (for exploratory data analysis)

import numpy as np

import pylab

xs = np.loadtxt(’anscombe.txt’)

for i in range(4):

x = xs[:,i*2]

y = xs[:,i*2+1]

A = np.vstack([x, np.ones(len(x))]).T

m, c = np.linalg.lstsq(A, y)[0]

pylab.subplot(2,2,i+1)

pylab.scatter(x, y)

pylab.plot(x, m*x+c, ’r’)

pylab.axis([2,20,0,14])

pylab.savefig(’anscombe.png’)

#Working with relational databases (connect, execute, iterate)

import sqlite3

con = sqlite3.connect(’pcfb.sqlite’)

r = con.execute(’select * from people’)

for i in r:

print i

r = con.execute(’select p.name, e.name from people as p join experiment as e where e.researcher == p.id

for i in r:

print ’Name: %s\n\tExperiment: %s’ % (i[0],i[1])

#source code available from git repository. Need to self-compile

#easy_install/pip installation