Authors: Sébastien Boyer, Olivier Rivoire
Reference: S. Boyer, D. Biswas, A. K. Soshee, N. Scaramozzino, C. Nizak, O. Rivoire. Hierarchy and extremes in selections from pools of randomized proteins. PNAS 2016
The following code is in the format of an IPython notebook. It assumes that the data has been processed and formatted into a file (here named S1_PVP.txt and containing the results of the selection of the S1 library against the PVP target as in Figures 3A and 4 of Boyer et al.) with 3 columns separated by tabs containing respectively, a sequence or a label of the sequence, an estimation of the selectivity of this sequence, and an estimation of the error in the value of the selectivity. One parameter, s_star, must be set by hand in cell [6] based on the plots obtained in cell [5].
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import scipy
import scipy.stats as ss
import warnings
warnings.filterwarnings('ignore')
np.random.seed(1)
# Path to the input file:
data_file = 'Data/S1_PVP.dat'
# Reading the data into dictionaries for selectivities and errors:
sel_dict, err_dict = dict(), dict()
for line in open(data_file , 'r'):
seq, sel, err = line.split('\t')
sel_dict[seq] = float(sel)
err_dict[seq] = float(err)
# Sorting the data by decreasing values of selectivies:
seq_sorted = sorted(sel_dict, key=lambda s: -sel_dict[s])
sel_sorted = [sel_dict[s] for s in seq_sorted]
err_sorted = [err_dict[s] for s in seq_sorted]
plt.rcParams['figure.figsize'] = 5, 5; plt.rc('font', size=16)
plt.loglog(range(1,len(sel_sorted)+1), sel_sorted,'or', lw = 2);
plt.xlabel('rank', fontsize=20); plt.ylabel('selectivity', fontsize=20)
plt.axis([0, 1.3*len(sel_sorted), sel_sorted[-1], 1.3*sel_sorted[0]]); plt.grid('on')
def log_likelihood_fct_exp(para, data_sorted):
''' Log-likelihood function for the exponential model '''
tau, mu = para[0], min(data_sorted)
return -sum([np.log(np.exp(-(x-mu)/tau)/tau) for x in data_sorted[:-1]])
def log_likelihood_fct_evt(para, data_sorted):
''' Log-likelihood function for the general model'''
kappa, tau, mu = float(para[0]), para[1], min(data_sorted)
return -sum([np.log((1+(x-mu)*(kappa/tau))**(-((kappa+1))/kappa)/tau)\
for x in data_sorted[:-1]])
def info_mat_exp(tau, data_sorted):
''' Fisher information matrix for the exponential model '''
mu = min(data_sorted)
data = [x-mu for x in data_sorted[:-1]]
return -1/sum([(tau-2*x)/tau**3 for x in data])
def info_mat_evt(para, data_sorted):
''' Fisher information matrix for the general model '''
matrix = np.zeros((2,2))
kappa, tau, mu = para[0], para[1], min(data_sorted)
data = [x-mu for x in data_sorted[:-1]]
matrix[0][0] = -sum([(-kappa*x**2+tau**2-2*tau*x)/(tau*(kappa*x+tau))**2 for x in data])
matrix[0][1] = -sum([x*(tau-x)/(tau*(kappa*x+tau)**2) for x in data])
matrix[1][0] = -sum([x*(tau-x)/(tau*(kappa*x+tau)**2) for x in data])
matrix[1][1] = -sum([(kappa*x*(kappa*(kappa+3)*x +2*tau)\
-2*(kappa*x+tau)**2*np.log(1+kappa*x/tau))/(kappa**3*(kappa*x+tau)**2) for x in data])
return np.linalg.inv(matrix)
def threshold_scan(sel_sorted, min_points, para=[1,0.001]):
''' Fit to the general model for different values of threshold
min_points sets the minimum number of points that are kept '''
para_list, err_list = list(), list()
for i in range(len(sel_sorted), min_points, -1):
para = scipy.optimize.fmin(log_likelihood_fct_evt, para,\
args=(sel_sorted[:i],), disp=False, maxiter=1000)
para_list.append(para)
err = info_mat_evt(para, sel_sorted[:i])
err_list.append([1.96*np.sqrt(err[1][1]), 1.96*np.sqrt(err[0][0])])
return para_list, err_list
min_points = 50
para_list, err_list = threshold_scan(sel_sorted, min_points)
plt.rcParams['figure.figsize'] = 12, 5; plt.rc('font', size=12)
plt.subplot(121)
plt.errorbar(sel_sorted[min_points:][::-1],[p[0] for p in para_list],\
[e[0] for e in err_list], fmt='none', color='k')
plt.plot(sel_sorted[min_points:][::-1],[p[0] for p in para_list],'g-', lw=3)
plt.xlabel(r'threshold $s^*$',fontsize=20); plt.ylabel(r'estimated $\kappa$',fontsize=20)
plt.subplot(122)
plt.errorbar(sel_sorted[min_points:][::-1],[p[1] for p in para_list],\
[e[1] for e in err_list],fmt='none', color='k')
plt.plot(sel_sorted[min_points:][::-1],[p[1] for p in para_list],'g-', lw=3)
plt.xlabel(r'threshold $s^*$',fontsize=20); plt.ylabel(r'estimated $\tau$',fontsize=20)
plt.tight_layout();
# Choice of the treshold based on previous plots:
s_star = .0004
# Truncation of the data given s_star:
sel_trunc = [s for s in sel_sorted if s > s_star]
mu = min(sel_trunc)
N_samples = len(sel_trunc)
# Fit to an exponential model (1 parameter):
print 'Exponential fit:'
para = scipy.optimize.fmin(log_likelihood_fct_exp,[.01],args=(sel_trunc,), maxiter=10000)
tauexp = para[0]
print 'tau_exp = %.5f\n' % tauexp
# Fit to a general model (2 parameters):
print 'General fit:'
para = scipy.optimize.fmin(log_likelihood_fct_evt, [.5,.01], args=(sel_trunc,), maxiter=10000)
kappa, tau = para[0], para[1]
print 'kappa = %.3f, tau = %.5f' % (kappa, tau)
x_range = np.linspace(0, 1-1./N_samples, num=N_samples)
sel_model = [((1-x)**(-kappa)-1)/kappa for x in x_range]
sel_pp = [1-(1+kappa*(s-mu)/tau)**(-1/kappa) for s in sel_trunc[::-1]]
plt.rc('font', size=14)
plt.subplot(121); plt.title('Q-Q plot (general model)', fontsize=18); # Q-Q plot
plt.plot([0,max(sel_model)],[mu,mu+tau*max(sel_model)],'k-.',linewidth=2);
plt.errorbar(sel_model, sel_trunc[::-1], yerr=err_sorted[:N_samples][::-1],\
fmt='b.', markersize=15)
plt.xlabel('model', fontsize=20); plt.ylabel('data', fontsize=20)
plt.subplot(122); plt.title('P-P plot (general model)', fontsize=18); # P-P plot
plt.plot([0,1],[0,1],'k--', lw=2)
plt.plot(sel_pp, x_range ,'r',lw=3)
plt.xlabel('model', fontsize=20); plt.ylabel('data', fontsize=20);
# As above but for the exponential model:
plt.subplot(121); plt.title('Q-Q plot (exponential model)', fontsize=18); # Q-Q plot
sel_model = [-np.log(1-x) for x in x_range]
plt.plot([0,max(sel_model)],[mu,mu+tau*max(sel_model)],'k-.',linewidth=2);
plt.errorbar(sel_model, sel_trunc[::-1], yerr=err_sorted[:N_samples][::-1],\
fmt='b.', markersize=15);
plt.xlabel('model', fontsize=20); plt.ylabel('data', fontsize=20);
plt.subplot(122); plt.title('P-P plot (exponential model)', fontsize=18); # P-P plot
sel_pp = [1-np.exp(-(s-mu)/tauexp) for s in sel_trunc[::-1]]
plt.plot([0,1],[0,1],'k--', lw=2)
plt.plot(sel_pp, x_range ,'r',lw=3);
plt.xlabel('model', fontsize=20); plt.ylabel('data', fontsize=20);
def distr_LikeRatioTest(tauexp, N_samples, N_draws):
''' Null distribution for the likelihood ratio test '''
distr = list()
for n in range(N_draws):
# draw samples from exp:
sel_rand = np.random.exponential(tauexp, N_samples)
# fit them by evt:
para = scipy.optimize.fmin(log_likelihood_fct_evt, [.5,.01],\
args=(sel_rand,), disp=False, maxiter=1000)
distr.append(2*(log_likelihood_fct_exp([tauexp], sel_rand)\
- log_likelihood_fct_evt(para, sel_rand)))
return distr
N_draws = 10000
# null distribution:
distr = distr_LikeRatioTest(tauexp, N_samples, N_draws)
# experimental value:
dL_sel = 2*(log_likelihood_fct_exp([tauexp], sel_trunc)\
- log_likelihood_fct_evt([kappa,tau], sel_trunc))
# p-value:
p_val = sum([1 for dL in distr if dL > dL_sel])/(1.*len(distr))
if p_val > 0:
print 'p-value against the exponential model: %.e' % p_val
else:
print 'p-value against the exponential model: < %.e' % (1./N_draws)
plt.hist(distr, 100, label=r'$\kappa = 0$')
plt.axvline(dL_sel, color='r', lw=3, label='data');
plt.legend(bbox_to_anchor=(1.2,1.03));
plt.xlabel('test statistics', fontsize=20); plt.ylabel('counts', fontsize=20);