Extremes in selections from protein libraries

Authors: Sébastien Boyer, Olivier Rivoire

Reference: S. Boyer, D. Biswas, A. K. Soshee, N. Scaramozzino, C. Nizak, O. Rivoire. Hierarchy and extremes in selections from pools of randomized proteins. PNAS 2016

The following code is in the format of an IPython notebook. It assumes that the data have been processed and formatted into a file (here named S1_PVP.txt, containing the results of the selection of the S1 library against the PVP target, as in Figures 3A and 4 of Boyer et al.) with 3 tab-separated columns containing, respectively, a sequence (or a label for the sequence), an estimate of the selectivity of this sequence, and an estimate of the error on this selectivity. One parameter, s_star, must be set by hand in cell [6] based on the plots obtained in cell [5].
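
For readers who want to run the notebook without the original data, the snippet below writes a small synthetic file with the same layout (three tab-separated columns: a sequence label, a selectivity, an error estimate). The file name, labels, and numerical values are invented for illustration only and are not taken from the actual S1/PVP selection.

# Illustration only: write a toy input file with the same 3-column,
# tab-separated layout as S1_PVP.txt (label, selectivity, error).
# All labels and numbers below are synthetic.
import numpy as np
np.random.seed(0)
toy_sel = np.random.exponential(3e-4, 200)   # synthetic selectivities
toy_err = 0.1*toy_sel                        # synthetic error estimates
with open('toy_selection.txt', 'w') as f:
    for i, (sel, err) in enumerate(zip(toy_sel, toy_err)):
        f.write('seq_%04d\t%.6e\t%.6e\n' % (i, sel, err))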

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import scipy.optimize   # provides scipy.optimize.fmin, used for the maximum-likelihood fits
import scipy.stats as ss

import warnings
warnings.filterwarnings('ignore')
np.random.seed(1)
In [2]:
# Path to the input file:
data_file = 'S1_PVP.txt'

# Reading the data into dictionaries for selectivities and errors:
sel_dict, err_dict = dict(), dict()
for line in open(data_file, 'r'):
    seq, sel, err = line.split('\t')
    sel_dict[seq] = float(sel)
    err_dict[seq] = float(err)

Selectivity versus rank

In [3]:
# Sorting the data by decreasing values of selectivities:
seq_sorted = sorted(sel_dict, key=lambda s: -sel_dict[s])
sel_sorted = [sel_dict[s] for s in seq_sorted]  
err_sorted = [err_dict[s] for s in seq_sorted]  

plt.rcParams['figure.figsize'] = 5, 5; plt.rc('font', size=16)
plt.loglog(range(1, len(sel_sorted)+1), sel_sorted, 'or', lw=2);  # ranks start at 1 on a log scale
plt.xlabel('rank', fontsize=20); plt.ylabel('selectivity', fontsize=20)
plt.axis([0.8, 1.3*len(sel_sorted), sel_sorted[-1], 1.3*sel_sorted[0]]); plt.grid(True)
In [4]:
def log_likelihood_fct_exp(para, data_sorted):
    ''' Negative log-likelihood for the exponential model
    (the smallest value sets mu and is excluded from the sum) '''
    tau, mu = para[0], min(data_sorted)
    return -sum([np.log(np.exp(-(x-mu)/tau)/tau) for x in data_sorted[:-1]])

def log_likelihood_fct_evt(para, data_sorted):
    ''' Negative log-likelihood for the general (generalized Pareto) model '''
    kappa, tau, mu = float(para[0]), para[1], min(data_sorted)
    return -sum([np.log((1+(x-mu)*(kappa/tau))**(-(kappa+1)/kappa)/tau)\
                 for x in data_sorted[:-1]])

def info_mat_exp(tau, data_sorted):
    ''' Inverse observed Fisher information for the exponential model
    (an estimate of the variance of tau) '''
    mu = min(data_sorted)
    data = [x-mu for x in data_sorted[:-1]]
    return -1/sum([(tau-2*x)/tau**3 for x in data])

def info_mat_evt(para, data_sorted):
    ''' Inverse of the observed Fisher information for the general model;
    entry [0][0] estimates the variance of tau, entry [1][1] that of kappa '''
    matrix = np.zeros((2,2))
    kappa, tau, mu = para[0], para[1], min(data_sorted)
    data = [x-mu for x in data_sorted[:-1]]    
    matrix[0][0] = -sum([(-kappa*x**2+tau**2-2*tau*x)/(tau*(kappa*x+tau))**2 for x in data])
    matrix[0][1] = -sum([x*(tau-x)/(tau*(kappa*x+tau)**2) for x in data])
    matrix[1][0] = -sum([x*(tau-x)/(tau*(kappa*x+tau)**2) for x in data])
    matrix[1][1] = -sum([(kappa*x*(kappa*(kappa+3)*x +2*tau)\
        -2*(kappa*x+tau)**2*np.log(1+kappa*x/tau))/(kappa**3*(kappa*x+tau)**2) for x in data])
    return np.linalg.inv(matrix)

def threshold_scan(sel_sorted, min_points, para=[1,0.001]):
    ''' Fit the general model for decreasing values of the threshold;
    min_points sets the minimum number of data points that are kept '''
    para_list, err_list = list(), list()
    for i in range(len(sel_sorted), min_points, -1):
        para = scipy.optimize.fmin(log_likelihood_fct_evt, para,\
                                   args=(sel_sorted[:i],), disp=False, maxiter=1000)
        para_list.append(para)
        err = info_mat_evt(para, sel_sorted[:i])    
        err_list.append([1.96*np.sqrt(err[1][1]), 1.96*np.sqrt(err[0][0])])
    return para_list, err_list
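
Written out explicitly, the densities implemented by the two likelihood functions above (with the location $\mu$ fixed to the smallest retained selectivity) are
$$p_{\rm exp}(s) = \frac{1}{\tau}\, e^{-(s-\mu)/\tau}, \qquad p_{\rm gen}(s) = \frac{1}{\tau}\left(1+\kappa\,\frac{s-\mu}{\tau}\right)^{-\frac{\kappa+1}{\kappa}},$$
with the exponential model recovered from the general (generalized Pareto) model in the limit $\kappa\to 0$.
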
In [5]:
min_points = 50
para_list, err_list = threshold_scan(sel_sorted, min_points)

plt.rcParams['figure.figsize'] = 12, 5; plt.rc('font', size=12)
plt.subplot(121)
plt.errorbar(sel_sorted[min_points:][::-1],[p[0] for p in para_list],\
             [e[0] for e in err_list],fmt='k.',linewidth=3)
plt.plot(sel_sorted[min_points:][::-1],[p[0] for p in para_list],'go',markersize=8)
plt.xlabel(r'threshold $s^*$',fontsize=20); plt.ylabel(r'estimated $\kappa$',fontsize=20)
plt.subplot(122)
plt.errorbar(sel_sorted[min_points:][::-1],[p[1] for p in para_list],\
             [e[1] for e in err_list],fmt='k.',linewidth=3)
plt.plot(sel_sorted[min_points:][::-1],[p[1] for p in para_list],'go',markersize=8)
plt.xlabel(r'threshold $s^*$',fontsize=20); plt.ylabel(r'estimated $\tau$',fontsize=20)
plt.tight_layout();
In [6]:
# Choice of the threshold based on the previous plots:
s_star = .0004

Parameters of the best fit

In [7]:
# Truncation of the data given s_star:
sel_trunc = [s for s in sel_sorted if s > s_star]
mu = min(sel_trunc)
N_samples = len(sel_trunc)

# Fit to the exponential model (1 parameter):
print('Exponential fit:')
para = scipy.optimize.fmin(log_likelihood_fct_exp, [.01], args=(sel_trunc,), maxiter=10000)
tauexp = para[0]
print('tau_exp = %.5f\n' % tauexp)

# Fit to the general model (2 parameters):
print('General fit:')
para = scipy.optimize.fmin(log_likelihood_fct_evt, [.5,.01], args=(sel_trunc,), maxiter=10000)
kappa, tau = para[0], para[1]
print('kappa = %.3f, tau = %.5f' % (kappa, tau))
Exponential fit:
Optimization terminated successfully.
         Current function value: -1249.910350
         Iterations: 21
         Function evaluations: 42
tau_exp = 0.00028

General fit:
Optimization terminated successfully.
         Current function value: -1267.475534
         Iterations: 49
         Function evaluations: 95
kappa = 0.453, tau = 0.00016
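
As a complement (not part of the original notebook), the covariance estimate returned by info_mat_evt can be used to attach approximate 95% confidence intervals to the fitted parameters, following the same index convention as in threshold_scan above:

# Approximate 95% confidence intervals from the inverse observed Fisher
# information, using the same index convention as in threshold_scan:
cov = info_mat_evt([kappa, tau], sel_trunc)
kappa_err = 1.96*np.sqrt(cov[1][1])   # entry [1][1] estimates var(kappa)
tau_err = 1.96*np.sqrt(cov[0][0])     # entry [0][0] estimates var(tau)
print('kappa = %.3f +/- %.3f' % (kappa, kappa_err))
print('tau   = %.5f +/- %.5f' % (tau, tau_err))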

Quality of the fit

In [8]:
x_range = np.linspace(0, 1-1./N_samples, num=N_samples)
sel_model = [((1-x)**(-kappa)-1)/kappa for x in x_range]
sel_pp = [1-(1+kappa*(s-mu)/tau)**(-1/kappa) for s in sel_trunc[::-1]]

plt.rc('font', size=14)
plt.subplot(121); plt.title('Q-Q plot (general model)', fontsize=18); # Q-Q plot
plt.plot([0,max(sel_model)],[mu,mu+tau*max(sel_model)],'k-.',linewidth=2);
plt.errorbar(sel_model, sel_trunc[::-1], yerr=err_sorted[:N_samples][::-1],\
             fmt='b.', markersize=15)
plt.xlabel('model', fontsize=20); plt.ylabel('data', fontsize=20)
plt.subplot(122); plt.title('P-P plot (general model)', fontsize=18); # P-P plot
plt.plot([0,1],[0,1],'k--', lw=2)
plt.plot(sel_pp, x_range ,'r',lw=3)
plt.xlabel('model', fontsize=20); plt.ylabel('data', fontsize=20);
In [9]:
# As above but for the exponential model:
plt.subplot(121); plt.title('Q-Q plot (exponential model)', fontsize=18); # Q-Q plot
sel_model = [-np.log(1-x) for x in x_range]
plt.plot([0,max(sel_model)],[mu,mu+tauexp*max(sel_model)],'k-.',linewidth=2);
plt.errorbar(sel_model, sel_trunc[::-1], yerr=err_sorted[:N_samples][::-1],\
             fmt='b.', markersize=15);
plt.xlabel('model', fontsize=20); plt.ylabel('data', fontsize=20);
plt.subplot(122); plt.title('P-P plot (exponential model)', fontsize=18); # P-P plot
sel_pp = [1-np.exp(-(s-mu)/tauexp) for s in sel_trunc[::-1]]
plt.plot([0,1],[0,1],'k--', lw=2)
plt.plot(sel_pp, x_range ,'r',lw=3);
plt.xlabel('model', fontsize=20); plt.ylabel('data', fontsize=20);

Significance of $\kappa\neq 0$
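
The quantity computed below is the likelihood-ratio statistic comparing the two nested models,
$$\Delta L = 2\left(\log \hat L_{\rm gen} - \log \hat L_{\rm exp}\right),$$
whose null distribution under the exponential model ($\kappa = 0$) is estimated by refitting the general model to samples drawn from the fitted exponential. Since the two log_likelihood functions return negative log-likelihoods, the difference in the code is taken in the opposite order.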

In [10]:
def distr_LikeRatioTest(tauexp, N_samples, N_draws):
    ''' Null distribution for the likelihood ratio test '''
    distr = list()
    for n in range(N_draws):
        # draw a sorted sample from the fitted exponential model
        # (the likelihood functions expect data sorted in decreasing order):
        sel_rand = sorted(np.random.exponential(tauexp, N_samples), reverse=True)
        # fit them by evt:
        para = scipy.optimize.fmin(log_likelihood_fct_evt, [.5,.01],\
                                   args=(sel_rand,), disp=False, maxiter=1000)
        distr.append(2*(log_likelihood_fct_exp([tauexp], sel_rand)\
                        - log_likelihood_fct_evt(para, sel_rand)))
    return distr
In [11]:
# null distribution:
N_draws = 10000
distr = distr_LikeRatioTest(tauexp, N_samples, N_draws)
# experimental value:
dL_sel = 2*(log_likelihood_fct_exp([tauexp], sel_trunc)\
            - log_likelihood_fct_evt([kappa,tau], sel_trunc))
# p-value:
p_val = sum([1 for dL in distr if dL > dL_sel])/(1.*len(distr))
if p_val > 0:
    print('p-value against the exponential model: %.e' % p_val)
else:
    print('p-value against the exponential model: < %.e' % (1./N_draws))
p-value against the exponential model: < 1e-04
In [12]:
plt.hist(distr, 100, label=r'$\kappa = 0$')
plt.axvline(dL_sel, color='r', lw=3, label='data');
plt.legend(bbox_to_anchor=(1.2,1.03));
plt.xlabel('test statistic', fontsize=20); plt.ylabel('counts', fontsize=20);