Extremes in selections from protein libraries

    Authors: S├ębastien Boyer, Olivier Rivoire

    Reference: S. Boyer, D. Biswas, A. K. Soshee, N. Scaramozzino, C. Nizak, O. Rivoire. Hierarchy and extremes in selections from pools of randomized proteins. PNAS 2016

    The following code is in the format of an IPython notebook. It assumes that the data has been processed and formatted into a file (here named S1_PVP.txt and containing the results of the selection of the S1 library against the PVP target as in Figures 3A and 4 of Boyer et al.) with 3 columns separated by tabs containing respectively, a sequence or a label of the sequence, an estimation of the selectivity of this sequence, and an estimation of the error in the value of the selectivity. One parameter, s_star, must be set by hand in cell [6] based on the plots obtained in cell [5].

    In [1]:
    %matplotlib inline
    import matplotlib.pyplot as plt
    import numpy as np
    import scipy
    import scipy.stats as ss
    
    import warnings
    warnings.filterwarnings('ignore')
    np.random.seed(1)
    
    In [2]:
    # Path to the input file:
    data_file = 'Data/S1_PVP.dat'
    
    # Reading the data into dictionaries for selectivities and errors:
    sel_dict, err_dict = dict(), dict()
    for line in open(data_file , 'r'):
        seq, sel, err = line.split('\t')
        sel_dict[seq] = float(sel)
        err_dict[seq] = float(err)
    

    Selectivity versus rank

    In [3]:
    # Sorting the data by decreasing values of selectivies:
    seq_sorted = sorted(sel_dict, key=lambda s: -sel_dict[s])
    sel_sorted = [sel_dict[s] for s in seq_sorted]  
    err_sorted = [err_dict[s] for s in seq_sorted]  
    
    plt.rcParams['figure.figsize'] = 5, 5; plt.rc('font', size=16)
    plt.loglog(range(1,len(sel_sorted)+1), sel_sorted,'or', lw = 2);
    plt.xlabel('rank', fontsize=20); plt.ylabel('selectivity', fontsize=20)
    plt.axis([0, 1.3*len(sel_sorted), sel_sorted[-1], 1.3*sel_sorted[0]]); plt.grid('on')
    
    In [4]:
    def log_likelihood_fct_exp(para, data_sorted):
        ''' Log-likelihood function for the exponential model '''
        tau, mu = para[0], min(data_sorted)
        return -sum([np.log(np.exp(-(x-mu)/tau)/tau) for x in data_sorted[:-1]])
    
    def log_likelihood_fct_evt(para, data_sorted):
        ''' Log-likelihood function for the general model'''
        kappa, tau, mu = float(para[0]), para[1], min(data_sorted)
        return -sum([np.log((1+(x-mu)*(kappa/tau))**(-((kappa+1))/kappa)/tau)\
                     for x in data_sorted[:-1]])
    
    def info_mat_exp(tau, data_sorted):
        ''' Fisher information matrix for the exponential model '''
        mu = min(data_sorted)
        data = [x-mu for x in data_sorted[:-1]]    
        return -1/sum([(tau-2*x)/tau**3 for x in data])
    
    def info_mat_evt(para, data_sorted):
        ''' Fisher information matrix for the general model '''
        matrix = np.zeros((2,2))
        kappa, tau, mu = para[0], para[1], min(data_sorted)
        data = [x-mu for x in data_sorted[:-1]]    
        matrix[0][0] = -sum([(-kappa*x**2+tau**2-2*tau*x)/(tau*(kappa*x+tau))**2 for x in data])
        matrix[0][1] = -sum([x*(tau-x)/(tau*(kappa*x+tau)**2) for x in data])
        matrix[1][0] = -sum([x*(tau-x)/(tau*(kappa*x+tau)**2) for x in data])
        matrix[1][1] = -sum([(kappa*x*(kappa*(kappa+3)*x +2*tau)\
            -2*(kappa*x+tau)**2*np.log(1+kappa*x/tau))/(kappa**3*(kappa*x+tau)**2) for x in data])
        return np.linalg.inv(matrix)
    
    def threshold_scan(sel_sorted, min_points, para=[1,0.001]):
        ''' Fit to the general model for different values of threshold 
        min_points sets the minimum number of points that are kept '''
        para_list, err_list = list(), list()
        for i in range(len(sel_sorted), min_points, -1):
            para = scipy.optimize.fmin(log_likelihood_fct_evt, para,\
                                       args=(sel_sorted[:i],), disp=False, maxiter=1000)
            para_list.append(para)
            err = info_mat_evt(para, sel_sorted[:i])    
            err_list.append([1.96*np.sqrt(err[1][1]), 1.96*np.sqrt(err[0][0])])
        return para_list, err_list
    
    In [5]:
    min_points = 50
    para_list, err_list = threshold_scan(sel_sorted, min_points)
    
    plt.rcParams['figure.figsize'] = 12, 5; plt.rc('font', size=12)
    plt.subplot(121)
    plt.errorbar(sel_sorted[min_points:][::-1],[p[0] for p in para_list],\
                 [e[0] for e in err_list], fmt='none', color='k')
    plt.plot(sel_sorted[min_points:][::-1],[p[0] for p in para_list],'g-', lw=3)
    plt.xlabel(r'threshold $s^*$',fontsize=20); plt.ylabel(r'estimated $\kappa$',fontsize=20)
    plt.subplot(122)
    plt.errorbar(sel_sorted[min_points:][::-1],[p[1] for p in para_list],\
                 [e[1] for e in err_list],fmt='none', color='k')
    plt.plot(sel_sorted[min_points:][::-1],[p[1] for p in para_list],'g-', lw=3)
    plt.xlabel(r'threshold $s^*$',fontsize=20); plt.ylabel(r'estimated $\tau$',fontsize=20)
    plt.tight_layout();
    
    In [6]:
    # Choice of the treshold based on previous plots:
    s_star = .0004
    

    Parameters of the best fit

    In [7]:
    # Truncation of the data given s_star:
    sel_trunc = [s for s in sel_sorted if s > s_star]
    mu = min(sel_trunc)
    N_samples = len(sel_trunc)
    
    # Fit to an exponential model (1 parameter):
    print 'Exponential fit:'
    para = scipy.optimize.fmin(log_likelihood_fct_exp,[.01],args=(sel_trunc,), maxiter=10000)
    tauexp = para[0]
    print 'tau_exp = %.5f\n' % tauexp
    
    # Fit to a general model (2 parameters):
    print 'General fit:'
    para = scipy.optimize.fmin(log_likelihood_fct_evt, [.5,.01], args=(sel_trunc,), maxiter=10000)
    kappa, tau = para[0], para[1]
    print 'kappa = %.3f, tau = %.5f' % (kappa, tau)
    
    Exponential fit:
    Optimization terminated successfully.
             Current function value: -1249.910350
             Iterations: 21
             Function evaluations: 42
    tau_exp = 0.00028
    
    General fit:
    Optimization terminated successfully.
             Current function value: -1267.475534
             Iterations: 49
             Function evaluations: 95
    kappa = 0.453, tau = 0.00016
    

    Quality of the fit

    In [8]:
    x_range = np.linspace(0, 1-1./N_samples, num=N_samples)
    sel_model = [((1-x)**(-kappa)-1)/kappa for x in x_range]
    sel_pp = [1-(1+kappa*(s-mu)/tau)**(-1/kappa) for s in sel_trunc[::-1]]
    
    plt.rc('font', size=14)
    plt.subplot(121); plt.title('Q-Q plot (general model)', fontsize=18); # Q-Q plot
    plt.plot([0,max(sel_model)],[mu,mu+tau*max(sel_model)],'k-.',linewidth=2);
    plt.errorbar(sel_model, sel_trunc[::-1], yerr=err_sorted[:N_samples][::-1],\
                 fmt='b.', markersize=15)
    plt.xlabel('model', fontsize=20); plt.ylabel('data', fontsize=20)
    plt.subplot(122); plt.title('P-P plot (general model)', fontsize=18); # P-P plot
    plt.plot([0,1],[0,1],'k--', lw=2)
    plt.plot(sel_pp, x_range ,'r',lw=3)
    plt.xlabel('model', fontsize=20); plt.ylabel('data', fontsize=20);
    
    In [9]:
    # As above but for the exponential model:
    plt.subplot(121); plt.title('Q-Q plot (exponential model)', fontsize=18); # Q-Q plot
    sel_model = [-np.log(1-x) for x in x_range]
    plt.plot([0,max(sel_model)],[mu,mu+tau*max(sel_model)],'k-.',linewidth=2);
    plt.errorbar(sel_model, sel_trunc[::-1], yerr=err_sorted[:N_samples][::-1],\
                 fmt='b.', markersize=15);
    plt.xlabel('model', fontsize=20); plt.ylabel('data', fontsize=20);
    plt.subplot(122); plt.title('P-P plot (exponential model)', fontsize=18); # P-P plot
    sel_pp = [1-np.exp(-(s-mu)/tauexp) for s in sel_trunc[::-1]]
    plt.plot([0,1],[0,1],'k--', lw=2)
    plt.plot(sel_pp, x_range ,'r',lw=3);
    plt.xlabel('model', fontsize=20); plt.ylabel('data', fontsize=20);
    

    Significance of $\kappa\neq 0$

    In [10]:
    def distr_LikeRatioTest(tauexp, N_samples, N_draws):
        ''' Null distribution for the likelihood ratio test '''
        distr = list()
        for n in range(N_draws):
            # draw samples from exp:
            sel_rand = np.random.exponential(tauexp, N_samples)
            # fit them by evt:
            para = scipy.optimize.fmin(log_likelihood_fct_evt, [.5,.01],\
                                       args=(sel_rand,), disp=False, maxiter=1000)
            distr.append(2*(log_likelihood_fct_exp([tauexp], sel_rand)\
                            - log_likelihood_fct_evt(para, sel_rand)))
        return distr
    
    In [11]:
    N_draws = 10000
    # null distribution:
    distr = distr_LikeRatioTest(tauexp, N_samples, N_draws)
    # experimental value:
    dL_sel = 2*(log_likelihood_fct_exp([tauexp], sel_trunc)\
                - log_likelihood_fct_evt([kappa,tau], sel_trunc))
    # p-value:
    p_val = sum([1 for dL in distr if dL > dL_sel])/(1.*len(distr))
    if p_val > 0:
        print 'p-value against the exponential model: %.e' % p_val
    else:
        print 'p-value against the exponential model: < %.e' % (1./N_draws)
    
    p-value against the exponential model: < 1e-04
    
    In [12]:
    plt.hist(distr, 100, label=r'$\kappa = 0$')
    plt.axvline(dL_sel, color='r', lw=3, label='data');
    plt.legend(bbox_to_anchor=(1.2,1.03));
    plt.xlabel('test statistics', fontsize=20); plt.ylabel('counts', fontsize=20);