# determines the frequency of peptide # uses local_defect output from blob script and truDiff coordinates output # Written by Ky Wildermuth, 11/16. Feel free to edit. Feel free to ask questions: kywildermuth@gmail.com import pandas as pd import numpy as np import glob import re #### USER INPUT PARAMETERS ##### # define molecular interaction cutoff in Angstroms (choose large max_cutoff (ex 100) and small min_cutoff (ex -100) if don't want) max_cutoff = 20 min_cutoff = 0 # define the frame at which the analysis ends (typically before binding is initiated) prebind_limit = 1800 # define the frame at which analysis begins (can set to 0 but allows for more control of range) start_limit = 0 # define the number of dcd num_dcd = 1 #define the number of frames per dcd num_frame = 4000 # define the defect area (d) and movement (m) threshold ranges and intervals. d is measured in Ang^2 and m in Ang. d_thresh_arr = range(0,301) m_thresh_arr = np.arange(0,5,0.1) ###################### m_thresh_length = len(m_thresh_arr) # compute and define the theoretical total number of data points per coor file (will be used to check if files have same number of data entries) theo_length = num_dcd*num_frame # load the data files dist_path = "binding_distances.dat" freq_path = "local_defect_top.dat" # define number of rows, numrows numrows = prebind_limit - start_limit # open data files for line check dist = open(dist_path) freq = open(freq_path) files = [freq, dist] # define a natural sort function type for numerical file combination def natural_key(string_): return [int(s) if s.isdigit() else s for s in re.split(r'(\d+)', string_)] #check to make sure data files have the same number of lines def file_len(fname): with fname as f: for i, l in enumerate(f): pass return i + 1 for fname in files: length = file_len(fname) print fname,",", length yes = set(['yes','y', 'ye', '']) no = set(['no','n']) print 'Confirm the coordinate files all have a length of', theo_length, '[y/n]' choice = raw_input().lower() if choice in yes: print "confirmed" elif choice in no: print('Check input files and try again') exit(1) # loop through movement threshold array m_count = start_limit for m_thresh in m_thresh_arr: m_count += 1 print('%d of %d' % (m_count, m_thresh_length)) stat_sum_df_path = "%d_primary_local.dat" % m_count # load the coordinate data files to be used (phosphate and protein coordinates) - only load up to defined prebind_limit dist_dat = pd.read_csv(dist_path, delim_whitespace = True, skiprows=start_limit, nrows=numrows, header = True, names = ["com", "custom", "min"]) # extract the coordinates from each file to be used (z coord) c1 = dist_dat['min'] # concatenate the coordinates to be used to one data frame and label the columns defect_cond_dat = list(c1) df1 = pd.DataFrame(data = defect_cond_dat, columns = ['dist']) # define a new column, dist. Value is +1 if distance between peptide and phosphate layer is less than or equal to cutoff distance, otherwise dist is 0. df1['sample_space'] = 0 df1.loc[(df1['dist'] <= max_cutoff) & (df1['dist'] >= min_cutoff), 'sample_space'] = 1 # define a new column, dir. value is +1 if peptide moves towards primary leaflet, -1 if moves away. # maybe want to make it 0 if dist is less than 0 (because then in the membrane, and net movement isn't very interesting) df1['dir'] = -1 df1.loc[df1['dist'].shift(-1) < df1['dist'], 'dir'] = 1 # define a new column, dDiff. Value is magnitude of net z-motion from current frame n, to next frame n+1, 0 if cross PBC # note that a negative dDiff value means that the peptide moves toward the membrane df1['dDiff'] = df1['dist'].shift(-1) - df1['dist'] # define new column, M. value is +1 if and only if peptide moves toward primary leaflet by an interval more than m_thresh AND starts within cutoff range. # Value is -1 if doesnt meet threshold but still within cutoff range # move towards membrane if dDiff is negative, thus if dDiff*(-1) > threshold, this means the peptide move towards membrane by at least the threshold amount df1['M'] = 0 df1.loc[(df1['dDiff']*(-1) > m_thresh), 'M'] = 1 df1.loc[(df1['dDiff']*(-1) < m_thresh), 'M'] = -1 df1.loc[df1['sample_space'] != 1, 'M'] = 0 # if using frequency data, load the freq data files to be used #defect = pd.read_csv(freq_path, delim_whitespace = True, header = True, names = ["dyn_num", "defect_area", "comp_val"]) # if using local defect data, load the data file to be used defect = pd.read_csv(freq_path, delim_whitespace = True, header = False, names = ['dyn_num', 'defect_area', 'protein_area', 'comp_val']) #print(defect[:10]) df1['max_defect'] = 0 # define var frame_num, starting at start_limit frame frame_num = start_limit + 1 for index, row in df1.iterrows(): #print(frame_num) # make a set of all of the defects for the given frame number frame_defects = defect[defect['dyn_num'] == frame_num] # define the max defect area from above set # print a list of max defects and compare timeseries to local defect max_defect_area = frame_defects['defect_area'].max() #print(max_defect_area) # insert max defect area of binding surface in max_defect column df1.set_value(index, 'max_defect', max_defect_area) frame_num = frame_num + 1 # build columns of new dataframe to write statistical results to # can clean this up with a loop thresh_d_s = pd.Series(666, index=d_thresh_arr) thresh_m_s = pd.Series(m_thresh, index=d_thresh_arr) M_s = pd.Series(666, index=d_thresh_arr) D_s = pd.Series(666, index=d_thresh_arr) MD_s = pd.Series(666, index=d_thresh_arr) sample_space_s = pd.Series(666, index=d_thresh_arr) # loop over threshold values and conduct D stats for d_thresh in d_thresh_arr: D = 'D_%s' % d_thresh MD = 'MD_m%s_d%s' % (m_thresh, d_thresh) # define new variable D. Value is 1 if max defect is larger than current defect threshold and event is in sample space df1[D] = 0 df1.loc[(df1['max_defect'] >= d_thresh) & (df1['sample_space'] == 1), D] = 1 # define new variable MD. Value is 1 if and only if M is true and D is true df1[MD] = 0 df1.loc[(df1['M'] == 1) & (df1[D] == 1) , MD] = 1 #df1.fillna(0, inplace=True) # M_one_count is number of events that net movement of peptide was towards membrane AND meets distance threshold try: M_count = len(df1.groupby(['M']).groups[1]) except KeyError: M_count = 0 # D_count is number of events that max defect area is larger than or equal to defect area threshold, restricting to those events in sample space try: D_count = len(df1.groupby([D]).groups[1]) except KeyError: D_count = 0 # MD_count is number of events that both M and D_ss are 1 try: MD_count = len(df1.groupby([MD]).groups[1]) except KeyError: MD_count = 0 #sample_space is number of events that do not cross PBC boundary and/or begin within cutoff distance try: sample_space_count = len(df1.groupby(['sample_space']).groups[1]) except KeyError: sample_space_count = 0 thresh_d_s.set_value(d_thresh, d_thresh) M_s.set_value(d_thresh, M_count) D_s.set_value(d_thresh, D_count) MD_s.set_value(d_thresh, MD_count) sample_space_s.set_value(d_thresh, sample_space_count) # concatenate the coordinates to be used to one data frame and label the columns stat_sum = list(zip(thresh_d_s, thresh_m_s, sample_space_s, M_s, D_s, MD_s)) df2 = pd.DataFrame(data = stat_sum, columns = ['d_thresh','m_thresh','sample_space', 'M', 'D','MD']) df2.to_csv(stat_sum_df_path, sep = ' ') # remove header if it is not the first file if m_count != 1: with open(stat_sum_df_path, 'r') as fin: data = fin.read().splitlines(True) with open(stat_sum_df_path, 'w') as fout: fout.writelines(data[1:]) # combine all primary-surface-only files into one read_files_1 = glob.glob("*primary_local.dat") read_files_1 read_files = sorted(read_files_1, key=natural_key) read_files with open("freq_stat_total.dat", "wb") as outfile: for f in read_files: with open(f, "rb") as infile: outfile.write(infile.read())