# Measures distance between center of mass (COM), nearest residue, and a custom selection to the average phopshate layer of a specified binding leaflet # Measurements have sign accuracy, so the distance is negative if the peptide is below the average phosphate layer of the membrane # Measurements are accurate even if the peptide crosses the PBC boundary # Output is a single data file with a column for each measurement type time series # ASSUMPTION: membrane centered at origin (z-coord), PBC cell is centered at origin, membrane thickness is constant in z-direction over timeseries (take average) # Written by Ky Wildermuth, 11/16. Feel free to edit. Feel free to ask questions: kywildermuth@gmail.com import pandas as pd import numpy as np top_phos_path = "top_phos_coord.dat" bot_phos_path = "bot_phos_coord.dat" prot_path = "prot_coord.dat" prot_min_path = "prot_minmax_coord.dat" prot_custom_path = "custom_coord.dat" PBC_cell_path = "PBC_coord.dat" ### USER DEFINED PARAMETERS ### # define primary interaction leaflet (value can be top or bottom) leaflet = 'top' # can specify end limit if desired, though default is to keep this the same length as num_frame (see below) prebind_limit = 4000 # define the number of dcd num_dcd = 1 #define the number of frames per dcd num_frame = 4000 ########################## # compute and define the theoretical total number of data points per coor file (will be used to check if files have same number of data entries) theo_length = num_dcd*num_frame # confirm valid and correct leaflet choice leaflet_choices = set(['top','bottom']) if leaflet in leaflet_choices: print("primary leaflet is %s" % leaflet) else: print('not a valid leaflet choice. Choose top or bottom') exit() # load the coordinate data files to be used (phosphate and protein coordinates) - only load up to defined prebind_limit tp = pd.read_csv(top_phos_path, delim_whitespace = True, nrows=prebind_limit, header = None, names = ["dyn_num", "tp_x", "tp_y", "tp_z"]) bp = pd.read_csv(bot_phos_path, delim_whitespace = True, nrows=prebind_limit, header = None, names = ["dyn_num", "bp_x", "bp_y", "bp_z"]) prot = pd.read_csv(prot_path, delim_whitespace = True, nrows=prebind_limit, header = None, names = ["dyn_num", "prot_x", "prot_y", "prot_z"]) prot_min = pd.read_csv(prot_min_path, delim_whitespace = True, nrows=prebind_limit, header = None, names = ["dyn_num", "prot_x", "prot_y", "prot_z", "prot_mx", "prot_my", "prot_mz"]) prot_custom = pd.read_csv(prot_custom_path, delim_whitespace = True, nrows=prebind_limit, header = None, names = ["dyn_num", "prot_x", "prot_y", "prot_z"]) PBC = pd.read_csv(PBC_cell_path, delim_whitespace = True, nrows = prebind_limit, header = None, names = ["dyn_num", "cell_x", "cell_y", "cell_z", "ang_x", "ang_y", "ang_z"]) #### PROT COM ### # extract the coordinates from each file to be used (z coord) c1 = tp['tp_z'] c2 = bp['bp_z'] c3 = prot['prot_z'] c4 = PBC['cell_z'] # concatenate the coordinates to be used to one data frame and label the columns defect_cond_dat = list(zip(c1,c2,c3,c4)) df1 = pd.DataFrame(data = defect_cond_dat, columns = ['tp_z', 'bp_z', 'prot_z', 'cell_z']) # compute average z-dimensions for tp, bp, cell tp_avg = df1["tp_z"].mean() bp_avg = df1["bp_z"].mean() p_avg = (abs(tp_avg) + abs(bp_avg))/2 cell_avg = df1["cell_z"].mean() # define PBC_z as the average distance from leaflet surface to PBC boundary PBC_z = ((cell_avg)/2) - p_avg # visual check if assumptions are valid: print(df1.describe()) yes = set(['yes','y', 'ye', '']) no = set(['no','n']) print 'Confirm that cell z-coordinate is constant and is symmetrical about membrane origin [y/n]' choice = raw_input().lower() if choice in yes: print "confirmed" elif choice in no: print('Ending analysis') exit(1) print('PBC_z is %d' % PBC_z) # compute the z-distance between phosphate group of each bilayer and the protein COM df1['top_diff'] = df1['tp_z'] - df1['prot_z'] df1['bot_diff'] = df1['bp_z'] - df1['prot_z'] # take the absolute value of each computation above; the protein is closer to the bilayer with the smaller z-dist df1['top_diff_abs'] = abs(df1['top_diff']) df1['bot_diff_abs'] = abs(df1['bot_diff']) # define a new column, tru_neg, value is -1 if prot_z is less than tp_z and greater than bp_z; otherwise 0 df1['tru_neg'] = 1 df1.loc[(df1['prot_z'] < df1['tp_z']) & (df1['prot_z'] > df1['bp_z']), 'tru_neg'] = -1 # define a new column, tru_diff, value is difference between primary leaflet and peptide, across PBC if applicable df1['tru_diff_com'] = 0 if leaflet == 'top': df1.loc[df1['top_diff_abs'] < df1['bot_diff_abs'], 'tru_diff_com'] = df1['top_diff_abs']*df1['tru_neg'] df1.loc[df1['bot_diff_abs'] < df1['top_diff_abs'], 'tru_diff_com'] = 2*PBC_z - df1['bot_diff_abs']*df1['tru_neg'] elif leaflet == 'bottom': df1.loc[df1['bot_diff_abs'] < df1['top_diff_abs'], 'tru_diff_com'] = df1['bot_diff_abs']*df1['tru_neg'] df1.loc[df1['top_diff_abs'] < df1['bot_diff_abs'], 'tru_diff_com'] = 2*PBC_z - df1['top_diff_abs']*df1['tru_neg'] df2 = pd.concat([df1['tru_diff_com']], axis=1, keys=['tru_diff_com']) #df2.to_csv('noPBC_dist_prot.dat', sep = ' ', index = False, header = False) ### custom COM ### # extract the coordinates from each file to be used (z coord) c1 = tp['tp_z'] c2 = bp['bp_z'] c3 = prot_custom['prot_z'] c4 = PBC['cell_z'] # concatenate the coordinates to be used to one data frame and label the columns defect_cond_dat = list(zip(c1,c2,c3,c4)) df1 = pd.DataFrame(data = defect_cond_dat, columns = ['tp_z', 'bp_z', 'prot_z', 'cell_z']) # remove the brackets from the dataframe (if they exist) df1 = df1.replace( '[\},)]','', regex=True ) # convert to string df1.prot_z = df1.prot_z.astype(float) #print df1[:10] # compute the z-distance between phosphate group of each bilayer and the protein COM df1['top_diff'] = df1['tp_z'] - df1['prot_z'] df1['bot_diff'] = df1['bp_z'] - df1['prot_z'] # take the absolute value of each computation above; the protein is closer to the bilayer with the smaller z-dist df1['top_diff_abs'] = abs(df1['top_diff']) df1['bot_diff_abs'] = abs(df1['bot_diff']) # define a new column, tru_neg, value is -1 if prot_z is less than tp_z and greater than bp_z; otherwise 0 df1['tru_neg'] = 1 df1.loc[(df1['prot_z'] < df1['tp_z']) & (df1['prot_z'] > df1['bp_z']), 'tru_neg'] = -1 # define a new column, tru_diff_custom, value is difference between primary leaflet and peptide, across PBC if applicable df1['tru_diff_custom'] = 0 if leaflet == 'top': df1.loc[df1['top_diff_abs'] < df1['bot_diff_abs'], 'tru_diff_custom'] = df1['top_diff_abs']*df1['tru_neg'] df1.loc[df1['bot_diff_abs'] < df1['top_diff_abs'], 'tru_diff_custom'] = 2*PBC_z - df1['bot_diff_abs']*df1['tru_neg'] elif leaflet == 'bottom': df1.loc[df1['bot_diff_abs'] < df1['top_diff_abs'], 'tru_diff_custom'] = df1['bot_diff_abs']*df1['tru_neg'] df1.loc[df1['top_diff_abs'] < df1['bot_diff_abs'], 'tru_diff_custom'] = 2*PBC_z - df1['top_diff_abs']*df1['tru_neg'] df3 = pd.concat([df1['tru_diff_custom']], axis=1, keys=['tru_diff_custom']) #df3.to_csv('noPBC_dist_custom.dat', sep = ' ', index = False, header = False) ### MIN COM ### # extract the coordinates from each file to be used (z coord) c1 = tp['tp_z'] c2 = bp['bp_z'] c3 = prot_min['prot_z'] c4 = prot_min['prot_mz'] c5 = PBC['cell_z'] # concatenate the coordinates to be used to one data frame and label the columns defect_cond_dat = list(zip(c1,c2,c3,c4,c5)) df1 = pd.DataFrame(data = defect_cond_dat, columns = ['tp_z', 'bp_z', 'prot_z_min', 'prot_z_max', 'cell_z']) # remove the brackets from the dataframe (if they exist) df1 = df1.replace( '[\},)]','', regex=True ) # convert to string df1.prot_z_min= df1.prot_z_min.astype(float) # convert to string df1.prot_z_max= df1.prot_z_max.astype(float) # compute the z-distance between phosphate group of each bilayer and the protein min df1['top_diff_min'] = df1['tp_z'] - df1['prot_z_min'] df1['bot_diff_min'] = df1['bp_z'] - df1['prot_z_min'] # take the absolute value of each computation above; the protein is closer to the bilayer with the smaller z-dist df1['top_diff_abs_min'] = abs(df1['top_diff_min']) df1['bot_diff_abs_min'] = abs(df1['bot_diff_min']) # compute the z-distance between phosphate group of each bilayer and the protein max df1['top_diff_max'] = df1['tp_z'] - df1['prot_z_max'] df1['bot_diff_max'] = df1['bp_z'] - df1['prot_z_max'] # take the absolute value of each computation above; the protein is closer to the bilayer with the smaller z-dist df1['top_diff_abs_max'] = abs(df1['top_diff_max']) df1['bot_diff_abs_max'] = abs(df1['bot_diff_max']) # define a new column, tru_neg, value is -1 if prot_z is less than tp_z and greater than bp_z; otherwise 0 df1['tru_neg_min'] = 1 df1['tru_neg_max'] = 1 df1.loc[(df1['prot_z_min'] < df1['tp_z']) & (df1['prot_z_min'] > df1['bp_z']), 'tru_neg_min'] = -1 df1.loc[(df1['prot_z_max'] < df1['tp_z']) & (df1['prot_z_max'] > df1['bp_z']), 'tru_neg_max'] = -1 df1['tru_diff_min'] = 0 if leaflet == 'top' : df1['top_diff_abs'] = df1['top_diff_abs_min'] df1['bot_diff_abs'] = df1['bot_diff_abs_min'] df1.loc[df1['top_diff_abs'] < df1['bot_diff_abs'], 'tru_diff_min'] = df1['top_diff_abs']*df1['tru_neg_min'] df1.loc[df1['bot_diff_abs'] < df1['top_diff_abs'], 'tru_diff_min'] = 2*PBC_z - df1['bot_diff_abs']*df1['tru_neg_min'] elif leaflet == 'bottom' : df1['top_diff_abs'] = df1['top_diff_abs_max'] df1['bot_diff_abs'] = df1['bot_diff_abs_max'] df1.loc[df1['bot_diff_abs'] < df1['top_diff_abs'], 'tru_diff_min'] = df1['bot_diff_abs']*df1['tru_neg_max'] df1.loc[df1['top_diff_abs'] < df1['bot_diff_abs'], 'tru_diff_min'] = 2*PBC_z - df1['top_diff_abs']*df1['tru_neg_max'] df4 = pd.concat([df1['tru_diff_min']], axis=1, keys=['tru_diff_min']) #df4.to_csv('noPBC_dist_min.dat', sep = ' ', index = False, header = False) l1 = df2['tru_diff_com'] l2 = df3['tru_diff_custom'] l3 = df4['tru_diff_min'] truDiff_all = list(zip(l1,l2,l3)) df1 = pd.DataFrame(data = truDiff_all, columns = ['com', 'custom', 'min']) df1.to_csv('binding_distances.dat', sep = ' ', index = False, header = True)