# determines the frequency of peptide 
# uses local_defect output from blob script and truDiff coordinates output 
# Written by Ky Wildermuth, 11/16. Feel free to edit. Feel free to ask questions: kywildermuth@gmail.com


import pandas as pd
import numpy as np
import glob
import re 


#### USER INPUT PARAMETERS #####
# define molecular interaction cutoff in Angstroms (choose large max_cutoff (ex 100) and small min_cutoff (ex -100) if don't want)
max_cutoff = 20
min_cutoff = 0
# define the frame at which the analysis ends (typically before binding is initiated)
prebind_limit = 1800 
# define the frame at which analysis begins (can set to 0 but allows for more control of range)
start_limit = 0
# define the number of dcd 
num_dcd = 1
#define the number of frames per dcd
num_frame = 4000
# define the defect area (d) and movement (m) threshold ranges and intervals. d is measured in Ang^2 and m in Ang. 
d_thresh_arr = range(0,301)
m_thresh_arr = np.arange(0,5,0.1)

######################


m_thresh_length = len(m_thresh_arr)
# compute and define the theoretical total number of data points per coor file (will be used to check if files have same number of data entries)
theo_length = num_dcd*num_frame

# load the data files
dist_path = "binding_distances.dat"
freq_path = "local_defect_top.dat"

# define number of rows, numrows
numrows = prebind_limit - start_limit

# open data files for line check
dist = open(dist_path)
freq = open(freq_path)


files = [freq, dist]

# define a natural sort function type for numerical file combination
def natural_key(string_):
	return [int(s) if s.isdigit() else s for s in re.split(r'(\d+)', string_)]

#check to make sure data files have the same number of lines
def file_len(fname):
	with fname as f:
	    for i, l in enumerate(f):
	        pass
	return i + 1

for fname in files:
	length = file_len(fname)
	print fname,",", length

yes = set(['yes','y', 'ye', ''])
no = set(['no','n'])
print 'Confirm the coordinate files all have a length of', theo_length, '[y/n]'
choice = raw_input().lower()
if choice in yes:
   print "confirmed"
elif choice in no:
    print('Check input files and try again')
    exit(1)

# loop through movement threshold array
m_count = start_limit
for m_thresh in m_thresh_arr:
	m_count += 1
	print('%d of %d' % (m_count, m_thresh_length))
	stat_sum_df_path = "%d_primary_local.dat" % m_count

	# load the coordinate data files to be used (phosphate and protein coordinates)  - only load up to defined prebind_limit
	dist_dat = pd.read_csv(dist_path, delim_whitespace = True, skiprows=start_limit, nrows=numrows, header = True, names = ["com", "custom", "min"])


	# extract the coordinates from each file to be used (z coord)
	c1 = dist_dat['min']

	# concatenate the coordinates to be used to one data frame and label the columns
	defect_cond_dat = list(c1)
	df1 = pd.DataFrame(data = defect_cond_dat, columns = ['dist'])

	# define a new column, dist. Value is +1 if distance between peptide and phosphate layer is less than or equal to cutoff distance, otherwise dist is 0. 
	df1['sample_space'] = 0
	df1.loc[(df1['dist'] <= max_cutoff) & (df1['dist'] >= min_cutoff), 'sample_space'] = 1

	# define a new column, dir. value is +1 if peptide moves towards primary leaflet, -1 if moves away. 
	# maybe want to make it 0 if dist is less than 0 (because then in the membrane, and net movement isn't very interesting)
	df1['dir'] = -1
	df1.loc[df1['dist'].shift(-1) < df1['dist'], 'dir'] = 1

	# define a new column, dDiff. Value is magnitude of net z-motion from current frame n, to next frame n+1, 0 if cross PBC
	# note that a negative dDiff value means that the peptide moves toward the membrane
	df1['dDiff'] = df1['dist'].shift(-1) - df1['dist'] 

	# define new column, M. value is +1 if and only if peptide moves toward primary leaflet by an interval more than m_thresh AND starts within cutoff range. 
	# Value is -1 if doesnt meet threshold but still within cutoff range
	# move towards membrane if dDiff is negative, thus if dDiff*(-1) > threshold, this means the peptide move towards membrane by at least the threshold amount
	df1['M'] = 0
	df1.loc[(df1['dDiff']*(-1) > m_thresh), 'M'] = 1
	df1.loc[(df1['dDiff']*(-1) < m_thresh), 'M'] = -1
	df1.loc[df1['sample_space'] != 1, 'M'] = 0

	# if using frequency data, load the freq data files to be used
	#defect = pd.read_csv(freq_path, delim_whitespace = True, header = True, names = ["dyn_num", "defect_area", "comp_val"])

	# if using local defect data, load the data file to be used
	defect = pd.read_csv(freq_path, delim_whitespace = True, header = False, names = ['dyn_num', 'defect_area', 'protein_area', 'comp_val'])
	#print(defect[:10])
	df1['max_defect'] = 0
	
	# define var frame_num, starting at start_limit frame
	frame_num = start_limit + 1
	for index, row in df1.iterrows():
		#print(frame_num)
		# make a set of all of the defects for the given frame number
		frame_defects = defect[defect['dyn_num'] == frame_num]
		# define the max defect area from above set
		# print a list of max defects and compare timeseries to local defect
		max_defect_area = frame_defects['defect_area'].max()
		#print(max_defect_area)
		# insert max defect area of binding surface in max_defect column
		df1.set_value(index, 'max_defect', max_defect_area)
		frame_num = frame_num + 1


	# build columns of new dataframe to write statistical results to
	# can clean this up with a loop
	thresh_d_s = pd.Series(666, index=d_thresh_arr)
	thresh_m_s = pd.Series(m_thresh, index=d_thresh_arr)
	M_s = pd.Series(666, index=d_thresh_arr)
	D_s = pd.Series(666, index=d_thresh_arr)
	MD_s = pd.Series(666, index=d_thresh_arr)
	sample_space_s = pd.Series(666, index=d_thresh_arr)


	# loop over threshold values and conduct D stats
	for d_thresh in d_thresh_arr:
		D = 'D_%s' % d_thresh
		MD = 'MD_m%s_d%s' % (m_thresh, d_thresh)

		# define new variable D. Value is 1 if max defect is larger than current defect threshold and event is in sample space
		df1[D] = 0
		df1.loc[(df1['max_defect'] >= d_thresh) & (df1['sample_space'] == 1), D] = 1

		# define new variable MD. Value is 1 if and only if M is true and D is true
		df1[MD] = 0
		df1.loc[(df1['M'] == 1) & (df1[D] == 1) , MD] = 1		

		#df1.fillna(0, inplace=True)

		# M_one_count is number of events that net movement of peptide was towards membrane AND meets distance threshold
		try:
			M_count = len(df1.groupby(['M']).groups[1])
		except KeyError:
			M_count = 0			
		# D_count is number of events that max defect area is larger than or equal to defect area threshold, restricting to those events in sample space
		try:	
			D_count = len(df1.groupby([D]).groups[1])
		except KeyError:
			D_count = 0	
		# MD_count is number of events that both M and D_ss are 1
		try:	
			MD_count = len(df1.groupby([MD]).groups[1])
		except KeyError:
			MD_count = 0				
		#sample_space is number of events that do not cross PBC boundary and/or begin within cutoff distance
		try:
			sample_space_count = len(df1.groupby(['sample_space']).groups[1])
		except KeyError:
			sample_space_count = 0	
					
		thresh_d_s.set_value(d_thresh, d_thresh)
		M_s.set_value(d_thresh, M_count)
		D_s.set_value(d_thresh, D_count)
		MD_s.set_value(d_thresh, MD_count)
		sample_space_s.set_value(d_thresh, sample_space_count)
	

	# concatenate the coordinates to be used to one data frame and label the columns
	stat_sum = list(zip(thresh_d_s, thresh_m_s, sample_space_s, M_s, D_s, MD_s))
	df2 = pd.DataFrame(data = stat_sum, columns = ['d_thresh','m_thresh','sample_space', 'M', 'D','MD'])
	df2.to_csv(stat_sum_df_path, sep = ' ')

	# remove header if it is not the first file
	if m_count != 1:
		with open(stat_sum_df_path, 'r') as fin:
			data = fin.read().splitlines(True)
		with open(stat_sum_df_path, 'w') as fout:
			fout.writelines(data[1:])	

# combine all primary-surface-only files into one 
read_files_1 = glob.glob("*primary_local.dat")
read_files_1
read_files = sorted(read_files_1, key=natural_key)
read_files
with open("freq_stat_total.dat", "wb") as outfile:
	for f in read_files:
		with open(f, "rb") as infile:
			outfile.write(infile.read())