import numpy as np
import neo
import elephant
from quantities import Hz, s
import pandas as pd

############## Create weight_change_event array, then binned spike train array, for calculating weight change triggered average ########
def weight_change_events_spikes_calcium(files, simtime, params, warnings):
    #### This loops over every data file ######
    from scipy.signal import find_peaks
    from elephant import kernels
    neighbors = params['neighbors']
    bins_per_sec = params['bins_per_sec']
    samp_rate = params['samp_rate']
    length_of_firing_rate_vector = params['length_of_firing_rate_vector']
    ca_downsamp = params['ca_downsamp']
    ITI = params['ITI']
    down_samp = int(samp_rate / bins_per_sec)
    binned_trains_index = []
    trains = []
    all_ca_array = []
    ca_index = []
    # dt = 0.1e-3 (0.1 ms), so a stepsize of 100 data points is equivalent to 10 ms; a step size of 1000 datapoints = 100 ms
    t1weight_dist = {'mean': [], 'std': [], 'no change': [], 'files': []}
    ## Detect weight change events
    #stepsize = 1000
    weight_at_times = np.arange(0., simtime, ITI)  #first trial starts at time=0.0
    weight_at_index = (weight_at_times * samp_rate).astype(np.int64)
    df = []
    wce_df = []
    for i, (trial, f) in enumerate(files.items()):
        weightchangelist = []
        dframelist = []
        missing = 0
        d = np.load(f, mmap_mode='r')
        print(i, 'analyzing', trial)
        index_for_weight = (np.abs(d['time'] - 2)).argmin()  #1st trial ending weight = weight just prior to next trial; 2 = ITI?
        t1weight = [d[n][index_for_weight] for n in d.dtype.names if 'plas' in n]
        t1weight_dist['mean'].append(np.mean(t1weight))
        t1weight_dist['std'].append(np.std(t1weight))
        t1weight_dist['no change'].append(t1weight.count(1.0))
        t1weight_dist['files'].append(trial)
        #plas_names=[nm for nm in d.dtype.names if 'plas' in nm] #includes synaptic inputs directly onto dendrites
        plas_names = [nm for nm in d.dtype.names if 'plas' in nm and 'head' in nm]  #plas names may differ for each file if using different seeds
        extern_names = [nm for nm in plas_names if 'extern1' in nm]
        for n in plas_names:
            #print(n,d[n][-1])
            endweight = d[n][-1]
            if '_to_' in n:
                spine = n.split('_to_')[-1].replace('-', '_').strip('plas')
                stim = True
                spikecount = len(find_peaks(d[n.rstrip('plas')])[0])
            elif 'ampa' in n:
                spine = '_'.join(n.split('/')[-1].split('_')[1:-1])
                stim = False
                spikecount = 0
            weight_change_variance = np.var(np.diff(d[n]))
            spinedist = np.nan  #[v for k2,v in spinedistdict.items() if spine in '_'.join(k2.split('/')[-2:]).replace('[0]','')][0]
            #print(n,spine,endweight,stimvariability,spinedist)
            dframelist.append((spine, endweight, trial, spinedist, stim, spikecount, weight_change_variance))
        for jj, syn in enumerate(extern_names):
            weightchange = np.diff(d[syn][weight_at_index])
            startingweight = d[syn][weight_at_index]
            for k, w in enumerate(weightchange):
                weightchangelist.append((trial, syn, weight_at_times[k + 1], startingweight[k], w))
            peaks, _ = find_peaks(d[syn.rpartition('plas')[0]])
            peaktimes = d['time'][peaks]
            ### create SpikeTrain for every trial/synapse
            train = neo.SpikeTrain(peaktimes, t_stop=simtime * s, units=s)
            trains.append(train)
            binned_trains_index.append((trial, syn))
            #extract calcium traces.  Must edit to match naming convention
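            # Assumed naming convention, inferred from the string handling below (not verified against
            # the simulator output): a stimulated spine synapse such as '..._to_<dend>_3-sp<k>headplas'
            # maps to the calcium record '/data/D1_sp<k><dend>_3Shell_0', while a non-spine synapse
            # maps to '/data/D1_<dend>Shell_0'.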
            if '-sp' in syn:
                shellnum = syn.split('_3-')[0].split('_to_')[-1]
                spnum = syn.split('-sp')[-1].split('headplas')[0]
                shell = '/data/D1_sp' + spnum + shellnum + '_3Shell_0'
            else:
                shellnum = syn.split('_to_')[-1].split('_')[0]
                shell = '/data/D1_' + shellnum + 'Shell_0'
            if shell in d.dtype.names:
                ca_index.append((trial, shell))
                #jj=i*len(ca_names)+j
                all_ca_array.append(d[shell][::ca_downsamp])
            else:
                missing += 1
                if missing < warnings:
                    print(missing, syn, shell, ' NOT IN FILE', trial)
        if missing > warnings - 1:
            print(missing, 'extern_names have no calcium conc')
        df.append(pd.DataFrame(dframelist, columns=['spine', 'endweight', 'trial', 'spinedist', 'stim', 'spikecount', 'weight_change_variance']))
        wce_df.append(pd.DataFrame(weightchangelist, columns=['trial', 'synapse', 'time', 'startingweight', 'weightchange']))
    alldf = pd.concat(df).reset_index()
    weight_change_event_df = pd.concat(wce_df).reset_index()
    ### instantaneous firing rate array - created from SpikeTrains
    kernel = kernels.GaussianKernel(sigma=100e-3 * s)
    numtrains = len(trains)  #use extern_names
    inst_rate_array = np.zeros((numtrains, int(simtime * bins_per_sec)))
    for jj, train in enumerate(trains):
        train.sampling_rate = samp_rate * Hz
        inst_rate_array[jj, :] = elephant.statistics.instantaneous_rate(train, train.sampling_period, kernel=kernel)[::down_samp, 0].flatten()
    return alldf, weight_change_event_df, inst_rate_array, trains, binned_trains_index, np.array(all_ca_array), ca_index, t1weight_dist


def add_spine_soma_dist(wce_df, df, spine_soma_dist_file, warnings=5):
    if isinstance(spine_soma_dist_file, pd.core.series.Series):
        spine_soma_dist = spine_soma_dist_file
    else:
        spine_soma_dist_df = pd.read_csv(spine_soma_dist_file, index_col=0)
        spine_soma_dist = spine_soma_dist_df.distance
    wce_df['spine_soma_dist'] = np.nan
    missing = 0
    for wce in wce_df.index:
        syn = wce_df.synapse[wce]
        spine = syn.split('_to_')[-1].replace('plas', '').replace('-sp', '_sp')
        if 'head' in spine:
            #need to add distance to soma for non-spine plasticity
            wce_df.loc[wce, 'spine_soma_dist'] = spine_soma_dist.loc[spine]
        else:
            dist = np.mean([spine_soma_dist[i] for i in spine_soma_dist.index if spine in i])
            wce_df.loc[wce, 'spine_soma_dist'] = dist
            missing += 1
            if missing < warnings:
                print('wce in add_spine_soma_dist', syn, ':::', spine, 'not spine head, use mean of spine distances for that compartment')
    if missing > warnings - 1:
        print('wce, add_spine_soma_dist', missing, 'syn are not spine head, use mean of spine distances for that compartment')
    missing = 0
    for syn in df.spine:
        if 'head' in syn:
            df.loc[df.spine == syn, 'spinedist'] = 1e6 * spine_soma_dist.loc[syn]  #distance in um
        else:
            missing += 1
            df.loc[df.spine == syn, 'spinedist'] = 1e6 * np.mean([spine_soma_dist[i] for i in spine_soma_dist.index if syn in i])
            if missing < warnings:
                print('df in add_spine_soma_dist', syn, '::: not spine head, use mean of spine distances for that compartment')
    if missing > warnings - 1:
        print('df, add_spine_soma_dist', missing, 'syn are not spine head, use mean of spine distances for that compartment')
    return wce_df, df


def add_cluster_info(wce_df, df, cluster_file):
    import pickle
    f = open(cluster_file, 'rb')
    mydata = pickle.load(f)
    clusterlist = [[str(m['seed']), m['ClusteringParams']['cluster_length'], m['ClusteringParams']['n_spines_per_cluster']] for m in mydata]
    clusterdf = pd.DataFrame(clusterlist, columns=['trial', 'cluster_length', 'spines_per_cluster'])
    print('cluster correlations', clusterdf['cluster_length'].corr(clusterdf['spines_per_cluster']))
    wce_df = pd.merge(wce_df, clusterdf, on='trial', validate='many_to_one')
    df = pd.merge(df, clusterdf, on='trial', validate='many_to_one')
    return wce_df, df
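# -----------------------------------------------------------------------------
# Random-forest regression utilities.  A minimal, hypothetical call (the array
# names below are illustrative, not defined in this module):
#   reg, fit, pred, score, importances = do_random_forest(X_train, y_train,
#                                                         X_test, y_test, n_est=100)
# -----------------------------------------------------------------------------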
def do_random_forest(X_train, y_train, X_test, y_test, n_est=100, feat=''):
    from sklearn.ensemble import RandomForestRegressor
    regr = RandomForestRegressor(random_state=0, n_estimators=n_est)  #,max_features='sqrt')
    reg = regr.fit(X_train, y_train)
    fit = reg.predict(X_train)
    pred = reg.predict(X_test)
    print('FEATURES', feat, np.shape(X_train)[1])
    print('Training set score: ', reg.score(X_train, y_train), ' Testing Set score: ', reg.score(X_test, y_test))
    score = {'train': reg.score(X_train, y_train), 'test': reg.score(X_test, y_test)}
    return reg, fit, pred, score, reg.feature_importances_[:]


def downsample_X(Xinput, numbins, weight_change_event_df, num_events, add_start_wt=True):
    downsamp = np.shape(Xinput)[1] // numbins
    #Xbins=Xinput[:,::downsamp] #simplest is to downsample by 10
    Xbins = np.zeros((np.shape(Xinput)[0], numbins))  #Better is to average across bins
    for b in range(numbins):
        Xbins[:, b] = np.mean(Xinput[:, b * downsamp:(b + 1) * downsamp], axis=1)
    feature_labels = [str(b) for b in range(numbins)]
    if np.std(weight_change_event_df['startingweight'].to_numpy()) > 0 and add_start_wt:
        Xbins = np.concatenate((Xbins, weight_change_event_df['startingweight'].to_numpy().reshape(num_events, 1)), axis=1)
        feature_labels.append('start_wt')
    return Xbins, feature_labels


def create_set_of_Xarrays(newX, adjX, numbins, weight_change_event_df, all_cor, num_events, lin=False):
    Xbins, feat_lbl = downsample_X(newX, numbins, weight_change_event_df, num_events)
    if lin:
        Xbins_adj, _ = downsample_X(adjX, 1, weight_change_event_df, num_events, add_start_wt=False)
        Xbins_combined = np.concatenate((Xbins, Xbins_adj), axis=1)
        set_of_Xbins = {'': Xbins, '_adj': Xbins_combined}
    else:
        Xbins_adj, _ = downsample_X(adjX, numbins, weight_change_event_df, num_events, add_start_wt=False)
        Xbins_combined = np.concatenate((Xbins, Xbins_adj), axis=1)
        Xbins_dist = np.concatenate((Xbins, weight_change_event_df['spine_soma_dist'].to_numpy().reshape(num_events, 1)), axis=1)
        #Xbins_clust=np.concatenate((Xbins,weight_change_event_df['cluster_length'].to_numpy().reshape(num_events,1), weight_change_event_df['spines_per_cluster'].to_numpy().reshape(num_events,1)),axis=1)
        Xbins_clust = np.concatenate((Xbins, weight_change_event_df['cluster_length'].to_numpy().reshape(num_events, 1)), axis=1)
        #Xbins_adj_clust=np.concatenate((Xbins_combined,weight_change_event_df['cluster_length'].to_numpy().reshape(num_events,1), weight_change_event_df['spines_per_cluster'].to_numpy().reshape(num_events,1)),axis=1)
        set_of_Xbins = {'': Xbins, '_adj': Xbins_combined, '_dist': Xbins_dist, '_clust': Xbins_clust}  #,'_adj_clust':Xbins_adj_clust}
    return set_of_Xbins, feat_lbl


def random_forest_LeaveNout(newX, adjacentX, y, weight_change_event_df, all_cor, bin_set, num_events, trials=3, linear=False):
    from sklearn.model_selection import train_test_split
    from sklearn import linear_model
    feature_types = ['', '_adj', '_dist', '_clust']  #, '_adj_clust']
    reg_score = {str(bn) + k: [] for bn in bin_set for k in feature_types}
    feature_import = {str(bn) + k: [] for bn in bin_set for k in feature_types}
    linreg_score = {str(bn) + k: [] for bn in bin_set for k in feature_types}
    for numbins in bin_set:
        set_of_Xbins, feat_lbl = create_set_of_Xarrays(newX, adjacentX, numbins, weight_change_event_df, all_cor, num_events, lin=linear)
        for i in range(trials):
            for feat in list(set(set_of_Xbins.keys()) & set(feature_types)):
                X_train, X_test, y_train, y_test = train_test_split(set_of_Xbins[feat], y, test_size=1 / trials)
                if linear:
                    linreg = linear_model.LinearRegression(fit_intercept=True).fit(X_train, y_train)
                    linreg_score[str(numbins) + feat].append({'train': linreg.score(X_train, y_train), 'test': linreg.score(X_test, y_test), 'coef': linreg.coef_})
                reg, fit, pred, regscore, feat_vals = do_random_forest(X_train, y_train, X_test, y_test, feat=feat)
                reg_score[str(numbins) + feat].append(regscore)
                feature_import[str(numbins) + feat] = feat_vals
        adj_labels = ['adj' + str(i) + ' or other' for i in range(numbins)]
        feature_import[str(numbins) + '_features'] = feat_lbl + adj_labels
    return reg_score, feature_import, linreg_score


def random_forest_variations(newX, adjacentX, y, weight_change_event_df, all_cor, bin_set, num_events, wt_change_title, RF_plots, linear=False):
    from sklearn.model_selection import train_test_split
    from sklearn import linear_model
    from plas_sim_plots import rand_forest_plot
    if RF_plots:
        from matplotlib import pyplot as plt
    reg_score = {}
    feature_import = {}
    linreg_score = {}
    for numbins in bin_set:
        set_of_Xbins, feat_lbl = create_set_of_Xarrays(newX, adjacentX, numbins, weight_change_event_df, all_cor, num_events, lin=linear)
        print('###########', numbins, ' bins', ' ##############')
        for feat, Xarray in set_of_Xbins.items():
            print(' ##', feat[1:], ', features:', np.shape(Xarray)[1], ' ##')
            X_train, X_test, y_train, y_test = train_test_split(Xarray, y, test_size=0.1, random_state=42)
            if linear:
                linreg = linear_model.LinearRegression(fit_intercept=True).fit(X_train, y_train)
                print('linear score:', linreg.score(X_train, y_train), ', coef:', linreg.coef_, linreg.intercept_)
                #rfe = RFE(estimator=linear_model.LinearRegression(), n_features_to_select=1, step=1)
                #rfe.fit(X_train, y_train)
                #ranking = rfe.ranking_
                #print('selecting subset of columns',[newX.columns[ranking==i] for i in range(1,11)])
                linreg_score[str(numbins) + feat] = {'score': linreg.score(X_test, y_test), 'coef': linreg.coef_}
            reg, fit, pred, regscore, feat_vals = do_random_forest(X_train, y_train, X_test, y_test, feat=feat)
            reg_score[str(numbins) + feat] = regscore
            feature_import[str(numbins) + feat] = feat_vals
            if RF_plots:
                rand_forest_plot(y_train, y_test, fit, pred, str(numbins) + ' bins ' + feat + wt_change_title)
        adj_labels = ['adj' + str(i) + ' or other' for i in range(numbins)]
        feature_import[str(numbins) + '_features'] = feat_lbl + adj_labels
    if RF_plots:
        for numbins in bin_set:
            fimport, ax = plt.subplots()
            for regname, features in feature_import.items():
                if regname.startswith(str(numbins)) and not regname.endswith('features'):
                    ax.plot(features, label=regname + ',score=' + str(round(reg_score[regname]['test'], 3)))
            ax.set_ylabel('feature importance')
            ax.set_xticks(range(len(feature_import[str(numbins) + '_features'])))
            ax.set_xticklabels(feature_import[str(numbins) + '_features'])
            ax.set_xlabel('feature name')
            ax.legend()
    return reg_score, feature_import, linreg_score


def RF_oob(weight_change_alligned_array, all_cor, weight_change_event_df, y, RF_plots=False):
    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import RandomForestRegressor
    from matplotlib import pyplot as plt
    ################### Try Random Forest with oob_score=True and max_features='sqrt' ##########
    wce_len = np.shape(weight_change_alligned_array.max(1).T[:, 0])[0]
    X = weight_change_alligned_array.max(1).T[:, 0].reshape(wce_len, 1)
    #Xt = combined_neighbors_array.max(0).reshape(wce_len,1)
    Xt = all_cor.reshape(wce_len, 1)
    X = np.concatenate((X, weight_change_event_df.startingweight.to_numpy().reshape(wce_len, 1), Xt), axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.01, random_state=42)
    regr = RandomForestRegressor(100, random_state=0, oob_score=True, max_features='sqrt')
    reg = regr.fit(X_train, y_train)
    fit = reg.predict(X_train)
    pred = reg.predict(X_test)
    if RF_plots:
        f, axes = plt.subplots(1, 2, sharey=True, sharex=True)
        axes[0].scatter(y_train, fit)
        xmin = round(min(y_train.min(), y_test.min()), 1)
        xmax = round(max(y_train.max(), y_test.max()), 1)
        ymin = round(min(fit.min(), pred.min()), 1)
        ymax = round(max(fit.max(), pred.max()), 1)
        diagmin = min(xmin, ymin)
        diagmax = max(xmax, ymax)
        axes[0].plot([diagmin, diagmax], [diagmin, diagmax])
        #reg.score(y_train,fit)
        axes[1].scatter(y_test, pred)
        axes[1].plot([diagmin, diagmax], [diagmin, diagmax])
        #v = [X.columns[ranking==i] for i in range(1,11)][-5][0]
        #v = 'spikecount'
        #axes[0].set_title(v)
        #axes[0].scatter(X_train[v],fit)
        #axes[0].scatter(X_train[v],y_train)
        #axes[1].scatter(X_test[v],pred)
        #axes[1].scatter(X_test[v],y_test)
    print('oob regression scores, train:', reg.score(X_train, y_train), 'test:', reg.score(X_test, y_test))
    print('oob score', reg.oob_score_)


def runClusterAnalysis(param_values, labels, num_features, class_labels, epoch):
    from sklearn import model_selection
    from sklearn.ensemble import RandomForestClassifier
    import operator
    ############ data is ready for the cluster analysis ##################
    #select a random subset of data for training, and use the other part for testing
    #sklearn.model_selection.train_test_split(*arrays, **options)
    #returns the top max_feat number of features and their weights
    df_values_train, df_values_test, df_labels_train, df_labels_test = model_selection.train_test_split(param_values, labels, test_size=0.25)
    train_test = {'train': (df_values_train, df_labels_train), 'test': (df_values_test, df_labels_test)}
    #number of estimators (n_estim) is the number of trees in the forest
    #This is NOT the number of clusters to be found
    #max_feat is the number of features to use for classification
    #Empirical good default value is max_features=sqrt(num_features) for classification tasks
    max_feat = int(np.ceil(np.sqrt(num_features)))
    n_estim = 20
    rtc = RandomForestClassifier(n_estimators=n_estim, max_features=max_feat)
    #This line actually builds the random forest (does the training)
    rtc.fit(df_values_train, df_labels_train)
    ###### EVALUATE THE RESULT
    #calculate a score, show the confusion matrix
    predict_dict = {}
    for nm, (df, labl) in train_test.items():
        predict = rtc.predict(df)
        predict_dict[nm] = predict
    #evaluate the importance of each feature in the classifier
    #The relative rank (i.e. depth) of a feature used as a decision node in a tree can be used to assess
    #the relative importance of that feature with respect to the predictability of the target variable.
    feature_order = sorted({feature: importance for feature, importance in zip(list(df_values_train.columns), list(rtc.feature_importances_))}.items(), key=operator.itemgetter(1), reverse=True)
    return feature_order[0:max_feat], predict_dict, train_test


def cluster_analysis(y, X, adjX, numbins, weight_change_event_df, all_cor, epochs, num_events, cs, max_feat=2):
    import operator
    from sklearn.metrics import confusion_matrix
    from plas_sim_plots import plotPredictions
    classes = np.unique(y.values).astype('int')
    set_of_Xbins, feat_lbl = create_set_of_Xarrays(X, adjX, numbins, weight_change_event_df, all_cor, num_events)
    collectionBestFeatures = {Xinputs: {} for Xinputs in set_of_Xbins.keys()}
    collectionTopFeatures = {Xinputs: {} for Xinputs in set_of_Xbins.keys()}
    confuse_mat = {Xinputs: [] for Xinputs in set_of_Xbins.keys()}
    for Xinputs, Xarray in set_of_Xbins.items():
        if Xinputs == '_corr' or Xinputs == '_dist':
            feature_labels = feat_lbl + [Xinputs.strip('_')]
        elif Xinputs == '_corr_dist':
            feature_labels = feat_lbl + ['corr', 'dist']
        elif Xinputs == '_adj':
            feature_labels = feat_lbl + [str(lbl) + 'adj' for lbl in feat_lbl if lbl != 'start_wt']
        elif Xinputs == '_clust':
            feature_labels = feat_lbl + ['clust']  #label for the cluster_length column appended in create_set_of_Xarrays
        else:
            feature_labels = feat_lbl
        print(Xinputs, feature_labels, np.shape(Xarray))
        X_df = pd.DataFrame(Xarray, columns=feature_labels)
        num_feat = np.shape(Xarray)[1]
        for epoch in range(0, epochs):
            features, predict_dict, train_test = runClusterAnalysis(X_df, y.astype('int'), num_feat, classes, epoch)
            #print(Xinputs,epoch,'test classes', np.unique(train_test['test'][1]), 'predict classes', np.unique(predict_dict['test']),
            #      '\n',confusion_matrix(train_test['test'][1],predict_dict['test']))
            confuse_mat[Xinputs].append({'true': np.unique(train_test['test'][1]), 'pred': np.unique(predict_dict['test']), 'mat': confusion_matrix(train_test['test'][1], predict_dict['test'])})
            if epoch < 2 and max_feat > 0:  #plot not working when max_feat=4
                plotPredictions(max_feat, train_test, predict_dict, classes, features, 'E' + str(epoch) + '_bins' + str(numbins) + Xinputs, cs)
            for i, (feat, weight) in enumerate(features):
                #print(str(numbins),Xinputs,'feat=',feat,collectionBestFeatures[str(numbins)][Xinputs])
                #print(i,feat,weight) #monitor progress
                if feat not in collectionBestFeatures[Xinputs]:  # How is the weight scaled? caution
                    collectionBestFeatures[Xinputs][feat] = weight
                else:
                    collectionBestFeatures[Xinputs][feat] += weight
            #print(collectionBestFeatures[str(numbins)][Xinputs])
            f, w = features[0]
            if f not in collectionTopFeatures[Xinputs]:
                collectionTopFeatures[Xinputs][f] = 1
            else:
                collectionTopFeatures[Xinputs][f] += 1
        listBestFeatures = sorted(collectionBestFeatures[Xinputs].items(), key=operator.itemgetter(1), reverse=True)
        collectionBestFeatures[Xinputs] = listBestFeatures
        listTopFeatures = sorted(collectionTopFeatures[Xinputs].items(), key=operator.itemgetter(1), reverse=True)
        collectionTopFeatures[Xinputs] = listTopFeatures
    return collectionBestFeatures, collectionTopFeatures, confuse_mat


'''
del dframelist,weightchangelist

def calcium(files,downsamp):
    all_ca_array=[]
    ca_index=[]
    for i,(tr,f) in enumerate(files.items()):
        missing=0
        d = np.load(f,mmap_mode='r')
        plas_names=[nm for nm in d.dtype.names if 'plas' in nm] #plas names may differ for each file if using different seeds
        extern_names=[nm for nm in plas_names if 'extern1' in nm]
        print(i, 'analyzing calcium for',tr)
        for j, syn in enumerate(extern_names):
            shellnum=syn.split('_3-')[0].split('_to_')[-1]
            spnum=syn.split('-sp')[-1][0]
            shell='/data/D1_sp'+spnum+shellnum+'_3Shell_0'
            if shell in d.dtype.names:
                ca_index.append((tr,shell))
                #jj=i*len(ca_names)+j
                all_ca_array.append(d[shell][::downsamp])
            else:
                missing+=1
                print(missing,shell,' NOT IN FILE',tr)
    return ca_index, np.array(all_ca_array)
'''
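# -----------------------------------------------------------------------------
# Minimal usage sketch (not part of the original module): shows how the main
# entry points above might be wired together.  The file names, params values,
# and simulation time below are placeholders/assumptions, not values from the
# original analysis.
# -----------------------------------------------------------------------------
if __name__ == '__main__':
    example_files = {'seed_0': 'plas_sim_seed_0.npz'}  #hypothetical trial name -> simulation output file
    example_params = {'neighbors': 2, 'bins_per_sec': 10, 'samp_rate': 10000,
                      'length_of_firing_rate_vector': 100, 'ca_downsamp': 100, 'ITI': 2.0}  #assumed values
    example_simtime = 20.0  #seconds; assumed
    alldf, wce_df, inst_rate_array, trains, train_index, ca_array, ca_index, t1wt = \
        weight_change_events_spikes_calcium(example_files, example_simtime, example_params, warnings=5)
    #distance file: csv indexed by spine name with a 'distance' column (placeholder path)
    wce_df, alldf = add_spine_soma_dist(wce_df, alldf, 'spine_soma_dist.csv')
    #the random-forest fits expect a per-event feature matrix (built elsewhere), e.g.:
    #reg_score, feature_import, linreg_score = random_forest_variations(
    #    newX, adjacentX, wce_df.weightchange, wce_df, all_cor, [10], len(wce_df), '', RF_plots=False)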