Source code for xb.simulating

import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns
import matplotlib.pyplot as plt
import gzip
import shutil
import os.path
from scipy.io import mmread
import tifffile as tf
from scipy.spatial import ConvexHull, convex_hull_plot_2d
from anndata import AnnData
import json
from sklearn.metrics import mutual_info_score
from sklearn.metrics import fowlkes_mallows_score

[docs]def missegmentation_simulation(adata_sc_sub,missegmentation_percentage=0.1): """ Simulate missegmentation using a reference single cell data in adata form. Args: adata_sc_sub (AnnData): AnnData object with the cells of the experiment before simulating the missegmentation missegmentation_percentage (float): percentage of cells (%) that are presenting missegmentation results: adata_sc_sub(AnnData): AnnData object with the cells where missegmentation has been simulated according to input parameters """ exp=adata_sc_sub.to_df() if missegmentation_percentage>0: cells_affected=int(exp.shape[0]*(missegmentation_percentage/100)) for num in tqdm(range(0,cells_affected)): missegmentation_importance=random.sample(range(0,10),1)[0]*0.1 source=random.sample(list(exp.index),1)[0] target=random.sample(list(exp.index),1)[0] exp.loc[target,:]=exp.loc[target,:]+(exp.loc[source,:]*missegmentation_importance) adata_sc_sub.X=np.array(exp.astype(int)) return adata_sc_sub
[docs]def noise_adder(adata_sc,percentage_of_noise=0.1): """ Add noise to a single cell data inputed according to input parameters Args: adata_sc (AnnData): AnnData object with the cells of the experiment before adding noise percentage_of_noise (float): percentage of noise events (%) in relation to the total amounts of cells results: adata_sc(AnnData): AnnData object with the cells where noise has been added """ noise_events=int(np.sum(adata_sc.X)*(percentage_of_noise/100)) for i in range(0,noise_events): x=random.sample(range(0,adata_sc.X.shape[0]),1) y=random.sample(range(0,adata_sc.X.shape[1]),1) change=random.sample([-1,1],1) adata_sc.X[x,y]=adata_sc.X[x,y]+change adata_sc.X[adata_sc.X<-1]=0 return adata_sc
[docs]def subset_of_single_cell(adata_sc_sub,markers,random_markers_percentage=0, reads_x_cell=None,number_of_markers=200, n_reads_x_gene=40,percentage_of_noise=0.1,ms_percentage=0.1): """ Transform a single cell data to present spatial characteristics Args: adata_sc_sub (AnnData): AnnData object with the cells obtained from single cell datasets before transforming them into spatial-like datasets markers (DataFrame): dataframe incluing the main markers identified per cluster per cluster random_markers_percentage (float): percentage of non-marker genes included randomly in the genes selected for the panel reads_x_cell=None n_reads_x_gene (int,None): if int, final number of reads/cells required in the spatial-like datasets. If None, cells are not transformed number_of_markers (int): total number of genes to be included in the simulated dataset. n_reads_x_gene (int): final number of reads/gene required in the spatial-like datasets percentage_of_noise (float): percentage of noise events (%) in relation to the total amounts of cells ms_percentage (float): percentage of cells (%) that are presenting missegmentation results: adata_sc(AnnData): AnnData object with the cells after transfroming them into spatial-like datasets """ mk=[] number_of_markers_x_cluster=int(np.ceil(number_of_markers/markers.shape[1])+1) for ind in markers.index[0:number_of_markers_x_cluster]: for col in markers.columns: mk.append(markers.loc[ind,col]) mk=np.unique(mk) if len(mk)>number_of_markers: mk=random.sample(mk,number_of_markers) sampling_amount=(int(len(np.unique(mk))*(random_markers_percentage/100))) newg=random.sample(list(adata_sc_sub.var.index),sampling_amount) sel=random.sample(range(0,len(mk)),len(newg)) ii=0 for el in sel: mk[el]=newg[ii] ii=ii+1 mk=list(mk) adata_sc=adata_sc_sub[:,adata_sc_sub.var.index.astype(int).isin(mk)].copy() ###THIS STEP MAKES MORE EXPRESSED GENES TO BE LESS DETECTED norm_factors=np.percentile(adata_sc.X,99.9,axis=0)+1 #number of reads per gene (aprox) ##WE HAVE COMENTED THIS# adata_sc.X=np.array(adata_sc.to_df().div(np.array(norm_factors)/n_reads_x_gene,axis=1).astype(int)) # segmentation_simulator adata_sc=missegmentation_simulation(adata_sc,missegmentation_percentage=ms_percentage) ###ADD RANDOM NOISE adata_sc=noise_adder(adata_sc,percentage_of_noise=percentage_of_noise) if reads_x_cell is not None: sc.pp.downsample_counts(adata_sc,counts_per_cell=reads_x_cell) print(adata_sc.shape) return adata_sc
[docs]def entropy(clustering): """ Compute entropy Args: clustering (list): list of clusters assigned to cells. results: entropy_value(float): entropy value computed. """ _, counts = np.unique(clustering, return_counts=True) proportions = counts / len(clustering) entropy_value=-np.sum(proportions * np.log(proportions)) return entropy_value
[docs]def compute_vi(ground_truth, predicted): """ Compute variation of information for comparing two different clusterings Args: ground_truth (list): list of reference clusters given to cells profiled. predicted (list): list of predicted/computed clusters for cells profiled. results: vi_score(float): variation of information. """ mi = mutual_info_score(ground_truth, predicted) h_gt = entropy(ground_truth) h_pred = entropy(predicted) vi_score = h_gt + h_pred - 2 * mi return vi_score
[docs]def compute_fmi(ground_truth, predicted): """ Compute fowlkes mallows index for two different clusterings Args: ground_truth (list): list of reference clusters given to cells profiled. predicted (list): list of predicted/computed clusters for cells profiled. results: fmi_score(float): fowlkes mallows index """ fmi_score = fowlkes_mallows_score(ground_truth, predicted) return fmi_score
[docs]def keep_nuclei_and_quality(adata1,overlaps_nucleus=1,qvmin=20): """ Redefine cell expression based on nuclei expression an quality of detected reads Args: adata1 (AnnData): AnnData object with the cells of the experiment before filtereing reads based on quality or nuclear/non-nuclear. overlaps_nucleus(int): Keep reads overlapping nucleus only (1) or all (2). qvmin(int): Define minimum quality (qv) of reads to keep in the analysis. results: adata1nuc(AnnData): AnnData object with the cells redefined based to input parameters. """ if overlaps_nucleus==1: subset1=adata1.uns['spots'].loc[adata1.uns['spots']['overlaps_nucleus']==overlaps_nucleus,:] if overlaps_nucleus==0: subset1=adata1.uns['spots'] subset1=subset1[subset1['qv']>qvmin] ct1=pd.crosstab(subset1['cell_id'],subset1['feature_name']) adataobs=adata1.obs.loc[adata1.obs['cell_id'].isin(ct1.index),:] av=adata1.var.index[adata1.var.index.isin(ct1.columns)]#.isin(adata1.var.index) ct1=ct1.loc[:,av] adataobs.index=adataobs['cell_id'] adataobs.index.name='ind' ct1=ct1.loc[ct1.index.isin(adataobs['cell_id']),:] adata1nuc=sc.AnnData(np.array(ct1),obs=adataobs)#,var=adata1.var) return adata1nuc
# using a baseline, modify parameters one by one # making it into a function
[docs]def allcombs(adata): """ Simulate preprocessing workflows and extract results based on it Args: adata (AnnData): AnnData object with the cells of the experiment. results: allres(DataFrame): Clustering obtained with different preprocessing workflows. """ default={'nuc':1,'npcs':0,'neighs':16,'qvs':0,'scale':True,'hvgs':False,'ts':100,'norm':True,'lg':True} try: del allres except: print('first run') nucdic={1:'nuc',0:'cell'} neighs=[16,12,6,20] npcs=[0,15,25] ts=[100,10,1000,None] qvs=[0,20,30] scales=[True,False] hvgs=[False,True] norm=[True,False] logs=[True,False] perc=0.1 for qv in qvs: for nuc in [1,0]: if nuc==1: adata_f1=keep_nuclei_and_quality(adata.copy(),overlaps_nucleus=1,qvmin=qv) adata_f1=adata_f1.copy() if nuc==0: adata_f1=keep_nuclei_and_quality(adata.copy(),overlaps_nucleus=0,qvmin=qv) adata_f1.uns['spots']=[] for target_sum in ts: for neigh in neighs: for nm in norm: for npc in npcs: for scale in scales: for lgo in logs: for hvg in hvgs: nv=neigh!=default['neighs'] nv=nv+(nuc!=default['nuc']) nv=nv+(qv!=default['qvs']) nv=nv+(npc!=default['npcs']) nv=nv+(scale!=default['scale']) nv=nv+(hvg!=default['hvgs']) nv=nv+(target_sum!=default['ts']) nv=nv+(nm!=default['norm']) nv=nv+(lgo!=default['lg']) #print(nv) if nv==0: adata_f2=adata_f1.copy() print('DEFAULT') result,ttc=main_preprocessing(adata_f2,neigh=neigh,npc=npc,target_sum=target_sum,scale=scale,hvg=hvg,default=True,norm=nm,lg=lgo) sumresult=result.obs.loc[:,['leiden_1_4','louvain_1_4']] sumresult.columns=['DEFAULT_leid','DEFAULT_louv'] try: allres=pd.merge(allres,sumresult, left_index=True, right_index=True) except: allres=sumresult if nv==1: adata_f2=adata_f1.copy() print(nucdic[nuc]+'_qv'+str(qv)+'_neighs'+str(neigh)+'_npc'+str(npc)+'_ts'+str(target_sum)+'_hvg_'+str(hvg)+'_scale_'+str(scale)) result=main_preprocessing(adata_f2,neigh=neigh,npc=npc,target_sum=target_sum,scale=scale,hvg=hvg,total_clusters=ttc,norm=nm,lg=lgo) sumresult=result.obs.loc[:,['leiden_1_4','louvain_1_4']] sumresult.columns=[nucdic[nuc]+'_qv'+str(qv)+'_ng'+str(neigh)+'_pc'+str(npc)+'_ts'+str(target_sum)+'_hvg_'+str(hvg)+'_scale_'+str(scale)+'_norm_'+str(nm)+'_log_'+str(lgo)+'_leid',nucdic[nuc]+'_qv'+str(qv)+'_ng'+str(neigh)+'_pc'+str(npc)+'_ts'+str(target_sum)+'_hvg_'+str(hvg)+'_scale_'+str(scale)+'_norm_'+str(nm)+'_log_'+str(lgo)+'_louv'] try: allres=pd.merge(allres,sumresult, left_index=True, right_index=True) except: allres=sumresult return allres
[docs]def main_preprocessing(adata,target_sum=100,mincounts=10,mingenes=3,neigh=15,npc=0,nuc=1,scale=False,hvg=False,default=False,total_clusters=30,default_resol=1.6,logstatus=True,normstatus=True): """ preprocess and cluster cells in an Anndata object given some input parameters Args: adata(AnnData): AnnData object with the cells of the experiment before simulating the missegmentation. target_sum(int or None): Target sum to use if the normalization is done based on library size. None is used for automatic calculation of library size. mincounts (int): Minimum amount of counts detected in a cell to pass the quality filters. mingenes (int): Minimum amount of genes expressed in a cell to pass the quality filters. neigh(int): number of neighbors to used when calculating the nearest neighbors by sc.pp.neighbors(). npc(int): number of principal components to used when calculating the nearest neighbors by sc.pp.neighbors(). nuc(int): wether to use only nuclear reads (1) or all reads (0). scale(boolean): whether to scale the data or not. hvg(boolean): whether to select highly variable genes for further processing or not. default(boolean): whether the run is the original one or not. total_clusters (int): number of clusters to obtain in the process of clustering (+-2). default_resol(float): clustering resolution to use as a default when clustering. logstatus(boolean): Whether to log-transforms cells. normstatus(boolean): Whether to normalize based cells or not. results: adata(AnnData): AnnData object after preprocessing and clustering. """ adata.layers['raw']=adata.X.copy() sc.pp.filter_cells(adata,min_counts=mincounts) sc.pp.filter_cells(adata,min_genes=mingenes) adata.raw=adata adata.layers['raw']=adata.X.copy() if normstatus=='pearson': sc.experimental.pp.normalize_pearson_residuals(adata) else: if normstatus==True: sc.pp.normalize_total(adata, target_sum=target_sum) if logstatus==True: sc.pp.log1p(adata) if hvg==True: sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=12, min_disp=0.25) adata = adata[:, adata.var.highly_variable] if scale==True: sc.pp.scale(adata) #remove nans, inf and -ind adata.X=np.array(adata.to_df().fillna(0)) adata.X[adata.X==-np.inf]=0 adata.X[adata.X==np.inf]=0 # adata.X.fillna(0, inplace=True) sc.pp.pca(adata) print(adata.shape) sc.pp.neighbors(adata, n_neighbors=neigh, n_pcs=npc) # sc.tl.leiden(adata,resolution=2.2,key_added='leiden_2_2') resol=default_resol sc.tl.leiden(adata,resolution=resol,key_added='leiden_1_4') numclust=int(np.max(adata.obs['leiden_1_4'].astype(int))) targetnum=total_clusters if default==True: targetnum=numclust if abs(numclust-targetnum)>3: i=0 while abs(int(numclust)-int(targetnum))>3: # print('iter '+str(i)) if numclust>targetnum: resol=resol-0.05 if numclust<targetnum: resol=resol+0.05 sc.tl.leiden(adata,resolution=resol,key_added='leiden_1_4') numclust=np.max(adata.obs['leiden_1_4'].astype(int)) i=i+1 if i >100: break resol=default_resol sc.tl.louvain(adata,resolution=resol,key_added='louvain_1_4') numclust=np.max(adata.obs['louvain_1_4'].astype(int)) if abs(numclust-targetnum)>3: i=0 while abs(int(numclust)-int(targetnum))>3: if numclust>targetnum: resol=resol-0.05 if numclust<targetnum: resol=resol+0.05 sc.tl.leiden(adata,resolution=resol,key_added='louvain_1_4') numclust=np.max(adata.obs['louvain_1_4'].astype(int)) i=i+1 if i >100: break if default==True: return adata,targetnum else: return adata
[docs]def allcombs_simulated(adata,default_key='class'): """ Simulate preprocessing workflows and extract results based on it for simulated data Args: adata (AnnData): AnnData object with the cells of the experiment. default_key(str): name of the column in adata.obs where the reference cell types/clusters are stored. results: allres(DataFrame): Clustering obtained with different preprocessing workflows. """ default={'nuc':1,'npcs':0,'neighs':12,'qvs':0,'scale':False,'hvgs':False,'ts':1000000}# default is never done try: del allres except: print('first run') tot=0 total_combs=3000#3000 neighs=[12,16]#12 npcs=[0,20,40] ts=[10,100,1000,None]#1000 qvs=[0] scales=[False,True] hvgs=[False,True]#True log=[True,False]#100,80 norm=[True,False,'pearson']#, perc=0.1 df_resol=0.4 nuc=1 for lg in log: for target_sum in ts: for neigh in neighs: for npc in npcs: for scale in scales: for hvg in hvgs: for nm in norm: if tot>total_combs: print('Broken') break tot=tot+1 print(tot) try: if nm=='pearson': print(str(lg)+str(scale)+str(hvg)+str(ts)) if lg==True and scale==True and hvg==True and target_sum==None: ttc=len(np.unique(adata.obs[default_key])) adata_f1=adata.copy() adata_f2=adata_f1.copy() result=main_preprocessing(adata_f2,neigh=neigh,npc=npc,target_sum=target_sum,scale=scale,hvg=hvg,total_clusters=ttc,default_resol=df_resol,logstatus=lg,normstatus=nm) sumresult=result.obs.loc[:,['leiden_1_4','louvain_1_4']] sumresult.columns=['norm'+str(nm)+'_lg'+str(lg)+'_ng'+str(neigh)+'_pc'+str(npc)+'_ts'+str(target_sum)+'_hvg_'+str(hvg)+'_scale_'+str(scale)+'_leid','norm'+str(nm)+'_lg'+str(lg)+'_ng'+str(neigh)+'_pc'+str(npc)+'_ts'+str(target_sum)+'_hvg_'+str(hvg)+'_scale_'+str(scale)+'_louvain'] try: allres=pd.merge(allres,sumresult, left_index=True, right_index=True) except: allres=sumresult else: ttc=len(np.unique(adata.obs[default_key])) adata_f1=adata.copy() adata_f2=adata_f1.copy() result=main_preprocessing(adata_f2,neigh=neigh,npc=npc,target_sum=target_sum,scale=scale,hvg=hvg,total_clusters=ttc,default_resol=df_resol,logstatus=lg,normstatus=nm) sumresult=result.obs.loc[:,['leiden_1_4','louvain_1_4']] sumresult.columns=['norm'+str(nm)+'_lg'+str(lg)+'_ng'+str(neigh)+'_pc'+str(npc)+'_ts'+str(target_sum)+'_hvg_'+str(hvg)+'_scale_'+str(scale)+'_leid','norm'+str(nm)+'_lg'+str(lg)+'_ng'+str(neigh)+'_pc'+str(npc)+'_ts'+str(target_sum)+'_hvg_'+str(hvg)+'_scale_'+str(scale)+'_louvain'] try: allres=pd.merge(allres,sumresult, left_index=True, right_index=True) except: allres=sumresult except: print('norm'+str(nm)+'_lg'+str(lg)+'_ng'+str(neigh)+'_pc'+str(npc)+'_ts'+str(target_sum)+'_hvg_'+str(hvg)+'_scale_'+str(scale)+'_leid','lg'+str(lg)+'_ng'+str(neigh)+'_pc'+str(npc)+'_ts'+str(target_sum)+'_hvg_'+str(hvg)+'_scale_'+str(scale)) allres['RESULTS']=adata.obs[default_key] return allres