Source code for xb.simulating

import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns
import matplotlib.pyplot as plt
import gzip
import shutil
import os.path
from scipy.io import mmread
import tifffile as tf
from scipy.spatial import ConvexHull, convex_hull_plot_2d
from anndata import AnnData
import json
from sklearn.metrics import mutual_info_score
from sklearn.metrics import fowlkes_mallows_score

[docs]def missegmentation_simulation(adata_sc_sub,missegmentation_percentage=0.1):
    """ Simulate missegmentation using a reference single cell data in adata form. 

    Args:
        adata_sc_sub (AnnData): AnnData object with the cells of the experiment before simulating the missegmentation   
        missegmentation_percentage (float): percentage of cells (%) that are presenting missegmentation

    results:
        adata_sc_sub(AnnData): AnnData object with the cells where missegmentation has been simulated according to input parameters
    """

    exp=adata_sc_sub.to_df()
    if missegmentation_percentage>0:
        cells_affected=int(exp.shape[0]*(missegmentation_percentage/100))
        for num in tqdm(range(0,cells_affected)):
            missegmentation_importance=random.sample(range(0,10),1)[0]*0.1
            source=random.sample(list(exp.index),1)[0]
            target=random.sample(list(exp.index),1)[0]
            exp.loc[target,:]=exp.loc[target,:]+(exp.loc[source,:]*missegmentation_importance)
        adata_sc_sub.X=np.array(exp.astype(int))
    return adata_sc_sub

[docs]def noise_adder(adata_sc,percentage_of_noise=0.1):
    """ Add noise to a single cell data inputed according to input parameters 

    Args:
        adata_sc (AnnData): AnnData object with the cells of the experiment before adding noise
        percentage_of_noise (float): percentage of noise events (%) in relation to the total amounts of cells

    results:
        adata_sc(AnnData): AnnData object with the cells where noise has been added
    """

    noise_events=int(np.sum(adata_sc.X)*(percentage_of_noise/100))
    for i in range(0,noise_events):
        x=random.sample(range(0,adata_sc.X.shape[0]),1)
        y=random.sample(range(0,adata_sc.X.shape[1]),1)
        change=random.sample([-1,1],1)
        adata_sc.X[x,y]=adata_sc.X[x,y]+change
    adata_sc.X[adata_sc.X<-1]=0
    return adata_sc

[docs]def subset_of_single_cell(adata_sc_sub,markers,random_markers_percentage=0,
                          reads_x_cell=None,number_of_markers=200,
                         n_reads_x_gene=40,percentage_of_noise=0.1,ms_percentage=0.1):
    """ Transform a single cell data to present spatial characteristics

    Args:
        adata_sc_sub (AnnData): AnnData object with the cells obtained from single cell datasets before transforming them into spatial-like datasets  
        markers (DataFrame): dataframe incluing the main markers identified per cluster per cluster
        random_markers_percentage (float): percentage of non-marker genes included randomly in the genes selected for the panel
        reads_x_cell=None
        n_reads_x_gene (int,None): if int, final number of reads/cells required in the spatial-like datasets. If None, cells are not transformed
        number_of_markers (int): total number of genes to be included in the simulated dataset.
        n_reads_x_gene (int): final number of reads/gene required in the spatial-like datasets
        percentage_of_noise (float): percentage of noise events (%) in relation to the total amounts of cells
        ms_percentage (float): percentage of cells (%) that are presenting missegmentation

    results:
        adata_sc(AnnData): AnnData object with the cells after transfroming them into spatial-like datasets
    """
                             
    mk=[]
    number_of_markers_x_cluster=int(np.ceil(number_of_markers/markers.shape[1])+1)
    for ind in  markers.index[0:number_of_markers_x_cluster]:
        for col in markers.columns:
            mk.append(markers.loc[ind,col])
    mk=np.unique(mk)
    if len(mk)>number_of_markers:
        mk=random.sample(mk,number_of_markers)
    
    sampling_amount=(int(len(np.unique(mk))*(random_markers_percentage/100)))
    newg=random.sample(list(adata_sc_sub.var.index),sampling_amount)
    sel=random.sample(range(0,len(mk)),len(newg))
    ii=0
    for el in sel:
        mk[el]=newg[ii]
        ii=ii+1
    mk=list(mk)
    adata_sc=adata_sc_sub[:,adata_sc_sub.var.index.astype(int).isin(mk)].copy()
     ###THIS STEP MAKES MORE EXPRESSED GENES TO BE LESS DETECTED 
    norm_factors=np.percentile(adata_sc.X,99.9,axis=0)+1
    
    #number of reads per gene (aprox)
   ##WE HAVE COMENTED THIS# adata_sc.X=np.array(adata_sc.to_df().div(np.array(norm_factors)/n_reads_x_gene,axis=1).astype(int))
    # segmentation_simulator
    adata_sc=missegmentation_simulation(adata_sc,missegmentation_percentage=ms_percentage)
    ###ADD RANDOM NOISE
    adata_sc=noise_adder(adata_sc,percentage_of_noise=percentage_of_noise)
    if reads_x_cell is not None:
        sc.pp.downsample_counts(adata_sc,counts_per_cell=reads_x_cell)
    print(adata_sc.shape)
    return adata_sc

[docs]def entropy(clustering):
    """ Compute entropy

    Args:
        clustering (list): list of clusters assigned to cells.

    results:
        entropy_value(float): entropy value computed.
    """
    
    _, counts = np.unique(clustering, return_counts=True)
    proportions = counts / len(clustering)
    entropy_value=-np.sum(proportions * np.log(proportions))
    return entropy_value

[docs]def compute_vi(ground_truth, predicted):
    """ Compute variation of information for comparing two different clusterings

    Args:
        ground_truth (list): list of reference clusters given to cells profiled.

        predicted (list): list of predicted/computed clusters for cells profiled.

    results:
        vi_score(float): variation of information.
    """    
    
    mi = mutual_info_score(ground_truth, predicted)
    h_gt = entropy(ground_truth)
    h_pred = entropy(predicted)
    vi_score = h_gt + h_pred - 2 * mi
    return vi_score



[docs]def compute_fmi(ground_truth, predicted):
    """ Compute fowlkes mallows index for two different clusterings

    Args:
        ground_truth (list): list of reference clusters given to cells profiled.

        predicted (list): list of predicted/computed clusters for cells profiled.

    results:
        fmi_score(float): fowlkes mallows index
    """ 

    fmi_score = fowlkes_mallows_score(ground_truth, predicted)
    return fmi_score



[docs]def keep_nuclei_and_quality(adata1,overlaps_nucleus=1,qvmin=20):
    """ Redefine cell expression based on nuclei expression an quality of detected reads

    Args:
        adata1 (AnnData): AnnData object with the cells of the experiment before filtereing reads based on quality or nuclear/non-nuclear.

        overlaps_nucleus(int): Keep reads overlapping nucleus only (1) or all (2).

        qvmin(int): Define minimum quality (qv) of reads to keep in the analysis. 

    results:
        adata1nuc(AnnData): AnnData object with the cells redefined based to input parameters.
    """

    if overlaps_nucleus==1:
        subset1=adata1.uns['spots'].loc[adata1.uns['spots']['overlaps_nucleus']==overlaps_nucleus,:]
    if overlaps_nucleus==0:
        subset1=adata1.uns['spots']
    subset1=subset1[subset1['qv']>qvmin]
    ct1=pd.crosstab(subset1['cell_id'],subset1['feature_name'])
    adataobs=adata1.obs.loc[adata1.obs['cell_id'].isin(ct1.index),:]
    av=adata1.var.index[adata1.var.index.isin(ct1.columns)]#.isin(adata1.var.index)
    ct1=ct1.loc[:,av]
    adataobs.index=adataobs['cell_id']
    adataobs.index.name='ind'
    ct1=ct1.loc[ct1.index.isin(adataobs['cell_id']),:]
    adata1nuc=sc.AnnData(np.array(ct1),obs=adataobs)#,var=adata1.var)
    return adata1nuc

# using a baseline, modify parameters one by one
# making it into a function
[docs]def allcombs(adata):
    """ Simulate preprocessing workflows and extract results based on it

    Args:
        adata (AnnData): AnnData object with the cells of the experiment.

    results:
        allres(DataFrame): Clustering obtained with different preprocessing workflows.
    """

    default={'nuc':1,'npcs':0,'neighs':16,'qvs':0,'scale':True,'hvgs':False,'ts':100,'norm':True,'lg':True} 
    try:
        del allres
    except:
        print('first run')
    nucdic={1:'nuc',0:'cell'}
    neighs=[16,12,6,20]
    npcs=[0,15,25]
    ts=[100,10,1000,None]
    qvs=[0,20,30]
    scales=[True,False]
    hvgs=[False,True]
    norm=[True,False]
    logs=[True,False]
    perc=0.1
    for qv in qvs:
        for nuc in [1,0]:
            if nuc==1:
                adata_f1=keep_nuclei_and_quality(adata.copy(),overlaps_nucleus=1,qvmin=qv)
                adata_f1=adata_f1.copy()
            if nuc==0:
                adata_f1=keep_nuclei_and_quality(adata.copy(),overlaps_nucleus=0,qvmin=qv)
                adata_f1.uns['spots']=[]
            for target_sum in ts:
                for neigh in neighs:
                    for nm in norm:
                        for npc in npcs:
                            for scale in scales:
                                for lgo in logs:
                                    for hvg in hvgs:
                                        nv=neigh!=default['neighs']
                                        nv=nv+(nuc!=default['nuc'])
                                        nv=nv+(qv!=default['qvs'])
                                        nv=nv+(npc!=default['npcs'])
                                        nv=nv+(scale!=default['scale'])
                                        nv=nv+(hvg!=default['hvgs'])
                                        nv=nv+(target_sum!=default['ts'])
                                        nv=nv+(nm!=default['norm'])
                                        nv=nv+(lgo!=default['lg'])
                                        #print(nv)
                                        if nv==0:
                                            adata_f2=adata_f1.copy()
                                            print('DEFAULT')
                                            result,ttc=main_preprocessing(adata_f2,neigh=neigh,npc=npc,target_sum=target_sum,scale=scale,hvg=hvg,default=True,norm=nm,lg=lgo)
                                            sumresult=result.obs.loc[:,['leiden_1_4','louvain_1_4']]
                                            sumresult.columns=['DEFAULT_leid','DEFAULT_louv']
                                            try:
                                                allres=pd.merge(allres,sumresult, left_index=True, right_index=True)
                                            except:
                                                allres=sumresult
                                        if nv==1:
                                            adata_f2=adata_f1.copy()
                                            print(nucdic[nuc]+'_qv'+str(qv)+'_neighs'+str(neigh)+'_npc'+str(npc)+'_ts'+str(target_sum)+'_hvg_'+str(hvg)+'_scale_'+str(scale))
                                            result=main_preprocessing(adata_f2,neigh=neigh,npc=npc,target_sum=target_sum,scale=scale,hvg=hvg,total_clusters=ttc,norm=nm,lg=lgo)
                                            sumresult=result.obs.loc[:,['leiden_1_4','louvain_1_4']]
                                            sumresult.columns=[nucdic[nuc]+'_qv'+str(qv)+'_ng'+str(neigh)+'_pc'+str(npc)+'_ts'+str(target_sum)+'_hvg_'+str(hvg)+'_scale_'+str(scale)+'_norm_'+str(nm)+'_log_'+str(lgo)+'_leid',nucdic[nuc]+'_qv'+str(qv)+'_ng'+str(neigh)+'_pc'+str(npc)+'_ts'+str(target_sum)+'_hvg_'+str(hvg)+'_scale_'+str(scale)+'_norm_'+str(nm)+'_log_'+str(lgo)+'_louv']
                                            try:
                                                allres=pd.merge(allres,sumresult, left_index=True, right_index=True)
                                            except:
                                                allres=sumresult
    return allres



[docs]def main_preprocessing(adata,target_sum=100,mincounts=10,mingenes=3,neigh=15,npc=0,nuc=1,scale=False,hvg=False,default=False,total_clusters=30,default_resol=1.6,logstatus=True,normstatus=True):
    """ preprocess and cluster cells in an Anndata object given some input parameters

    Args:
        adata(AnnData): AnnData object with the cells of the experiment before simulating the missegmentation.

        target_sum(int or None): Target sum to use if the normalization is done based on library size. None is used for automatic calculation of library size.

        mincounts (int): Minimum amount of counts detected in a cell to pass the quality filters.

        mingenes (int): Minimum amount of genes expressed in a cell to pass the quality filters.

        neigh(int): number of neighbors to used when calculating the nearest neighbors by sc.pp.neighbors().

        npc(int): number of principal components to used when calculating the nearest neighbors by sc.pp.neighbors().

        nuc(int): wether to use only nuclear reads (1) or all reads (0).

        scale(boolean): whether to scale the data or not.

        hvg(boolean): whether to select highly variable genes for further processing or not.

        default(boolean): whether the run is the original one or not.

        total_clusters (int): number of clusters to obtain in the process of clustering (+-2).

        default_resol(float): clustering resolution to use as a default when clustering.

        logstatus(boolean): Whether to log-transforms cells.

        normstatus(boolean): Whether to normalize based cells or not.

    results:
        adata(AnnData): AnnData object after preprocessing and clustering.
    """
     
    adata.layers['raw']=adata.X.copy()
    sc.pp.filter_cells(adata,min_counts=mincounts)
    sc.pp.filter_cells(adata,min_genes=mingenes)
    adata.raw=adata
    adata.layers['raw']=adata.X.copy()
    if normstatus=='pearson':
        sc.experimental.pp.normalize_pearson_residuals(adata)
    else:
        if normstatus==True:
            sc.pp.normalize_total(adata, target_sum=target_sum)
        if logstatus==True:
            sc.pp.log1p(adata)
        if hvg==True:
            sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=12, min_disp=0.25)
            adata = adata[:, adata.var.highly_variable]
        if scale==True:
            sc.pp.scale(adata)
    #remove nans, inf and -ind
    adata.X=np.array(adata.to_df().fillna(0))
    adata.X[adata.X==-np.inf]=0
    adata.X[adata.X==np.inf]=0
#    adata.X.fillna(0, inplace=True)
    sc.pp.pca(adata)
    print(adata.shape)
    sc.pp.neighbors(adata, n_neighbors=neigh, n_pcs=npc)
#    sc.tl.leiden(adata,resolution=2.2,key_added='leiden_2_2')
    resol=default_resol
    sc.tl.leiden(adata,resolution=resol,key_added='leiden_1_4')
    numclust=int(np.max(adata.obs['leiden_1_4'].astype(int)))
    targetnum=total_clusters
    if default==True:
        targetnum=numclust
    if abs(numclust-targetnum)>3:
        i=0
        while abs(int(numclust)-int(targetnum))>3:
       #     print('iter '+str(i))
            if numclust>targetnum:
                resol=resol-0.05
            if numclust<targetnum:  
                resol=resol+0.05
            sc.tl.leiden(adata,resolution=resol,key_added='leiden_1_4')
            numclust=np.max(adata.obs['leiden_1_4'].astype(int))
            i=i+1
            if i >100:
                break
    resol=default_resol
    sc.tl.louvain(adata,resolution=resol,key_added='louvain_1_4')
    numclust=np.max(adata.obs['louvain_1_4'].astype(int))
    if abs(numclust-targetnum)>3:
        i=0
        while abs(int(numclust)-int(targetnum))>3:
            if numclust>targetnum:
                resol=resol-0.05
            if numclust<targetnum:  
                resol=resol+0.05
            sc.tl.leiden(adata,resolution=resol,key_added='louvain_1_4')
            numclust=np.max(adata.obs['louvain_1_4'].astype(int))
            i=i+1
            if i >100:
                break
    if default==True:
        return adata,targetnum
    else:
        return adata

[docs]def allcombs_simulated(adata,default_key='class'):
    """ Simulate preprocessing workflows and extract results based on it for simulated data

    Args:
        adata (AnnData): AnnData object with the cells of the experiment.

        default_key(str): name of the column in adata.obs where the reference cell types/clusters are stored.
    
    results:
    allres(DataFrame): Clustering obtained with different preprocessing workflows.
    """
     
    default={'nuc':1,'npcs':0,'neighs':12,'qvs':0,'scale':False,'hvgs':False,'ts':1000000}# default is never done
    try:
        del allres
    except:
        print('first run')
    tot=0
    total_combs=3000#3000 
    neighs=[12,16]#12
    npcs=[0,20,40]
    ts=[10,100,1000,None]#1000
    qvs=[0]
    scales=[False,True]
    hvgs=[False,True]#True
    log=[True,False]#100,80
    norm=[True,False,'pearson']#,
    perc=0.1
    df_resol=0.4
    nuc=1
    for lg in log:
        for target_sum in ts:
            for neigh in neighs:
                for npc in npcs:
                    for scale in scales:
                        for hvg in hvgs:
                            for nm in norm:
                                if tot>total_combs:
                                    print('Broken')
                                    break
                                tot=tot+1
                                print(tot)
                                try:
                                    if nm=='pearson':
                                        print(str(lg)+str(scale)+str(hvg)+str(ts))
                                        if lg==True and scale==True and hvg==True and target_sum==None:
                                            ttc=len(np.unique(adata.obs[default_key]))
                                            adata_f1=adata.copy()
                                            adata_f2=adata_f1.copy()
                                            result=main_preprocessing(adata_f2,neigh=neigh,npc=npc,target_sum=target_sum,scale=scale,hvg=hvg,total_clusters=ttc,default_resol=df_resol,logstatus=lg,normstatus=nm)
                                            sumresult=result.obs.loc[:,['leiden_1_4','louvain_1_4']]
                                            sumresult.columns=['norm'+str(nm)+'_lg'+str(lg)+'_ng'+str(neigh)+'_pc'+str(npc)+'_ts'+str(target_sum)+'_hvg_'+str(hvg)+'_scale_'+str(scale)+'_leid','norm'+str(nm)+'_lg'+str(lg)+'_ng'+str(neigh)+'_pc'+str(npc)+'_ts'+str(target_sum)+'_hvg_'+str(hvg)+'_scale_'+str(scale)+'_louvain']
                                            try:
                                                allres=pd.merge(allres,sumresult, left_index=True, right_index=True)
                                            except:
                                                allres=sumresult
                                    else:
                                        ttc=len(np.unique(adata.obs[default_key]))
                                        adata_f1=adata.copy()
                                        adata_f2=adata_f1.copy()
                                        result=main_preprocessing(adata_f2,neigh=neigh,npc=npc,target_sum=target_sum,scale=scale,hvg=hvg,total_clusters=ttc,default_resol=df_resol,logstatus=lg,normstatus=nm)
                                        sumresult=result.obs.loc[:,['leiden_1_4','louvain_1_4']]
                                        sumresult.columns=['norm'+str(nm)+'_lg'+str(lg)+'_ng'+str(neigh)+'_pc'+str(npc)+'_ts'+str(target_sum)+'_hvg_'+str(hvg)+'_scale_'+str(scale)+'_leid','norm'+str(nm)+'_lg'+str(lg)+'_ng'+str(neigh)+'_pc'+str(npc)+'_ts'+str(target_sum)+'_hvg_'+str(hvg)+'_scale_'+str(scale)+'_louvain']
                                        try:
                                            allres=pd.merge(allres,sumresult, left_index=True, right_index=True)
                                        except:
                                            allres=sumresult
                                except:
                                    print('norm'+str(nm)+'_lg'+str(lg)+'_ng'+str(neigh)+'_pc'+str(npc)+'_ts'+str(target_sum)+'_hvg_'+str(hvg)+'_scale_'+str(scale)+'_leid','lg'+str(lg)+'_ng'+str(neigh)+'_pc'+str(npc)+'_ts'+str(target_sum)+'_hvg_'+str(hvg)+'_scale_'+str(scale))

    allres['RESULTS']=adata.obs[default_key]                            
    return allres