""" SpaGE [1]
@author: Tamim Abdelaal
This function integrates two single-cell datasets, spatial and scRNA-seq, and
enhance the spatial data by predicting the expression of the spatially
unmeasured genes from the scRNA-seq data.
The integration is performed using the domain adaption method PRECISE [2]
References
----------
[1] Abdelaal T., Mourragui S., Mahfouz A., Reiders M.J.T. (2020)
SpaGE: Spatial Gene Enhancement using scRNA-seq
[2] Mourragui S., Loog M., Reinders M.J.T., Wessels L.F.A. (2019)
PRECISE: A domain adaptation approach to transfer predictors of drug response
from pre-clinical models to tumors
"""
import numpy as np
import pandas as pd
import scipy.stats as st
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
[docs]def SpaGE(Spatial_data,RNA_data,n_pv,genes_to_predict=None):
"""
@author: Tamim Abdelaal
This function integrates two single-cell datasets, spatial and scRNA-seq,
and enhance the spatial data by predicting the expression of the spatially
unmeasured genes from the scRNA-seq data.
Args:
Spatial_data : Dataframe
Normalized Spatial data matrix (cells X genes).
RNA_data : Dataframe
Normalized scRNA-seq data matrix (cells X genes).
n_pv : int
Number of principal vectors to find from the independently computed
principal components, and used to align both datasets. This should
be <= number of shared genes between the two datasets.
genes_to_predict : str array
list of gene names missing from the spatial data, to be predicted
from the scRNA-seq data. Default is the set of different genes
(columns) between scRNA-seq and spatial data.
results:
Imp_Genes: Dataframe
Matrix containing the predicted gene expressions for the spatial
cells. Rows are equal to the number of spatial data rows (cells),
and columns are equal to genes_to_predict, .
"""
if genes_to_predict is SpaGE.__defaults__[0]:
genes_to_predict = np.setdiff1d(RNA_data.columns,Spatial_data.columns)
RNA_data_scaled = pd.DataFrame(data=st.zscore(RNA_data,axis=0),
index = RNA_data.index,columns=RNA_data.columns)
Spatial_data_scaled = pd.DataFrame(data=st.zscore(Spatial_data,axis=0),
index = Spatial_data.index,columns=Spatial_data.columns)
Common_data = RNA_data_scaled[np.intersect1d(Spatial_data_scaled.columns,RNA_data_scaled.columns)]
Y_train = RNA_data[genes_to_predict]
Imp_Genes = pd.DataFrame(np.zeros((Spatial_data.shape[0],len(genes_to_predict))),
columns=genes_to_predict)
pv_Spatial_RNA = PVComputation(
n_factors = n_pv,
n_pv = n_pv,
dim_reduction = 'pca',
dim_reduction_target = 'pca'
)
pv_Spatial_RNA.fit(Common_data,Spatial_data_scaled[Common_data.columns])
S = pv_Spatial_RNA.source_components_.T
Effective_n_pv = sum(np.diag(pv_Spatial_RNA.cosine_similarity_matrix_) > 0.3)
S = S[:,0:Effective_n_pv]
Common_data_projected = Common_data.dot(S)
Spatial_data_projected = Spatial_data_scaled[Common_data.columns].dot(S)
nbrs = NearestNeighbors(n_neighbors=50, algorithm='auto',
metric = 'cosine').fit(Common_data_projected)
distances, indices = nbrs.kneighbors(Spatial_data_projected)
for j in range(0,Spatial_data.shape[0]):
weights = 1-(distances[j,:][distances[j,:]<1])/(np.sum(distances[j,:][distances[j,:]<1]))
weights = weights/(len(weights)-1)
Imp_Genes.iloc[j,:] = np.dot(weights,Y_train.iloc[indices[j,:][distances[j,:] < 1]])
return Imp_Genes
""" Dimensionality Reduction
@author: Soufiane Mourragui
This module extracts the domain-specific factors from the high-dimensional omics
dataset. Several methods are here implemented and they can be directly
called from string name in main method method. All the methods
use scikit-learn implementation.
Notes
-------
-
References
----------
[1] Pedregosa, Fabian, et al. (2011) Scikit-learn: Machine learning in Python.
Journal of Machine Learning Research
"""
import numpy as np
from sklearn.decomposition import PCA, FastICA, FactorAnalysis, NMF, SparsePCA
from sklearn.cross_decomposition import PLSRegression
[docs]def process_dim_reduction(method='pca', n_dim=10):
""" Default linear dimensionality reduction method. For each method, return a
BaseEstimator instance corresponding to the method given as input.
Args:
method: str, default to 'pca'
Method used for dimensionality reduction.
Implemented: 'pca', 'ica', 'fa' (Factor Analysis),
'nmf' (Non-negative matrix factorisation), 'sparsepca' (Sparse PCA).
n_dim: int, default to 10
Number of domain-specific factors to compute.
results:
Classifier, i.e. BaseEstimator instance
"""
if method.lower() == 'pca':
clf = PCA(n_components=n_dim)
elif method.lower() == 'ica':
print('ICA')
clf = FastICA(n_components=n_dim)
elif method.lower() == 'fa':
clf = FactorAnalysis(n_components=n_dim)
elif method.lower() == 'nmf':
clf = NMF(n_components=n_dim)
elif method.lower() == 'sparsepca':
clf = SparsePCA(n_components=n_dim, alpha=10., tol=1e-4, verbose=10, n_jobs=1)
elif method.lower() == 'pls':
clf = PLS(n_components=n_dim)
else:
raise NameError('%s is not an implemented method'%(method))
return clf
[docs]class PLS():
"""
Implement PLS to make it compliant with the other dimensionality
reduction methodology.
(Simple class rewritting).
"""
def __init__(self, n_components=10):
self.clf = PLSRegression(n_components)
[docs] def get_components_(self):
return self.clf.x_weights_.transpose()
[docs] def set_components_(self, x):
pass
components_ = property(get_components_, set_components_)
[docs] def fit(self, X, y):
self.clf.fit(X,y)
return self
[docs] def predict(self, X):
return self.clf.predict(X)
""" Principal Vectors
@author: Soufiane Mourragui
This module computes the principal vectors from two datasets, i.e.:
- perform linear dimensionality reduction independently for both dataset, resulting
in set of domain-specific factors.
- find the common factors using principal vectors [1]
This result in set of pairs of vectors. Each pair has one vector from the source and one
from the target. For each pair, a similarity score (cosine similarity) can be computed
between the principal vectors and the pairs are naturally ordered by decreasing order
of this similarity measure.
Example
-------
Examples are given in the vignettes.
Notes
-------
Examples are given in the vignette
References
----------
[1] Golub, G.H. and Van Loan, C.F., 2012. "Matrix computations" (Vol. 3). JHU Press.
[2] Mourragui, S., Loog, M., Reinders, M.J.T., Wessels, L.F.A. (2019)
PRECISE: A domain adaptation approach to transfer predictors of drug response
from pre-clinical models to tumors
"""
import numpy as np
import pandas as pd
import scipy
from pathlib import Path
from sklearn.preprocessing import normalize
[docs]class PVComputation:
"""
Attributes:
n_factors: int
Number of domain-specific factors to compute.
n_pv: int
Number of principal vectors.
dim_reduction_method_source: str
Dimensionality reduction method used for source data.
dim_reduction_target: str
Dimensionality reduction method used for source data.
source_components_ : numpy.ndarray, shape (n_pv, n_features)
Loadings of the source principal vectors ranked by similarity to the
target. Components are in the row.
source_explained_variance_ratio_: numpy.ndarray, shape (n_pv)
Explained variance of the source on each source principal vector.
target_components_ : numpy.ndarray, shape (n_pv, n_features)
Loadings of the target principal vectors ranked by similarity to the
source. Components are in the row.
target_explained_variance_ratio_: numpy.ndarray, shape (n_pv)
Explained variance of the target on each target principal vector.
cosine_similarity_matrix_: numpy.ndarray, shape (n_pv, n_pv)
Scalar product between the source and the target principal vectors. Source
principal vectors are in the rows while target's are in the columns. If
the domain adaptation is sensible, a diagonal matrix should be obtained.
"""
def __init__(self, n_factors,n_pv,
dim_reduction='pca',
dim_reduction_target=None,
project_on=0):
"""
Args:
n_factors : int
Number of domain-specific factors to extract from the data (e.g. using PCA, ICA).
n_pv : int
Number of principal vectors to find from the independently computed factors.
dim_reduction : str, default to 'pca'
Dimensionality reduction method for the source data,
i.e. 'pca', 'ica', 'nmf', 'fa', 'sparsepca', pls'.
dim_reduction_target : str, default to None
Dimensionality reduction method for the target data,
i.e. 'pca', 'ica', 'nmf', 'fa', 'sparsepca', pls'. If None, set to dim_reduction.
project_on: int or bool, default to 0
Where data should be projected on. 0 means source PVs, -1 means target PVs and 1 means
both PVs.
"""
self.n_factors = n_factors
self.n_pv = n_pv
self.dim_reduction_method_source = dim_reduction
self.dim_reduction_method_target = dim_reduction_target or dim_reduction
self.dim_reduction_source = self._process_dim_reduction(self.dim_reduction_method_source)
self.dim_reduction_target = self._process_dim_reduction(self.dim_reduction_method_target)
self.source_components_ = None
self.source_explained_variance_ratio_ = None
self.target_components_ = None
self.target_explained_variance_ratio_ = None
self.cosine_similarity_matrix_ = None
def _process_dim_reduction(self, dim_reduction):
if type(dim_reduction) == str:
return process_dim_reduction(method=dim_reduction, n_dim=self.n_factors)
else:
return dim_reduction
[docs] def fit(self, X_source, X_target, y_source=None):
""" Compute the common factors between two set of data.
IMPORTANT: Same genes have to be given for source and target, and in same order
Args:
X_source : np.ndarray, shape (n_components, n_genes)
Source dataset.
X_target : np.ndarray, shape (n_components, n_genes)
Target dataset.
y_source : np.ndarray, shape (n_components, 1) (optional, default to None)
Eventual output, in case one wants to give ouput (for instance PLS).
results:
self: returns an instance of self.
"""
# Compute factors independently for source and target. Orthogonalize the basis
Ps = self.dim_reduction_source.fit(X_source, y_source).components_
Ps = scipy.linalg.orth(Ps.transpose()).transpose()
Pt = self.dim_reduction_target.fit(X_target, y_source).components_
Pt = scipy.linalg.orth(Pt.transpose()).transpose()
# Compute the principal factors
self.compute_principal_vectors(Ps, Pt)
# Compute variance explained
self.source_explained_variance_ratio_ = np.var(self.source_components_.dot(X_source.transpose()), axis=1)/\
np.sum(np.var(X_source), axis=0)
self.target_explained_variance_ratio_ = np.var(self.target_components_.dot(X_target.transpose()), axis=1)/\
np.sum(np.var(X_target), axis=0)
return self
[docs] def compute_principal_vectors(self, source_factors, target_factors):
"""Compute the principal vectors between the already computed set of domain-specific
factors, using approach presented in [1,2].
IMPORTANT: Same genes have to be given for source and target, and in same order
Args:
source_factors: np.ndarray, shape (n_components, n_genes)
Source domain-specific factors.
target_factors: np.ndarray, shape (n_components, n_genes)
Target domain-specific factors.
results:
self: returns an instance of self.
"""
# Find principal vectors using SVD
u,sigma,v = np.linalg.svd(source_factors.dot(target_factors.transpose()))
self.source_components_ = u.transpose().dot(source_factors)[:self.n_pv]
self.target_components_ = v.dot(target_factors)[:self.n_pv]
# Normalize to make sure that vectors are unitary
self.source_components_ = normalize(self.source_components_, axis = 1)
self.target_components_ = normalize(self.target_components_, axis = 1)
# Compute cosine similarity matrix
self.initial_cosine_similarity_matrix_ = source_factors.dot(target_factors.transpose())
self.cosine_similarity_matrix_ = self.source_components_.dot(self.target_components_.transpose())
# Compute angles
self.angles_ = np.arccos(np.diag(self.cosine_similarity_matrix_))
return self
[docs]def leave_one_out_validation(adata,sc_adata,genes:list):
'''Function to validate the imputation of genes using SpaGe'''
RNA_data=sc_adata.to_df()
RNA_meta=sc_adata.obs
spatial_data=adata.to_df()
spatial_meta=adata.obs
Correlations = pd.Series(index = genes)
plt.style.use('dark_background')
spatialcoords=pd.DataFrame(adata.obsm['spatial'])
allimp=[]
for i in genes:
Imp_Genes = SpaGE(spatial_data.drop(i,axis=1),RNA_data,n_pv=30,
genes_to_predict = [i])
Correlations[i] = st.spearmanr(spatial_data[i],Imp_Genes[i])[0]
fig, (ax1, ax2) = plt.subplots(1, 2)
ax1.axis('off')
cmap = spatial_data[i]
cmap[cmap > np.percentile(cmap,99)] = np.percentile(cmap,99)
ax1.scatter(spatialcoords.iloc[:,0],spatialcoords.iloc[:,1],s=1,c=cmap)
ax1.set_title('Measured ' + i, fontsize = 12)
ax1.set_ylabel(i)
ax2.axis('off')
cmap = Imp_Genes[i]
cmap[cmap > np.percentile(cmap,99)] = np.percentile(cmap,99)
ax2.scatter(spatialcoords.iloc[:,0],spatialcoords.iloc[:,1],s=1,c=cmap)
ax2.set_title('Predicted ' + i, fontsize = 12)
allimp.append(Imp_Genes)
return pd.concat(allimp)
[docs]def gene_imputation(adata,sc_adata,new_genes:list):
'''Function to impute genes using SpaGe'''
RNA_data=sc_adata.to_df()
RNA_meta=sc_adata.obs
spatial_data=adata.to_df()
spatial_meta=adata.obs
Correlations = pd.Series(index = new_genes)
plt.style.use('dark_background')
spatialcoords=pd.DataFrame(adata.obsm['spatial'])
Imp_New_Genes = SpaGE(spatial_data,RNA_data,n_pv=30,genes_to_predict = new_genes)
return Imp_New_Genes