Source code for ALLCools.clustering.art_of_tsne

"""
The function art_of_tsne is from cytograph2 package
https://github.com/linnarsson-lab/cytograph2/blob/master/cytograph/embedding/art_of_tsne.py

The idea behind that is based on :cite:p:`Kobak2019` with T-SNE algorithm implemented in
[openTSNE](https://opentsne.readthedocs.io/en/latest/) :cite:p:`Policar2019`.

"""

import logging
from typing import Callable, Union

import numpy as np
from openTSNE import TSNEEmbedding, affinity, initialization


[docs]def art_of_tsne( X: np.ndarray, metric: Union[str, Callable] = "euclidean", exaggeration: float = -1, perplexity: int = 30, n_jobs: int = -1, ) -> TSNEEmbedding: """ Implementation of Dmitry Kobak and Philipp Berens "The art of using t-SNE for single-cell transcriptomics" based on openTSNE. See https://doi.org/10.1038/s41467-019-13056-x | www.nature.com/naturecommunications Parameters ---------- X The data matrix of shape (n_cells, n_genes) i.e. (n_samples, n_features) metric Any metric allowed by PyNNDescent (default: 'euclidean') exaggeration The exaggeration to use for the embedding perplexity The perplexity to use for the embedding n_jobs Number of CPUs to use Returns ------- The embedding as an opentsne.TSNEEmbedding object (which can be cast to an np.ndarray) """ n = X.shape[0] if n > 100_000: if exaggeration == -1: exaggeration = 1 + n / 333_333 # Subsample, optimize, then add the remaining cells and optimize again # Also, use exaggeration == 4 logging.info(f"Creating subset of {n // 40} elements") # Subsample and run a regular art_of_tsne on the subset indices = np.random.permutation(n) reverse = np.argsort(indices) X_sample, X_rest = X[indices[: n // 40]], X[indices[n // 40 :]] logging.info(f"Embedding subset") Z_sample = art_of_tsne(X_sample, metric=metric) logging.info( f"Preparing partial initial embedding of the {n - n // 40} remaining elements" ) if isinstance(Z_sample.affinities, affinity.Multiscale): rest_init = Z_sample.prepare_partial( X_rest, k=1, perplexities=[1 / 3, 1 / 3] ) else: rest_init = Z_sample.prepare_partial(X_rest, k=1, perplexity=1 / 3) logging.info(f"Combining the initial embeddings, and standardizing") init_full = np.vstack((Z_sample, rest_init))[reverse] init_full = init_full / (np.std(init_full[:, 0]) * 10000) logging.info(f"Creating multiscale affinities") affinities = affinity.PerplexityBasedNN( X, perplexity=perplexity, metric=metric, method="approx", n_jobs=n_jobs ) logging.info(f"Creating TSNE embedding") Z = TSNEEmbedding( init_full, affinities, negative_gradient_method="fft", n_jobs=n_jobs ) logging.info(f"Optimizing, stage 1") Z.optimize( n_iter=250, inplace=True, exaggeration=12, momentum=0.5, learning_rate=n / 12, n_jobs=n_jobs, ) logging.info(f"Optimizing, stage 2") Z.optimize( n_iter=750, inplace=True, exaggeration=exaggeration, momentum=0.8, learning_rate=n / 12, n_jobs=n_jobs, ) elif n > 3_000: if exaggeration == -1: exaggeration = 1 # Use multiscale perplexity affinities_multiscale_mixture = affinity.Multiscale( X, perplexities=[perplexity, n / 100], metric=metric, method="approx", n_jobs=n_jobs, ) init = initialization.pca(X) Z = TSNEEmbedding( init, affinities_multiscale_mixture, negative_gradient_method="fft", n_jobs=n_jobs, ) Z.optimize( n_iter=250, inplace=True, exaggeration=12, momentum=0.5, learning_rate=n / 12, n_jobs=n_jobs, ) Z.optimize( n_iter=750, inplace=True, exaggeration=exaggeration, momentum=0.8, learning_rate=n / 12, n_jobs=n_jobs, ) else: if exaggeration == -1: exaggeration = 1 # Just a plain TSNE with high learning rate lr = max(200, n / 12) aff = affinity.PerplexityBasedNN( X, perplexity=perplexity, metric=metric, method="approx", n_jobs=n_jobs ) init = initialization.pca(X) Z = TSNEEmbedding( init, aff, learning_rate=lr, n_jobs=n_jobs, negative_gradient_method="fft" ) Z.optimize(250, exaggeration=12, momentum=0.5, inplace=True, n_jobs=n_jobs) Z.optimize( 750, exaggeration=exaggeration, momentum=0.8, inplace=True, n_jobs=n_jobs ) return Z
[docs]def tsne( adata, obsm="X_pca", metric: Union[str, Callable] = "euclidean", exaggeration: float = -1, perplexity: int = 30, n_jobs: int = -1, ): """ Calculating T-SNE embedding with the openTSNE package :cite:p:`Policar2019` and parameter optimization strategy described in :cite:p:`Kobak2019`. Parameters ---------- adata adata object with principle components or equivalent matrix stored in .obsm obsm name of the matrix in .obsm that can be used as T-SNE input metric Any metric allowed by PyNNDescent (default: 'euclidean') exaggeration The exaggeration to use for the embedding perplexity The perplexity to use for the embedding n_jobs Number of CPUs to use Returns ------- T-SNE embedding will be stored at adata.obsm["X_tsne"] """ X = adata.obsm[obsm] Z = art_of_tsne( X=X, metric=metric, exaggeration=exaggeration, perplexity=perplexity, n_jobs=n_jobs, ) adata.obsm["X_tsne"] = Z return