Source code for ALLCools.sandbox.scanorama

"""
Allow to chose different metric

Modified from Scanorama, see LICENSE
https://github.com/brianhie/scanorama/blob/master/LICENSE
"""

import operator
import random
import sys
import warnings
import pandas as pd
import numpy as np
import scipy
from annoy import AnnoyIndex
from intervaltree import IntervalTree
from fbpca import pca
from scipy.sparse import csc_matrix, csr_matrix, vstack
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import normalize

# Default parameters.
ALPHA = 0.10
APPROX = True
BATCH_SIZE = 5000
DIMRED = 100
HVG = None
KNN = 20
N_ITER = 500
PERPLEXITY = 1200
SIGMA = 15
VERBOSE = 2

def batch_correct_pc(
    adata,
    batch_series,
    correct=False,
    n_components=30,
    sigma=25,
    alpha=0.1,
    knn=30,
    metric="angular",
    **scanorama_kws,
):
    """
    Batch-corrected PCA based on Scanorama integration.

    Parameters
    ----------
    adata
        A single AnnData object containing all batches.
    batch_series
        Series used to split `adata` into batches.
    correct
        If True, `adata.X` is corrected in place; otherwise only the
        corrected PCs are added to `adata.obsm['X_pca']`.
    n_components
        Number of components in PCA.
    sigma
        Correction smoothing parameter on Gaussian kernel.
    alpha
        Alignment score minimum cutoff.
    knn
        Number of nearest neighbors to use for matching.
    metric
        Metric to use in calculating KNN.
    scanorama_kws
        Other parameters passed to the integration function.

    Returns
    -------
    adata
    """
    scanorama_kws["dimred"] = n_components
    scanorama_kws["sigma"] = sigma
    scanorama_kws["alpha"] = alpha
    scanorama_kws["knn"] = knn
    scanorama_kws["metric"] = metric

    adata.obs["batch"] = batch_series
    adata_list = []
    indexes = []
    for _, sub_df in adata.obs.groupby("batch"):
        adata_list.append(adata[sub_df.index, :])
        indexes.extend(sub_df.index.tolist())

    if correct:
        integrated, corrected = correct_scanpy(adata_list, return_dimred=True, **scanorama_kws)
        # Align the corrected matrix back to the original obs and var order:
        # the corrected AnnData objects come back in batch-grouped row order
        # with genes sorted (assumes obs and var names are unique).
        corrected_x = pd.DataFrame(
            np.vstack([ann.X.toarray() for ann in corrected]),
            index=indexes,
            columns=corrected[0].var_names,
        ).reindex(index=adata.obs_names, columns=adata.var_names)
        adata.X = corrected_x.values
    else:
        integrated = integrate_scanpy(adata_list, **scanorama_kws)

    pca_df = pd.DataFrame(np.vstack(integrated), index=indexes).reindex(adata.obs_names)
    adata.obsm["X_pca"] = pca_df.values
    # TODO: fill other PCA-related slots the same way sc.tl.pca does
    return adata

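# Illustrative usage sketch (added for documentation, not part of the original
# module): run `batch_correct_pc` on a small synthetic AnnData with two
# batches. The toy data and the helper name `_example_batch_correct_pc` are
# hypothetical.
def _example_batch_correct_pc():
    import anndata

    adata = anndata.AnnData(np.random.rand(100, 50))
    batch = pd.Series(["a"] * 50 + ["b"] * 50, index=adata.obs_names)
    adata = batch_correct_pc(adata, batch, correct=False, n_components=10)
    print(adata.obsm["X_pca"].shape)  # (100, 10)
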
# Do batch correction on a list of data sets.

def correct(
    datasets_full,
    genes_list,
    return_dimred=False,
    batch_size=BATCH_SIZE,
    verbose=VERBOSE,
    ds_names=None,
    dimred=DIMRED,
    approx=APPROX,
    sigma=SIGMA,
    alpha=ALPHA,
    knn=KNN,
    return_dense=False,
    hvg=None,
    union=False,
    geosketch=False,
    geosketch_max=20000,
    seed=0,
    metric="manhattan",
):
    """Integrate and batch correct a list of data sets.

    Parameters
    ----------
    datasets_full : `list` of `scipy.sparse.csr_matrix` or of `numpy.ndarray`
        Data sets to integrate and correct.
    genes_list: `list` of `list` of `string`
        List of genes for each data set.
    return_dimred: `bool`, optional (default: `False`)
        In addition to returning batch corrected matrices, also returns
        integrated low-dimensional embeddings.
    batch_size: `int`, optional (default: `5000`)
        The batch size used in the alignment vector computation. Useful
        when correcting very large (>100k samples) data sets. Set to a
        large value that runs within available memory.
    verbose: `bool` or `int`, optional (default: 2)
        When `True` or not equal to 0, prints logging output.
    ds_names: `list` of `string`, optional
        When `verbose=True`, reports data set names in logging output.
    dimred: `int`, optional (default: 100)
        Dimensionality of integrated embedding.
    approx: `bool`, optional (default: `True`)
        Use approximate nearest neighbors, greatly speeding up matching
        runtime.
    sigma: `float`, optional (default: 15)
        Correction smoothing parameter on Gaussian kernel.
    alpha: `float`, optional (default: 0.10)
        Alignment score minimum cutoff.
    knn: `int`, optional (default: 20)
        Number of nearest neighbors to use for matching.
    return_dense: `bool`, optional (default: `False`)
        Return `numpy.ndarray` matrices instead of `scipy.sparse.csr_matrix`.
    hvg: `int`, optional (default: None)
        Use this number of top highly variable genes based on dispersion.
    seed: `int`, optional (default: 0)
        Random seed to use.

    Returns
    -------
    corrected, genes
        By default (`return_dimred=False`), returns a two-tuple containing a
        list of `scipy.sparse.csr_matrix`, each with batch corrected values,
        and a single list of genes containing the intersection of inputted
        genes.
    integrated, corrected, genes
        When `return_dimred=True`, returns a three-tuple containing a list
        of `numpy.ndarray` with integrated low-dimensional embeddings, a
        list of `scipy.sparse.csr_matrix`, each with batch corrected values,
        and a single list of genes containing the intersection of inputted
        genes.
    """
    np.random.seed(seed)
    random.seed(seed)

    datasets_full = check_datasets(datasets_full)

    datasets, genes = merge_datasets(datasets_full, genes_list, ds_names=ds_names, union=union)
    datasets_dimred, genes = process_data(datasets, genes, hvg=hvg, dimred=dimred)

    datasets_dimred = assemble(
        datasets_dimred,  # Assemble in low dimensional space.
        expr_datasets=datasets,  # Modified in place.
        verbose=verbose,
        knn=knn,
        sigma=sigma,
        approx=approx,
        alpha=alpha,
        ds_names=ds_names,
        batch_size=batch_size,
        geosketch=geosketch,
        geosketch_max=geosketch_max,
        metric=metric,
    )

    if return_dense:
        datasets = [ds.toarray() for ds in datasets]

    if return_dimred:
        return datasets_dimred, datasets, genes

    return datasets, genes

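# Illustrative usage sketch (added for documentation, not part of the original
# module): batch correct two synthetic sparse data sets that share a gene
# list. The toy data and the helper name `_example_correct` are hypothetical.
def _example_correct():
    genes = ["g{}".format(i) for i in range(60)]
    ds1 = csr_matrix(np.random.rand(80, 60))
    ds2 = csr_matrix(np.random.rand(90, 60))
    corrected, ret_genes = correct([ds1, ds2], [genes, genes], dimred=20)
    print(corrected[0].shape, len(ret_genes))  # (80, 60) 60
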
# Integrate a list of data sets.

def integrate(
    datasets_full,
    genes_list,
    batch_size=BATCH_SIZE,
    verbose=VERBOSE,
    ds_names=None,
    dimred=DIMRED,
    approx=APPROX,
    sigma=SIGMA,
    alpha=ALPHA,
    knn=KNN,
    geosketch=False,
    geosketch_max=20000,
    n_iter=1,
    union=False,
    hvg=None,
    seed=0,
    metric="manhattan",
):
    """Integrate a list of data sets.

    Parameters
    ----------
    datasets_full : `list` of `scipy.sparse.csr_matrix` or of `numpy.ndarray`
        Data sets to integrate.
    genes_list: `list` of `list` of `string`
        List of genes for each data set.
    batch_size: `int`, optional (default: `5000`)
        The batch size used in the alignment vector computation. Useful
        when integrating very large (>100k samples) data sets. Set to a
        large value that runs within available memory.
    verbose: `bool` or `int`, optional (default: 2)
        When `True` or not equal to 0, prints logging output.
    ds_names: `list` of `string`, optional
        When `verbose=True`, reports data set names in logging output.
    dimred: `int`, optional (default: 100)
        Dimensionality of integrated embedding.
    approx: `bool`, optional (default: `True`)
        Use approximate nearest neighbors, greatly speeding up matching
        runtime.
    sigma: `float`, optional (default: 15)
        Correction smoothing parameter on Gaussian kernel.
    alpha: `float`, optional (default: 0.10)
        Alignment score minimum cutoff.
    knn: `int`, optional (default: 20)
        Number of nearest neighbors to use for matching.
    hvg: `int`, optional (default: None)
        Use this number of top highly variable genes based on dispersion.
    seed: `int`, optional (default: 0)
        Random seed to use.

    Returns
    -------
    integrated, genes
        Returns a two-tuple containing a list of `numpy.ndarray` with
        integrated low-dimensional embeddings and a single list of genes
        containing the intersection of inputted genes.
    """
    np.random.seed(seed)
    random.seed(seed)

    datasets_full = check_datasets(datasets_full)

    datasets, genes = merge_datasets(datasets_full, genes_list, ds_names=ds_names, union=union)
    datasets_dimred, genes = process_data(datasets, genes, hvg=hvg, dimred=dimred)

    for _ in range(n_iter):
        datasets_dimred = assemble(
            datasets_dimred,  # Assemble in low dimensional space.
            verbose=verbose,
            knn=knn,
            sigma=sigma,
            approx=approx,
            alpha=alpha,
            ds_names=ds_names,
            batch_size=batch_size,
            geosketch=geosketch,
            geosketch_max=geosketch_max,
            metric=metric,
        )

    return datasets_dimred, genes

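# Illustrative usage sketch (added for documentation, not part of the original
# module): integrate two synthetic data sets into a shared low-dimensional
# space. The toy data and the helper name `_example_integrate` are
# hypothetical.
def _example_integrate():
    genes = ["g{}".format(i) for i in range(60)]
    ds1 = csr_matrix(np.random.rand(80, 60))
    ds2 = csr_matrix(np.random.rand(90, 60))
    embeddings, ret_genes = integrate([ds1, ds2], [genes, genes], dimred=20)
    print(embeddings[0].shape, embeddings[1].shape)  # (80, 20) (90, 20)
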
# Batch correction with scanpy's AnnData object.

def correct_scanpy(adatas, **kwargs):
    """Batch correct a list of `scanpy.api.AnnData`.

    Parameters
    ----------
    adatas : `list` of `scanpy.api.AnnData`
        Data sets to integrate and/or correct.
    kwargs : `dict`
        See documentation for the `correct()` method for a full list of
        parameters to use for batch correction.

    Returns
    -------
    corrected
        By default (`return_dimred=False`), returns a list of new
        `scanpy.api.AnnData`.
    integrated, corrected
        When `return_dimred=True`, returns a two-tuple containing a list of
        `np.ndarray` with integrated low-dimensional embeddings and a list
        of new `scanpy.api.AnnData`.
    """
    if "return_dimred" in kwargs and kwargs["return_dimred"]:
        datasets_dimred, datasets, genes = correct(
            [adata.X for adata in adatas],
            [adata.var_names.values for adata in adatas],
            **kwargs,
        )
    else:
        datasets, genes = correct(
            [adata.X for adata in adatas],
            [adata.var_names.values for adata in adatas],
            **kwargs,
        )

    from anndata import AnnData

    new_adatas = []
    for i in range(len(adatas)):
        adata = AnnData(datasets[i])
        adata.var_names = genes
        new_adatas.append(adata)

    if "return_dimred" in kwargs and kwargs["return_dimred"]:
        return datasets_dimred, new_adatas
    else:
        return new_adatas

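# Illustrative usage sketch (added for documentation, not part of the original
# module): correct a list of AnnData objects and also retrieve the integrated
# embeddings. The toy data and the helper name are hypothetical.
def _example_correct_scanpy():
    import anndata

    adatas = [anndata.AnnData(np.random.rand(70, 40)) for _ in range(2)]
    integrated, corrected = correct_scanpy(adatas, return_dimred=True, dimred=15)
    print(integrated[0].shape, corrected[0].shape)  # (70, 15) (70, 40)
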
# Integration with scanpy's AnnData object.

def integrate_scanpy(adatas, **kwargs):
    """Integrate a list of `scanpy.api.AnnData`.

    Parameters
    ----------
    adatas : `list` of `scanpy.api.AnnData`
        Data sets to integrate.
    kwargs : `dict`
        See documentation for the `integrate()` method for a full list of
        parameters to use for integration.

    Returns
    -------
    integrated
        Returns a list of `np.ndarray` with integrated low-dimensional
        embeddings.
    """
    datasets_dimred, genes = integrate(
        [adata.X for adata in adatas],
        [adata.var_names.values for adata in adatas],
        **kwargs,
    )
    return datasets_dimred

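# Illustrative usage sketch (added for documentation, not part of the original
# module): integration only, returning one embedding per input AnnData. Toy
# data; the helper name is hypothetical.
def _example_integrate_scanpy():
    import anndata

    adatas = [anndata.AnnData(np.random.rand(70, 40)) for _ in range(2)]
    embeddings = integrate_scanpy(adatas, dimred=15)
    print([e.shape for e in embeddings])  # [(70, 15), (70, 15)]
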
# Put datasets into a single matrix with the intersection of all genes.

def merge_datasets(datasets, genes, ds_names=None, verbose=True, union=False):
    if union:
        sys.stderr.write(
            "WARNING: Integrating based on the union of genes is "
            "highly discouraged, consider taking the intersection "
            "or requantifying gene expression.\n"
        )

    # Find genes in common.
    keep_genes = set()
    for idx, gene_list in enumerate(genes):
        if len(keep_genes) == 0:
            keep_genes = set(gene_list)
        elif union:
            keep_genes |= set(gene_list)
        else:
            keep_genes &= set(gene_list)
        if not union and ds_names is not None and verbose:
            print("After {}: {} genes".format(ds_names[idx], len(keep_genes)))
        if len(keep_genes) == 0:
            print("Error: No genes found in all datasets, exiting...")
            exit(1)
    if verbose:
        print("Found {} genes among all datasets".format(len(keep_genes)))

    if union:
        union_genes = sorted(keep_genes)
        for i in range(len(datasets)):
            if verbose:
                print("Processing data set {}".format(i))
            X_new = np.zeros((datasets[i].shape[0], len(union_genes)))
            X_old = csc_matrix(datasets[i])
            gene_to_idx = {gene: idx for idx, gene in enumerate(genes[i])}
            for j, gene in enumerate(union_genes):
                if gene in gene_to_idx:
                    X_new[:, j] = X_old[:, gene_to_idx[gene]].toarray().flatten()
            datasets[i] = csr_matrix(X_new)
        ret_genes = np.array(union_genes)
    else:
        # Only keep genes in common.
        ret_genes = np.array(sorted(keep_genes))
        for i in range(len(datasets)):
            # Remove duplicate genes.
            uniq_genes, uniq_idx = np.unique(genes[i], return_index=True)
            datasets[i] = datasets[i][:, uniq_idx]

            # Do gene filtering.
            gene_sort_idx = np.argsort(uniq_genes)
            gene_idx = [idx for idx in gene_sort_idx if uniq_genes[idx] in keep_genes]
            datasets[i] = datasets[i][:, gene_idx]
            assert np.array_equal(uniq_genes[gene_idx], ret_genes)

    return datasets, ret_genes

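# Illustrative sketch (added for documentation, not part of the original
# module) of the default intersection behavior: only genes shared by all data
# sets survive, in sorted order. Toy data; the helper name is hypothetical.
def _example_merge_datasets():
    ds1 = csr_matrix(np.random.rand(5, 3))
    ds2 = csr_matrix(np.random.rand(4, 3))
    merged, ret_genes = merge_datasets([ds1, ds2], [["a", "b", "c"], ["b", "c", "d"]])
    print(list(ret_genes), merged[0].shape)  # ['b', 'c'] (5, 2)
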

def check_datasets(datasets_full):
    datasets_new = []
    for i, ds in enumerate(datasets_full):
        if issubclass(type(ds), np.ndarray):
            datasets_new.append(csr_matrix(ds))
        elif issubclass(type(ds), scipy.sparse.csr.csr_matrix):
            datasets_new.append(ds)
        else:
            sys.stderr.write(
                "ERROR: Data sets must be numpy array or "
                "scipy.sparse.csr_matrix, received type "
                "{}.\n".format(type(ds))
            )
            exit(1)
    return datasets_new


def reduce_dimensionality(X, dim_red_k=100):
    k = min((dim_red_k, X.shape[0], X.shape[1]))
    U, s, Vt = pca(X, k=k)  # Automatically centers.
    return U[:, range(k)] * s[range(k)]

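# Illustrative sketch (added for documentation, not part of the original
# module): the embedding is U * s from a randomized SVD (fbpca), so the
# number of components is capped at min(dim_red_k, *X.shape). Toy data; the
# helper name is hypothetical.
def _example_reduce_dimensionality():
    X = csr_matrix(np.random.rand(30, 10))
    emb = reduce_dimensionality(X, dim_red_k=5)
    print(emb.shape)  # (30, 5)
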
# Randomized SVD.

def dimensionality_reduce(datasets, dimred=DIMRED):
    X = vstack(datasets)
    X = reduce_dimensionality(X, dim_red_k=dimred)
    datasets_dimred = []
    base = 0
    for ds in datasets:
        datasets_dimred.append(X[base : (base + ds.shape[0]), :])
        base += ds.shape[0]
    return datasets_dimred


def dispersion(X):
    mean = X.mean(0)
    dispersion = np.zeros(mean.shape)
    nonzero_idx = np.nonzero(mean > 1e-10)[1]
    X_nonzero = X[:, nonzero_idx]
    nonzero_mean = X_nonzero.mean(0)
    nonzero_var = (X_nonzero.multiply(X_nonzero)).mean(0)
    temp = nonzero_var / nonzero_mean
    dispersion[mean > 1e-10] = temp.A1
    dispersion[mean <= 1e-10] = float("-inf")
    return dispersion

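# Illustrative sketch (added for documentation, not part of the original
# module): `dispersion` scores each gene by E[X^2] / E[X], with -inf marking
# all-zero genes so they never rank as highly variable. Toy data; the helper
# name is hypothetical.
def _example_dispersion():
    X = csr_matrix(np.array([[1.0, 0.0], [3.0, 0.0]]))
    # Column 0: E[X^2] / E[X] = ((1 + 9) / 2) / ((1 + 3) / 2) = 2.5
    print(dispersion(X))  # [[ 2.5 -inf]]
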
# Normalize and reduce dimensionality.

def process_data(datasets, genes, hvg=HVG, dimred=DIMRED, verbose=True):
    # Only keep highly variable genes.
    if hvg is not None and 0 < hvg < len(genes):
        if verbose:
            print("Highly variable filter...")
        X = vstack(datasets)
        disp = dispersion(X)
        highest_disp_idx = np.argsort(disp[0])[::-1]
        top_genes = set(genes[highest_disp_idx[range(hvg)]])
        for i in range(len(datasets)):
            gene_idx = [idx for idx, g_i in enumerate(genes) if g_i in top_genes]
            datasets[i] = datasets[i][:, gene_idx]
        genes = np.array(sorted(top_genes))

    # Normalize.
    if verbose:
        print("Normalizing...")
    for i, ds in enumerate(datasets):
        datasets[i] = normalize(ds, axis=1)

    # Compute compressed embedding.
    if dimred > 0:
        if verbose:
            print("Reducing dimension...")
        datasets_dimred = dimensionality_reduce(datasets, dimred=dimred)
        if verbose:
            print("Done processing.")
        return datasets_dimred, genes

    if verbose:
        print("Done processing.")
    return datasets, genes

# Exact nearest neighbors search.

def nn(ds1, ds2, knn=KNN, metric_p=2):
    # Find nearest neighbors of first dataset.
    nn_ = NearestNeighbors(n_neighbors=knn, p=metric_p)
    nn_.fit(ds2)
    ind = nn_.kneighbors(ds1, return_distance=False)

    match = set()
    for a, b in zip(range(ds1.shape[0]), ind):
        for b_i in b:
            match.add((a, b_i))

    return match

# Approximate nearest neighbors using locality sensitive hashing.

def nn_approx(ds1, ds2, knn=KNN, metric="manhattan", n_trees=10):
    # Build index.
    print(f"AnnoyIndex using {metric} metric")
    a = AnnoyIndex(ds2.shape[1], metric=metric)
    for i in range(ds2.shape[0]):
        a.add_item(i, ds2[i, :])
    a.build(n_trees)

    # Search index.
    ind = []
    for i in range(ds1.shape[0]):
        ind.append(a.get_nns_by_vector(ds1[i, :], knn, search_k=-1))
    ind = np.array(ind)

    # Match.
    match = set()
    for a, b in zip(range(ds1.shape[0]), ind):
        for b_i in b:
            match.add((a, b_i))

    return match

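# Illustrative sketch (added for documentation, not part of the original
# module): approximate matching returns (query_row, neighbor_row) pairs, knn
# per query row. Toy data; the helper name is hypothetical, and "euclidean"
# is another metric Annoy supports.
def _example_nn_approx():
    ds1 = np.random.rand(10, 4)
    ds2 = np.random.rand(12, 4)
    match = nn_approx(ds1, ds2, knn=3, metric="euclidean")
    print(len(match))  # 30: three neighbors for each of the ten query rows
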
# Populate a table (in place) that stores mutual nearest neighbors
# between datasets.
def fill_table(
    table, i, curr_ds, datasets, base_ds=0, knn=KNN, approx=APPROX, metric="manhattan"
):
    curr_ref = np.concatenate(datasets)
    if approx:
        match = nn_approx(curr_ds, curr_ref, knn=knn, metric=metric)
    else:
        match = nn(curr_ds, curr_ref, knn=knn, metric_p=1)

    # Build interval tree.
    itree_ds_idx = IntervalTree()
    itree_pos_base = IntervalTree()
    pos = 0
    for j in range(len(datasets)):
        n_cells = datasets[j].shape[0]
        itree_ds_idx[pos : (pos + n_cells)] = base_ds + j
        itree_pos_base[pos : (pos + n_cells)] = pos
        pos += n_cells

    # Store all mutual nearest neighbors between datasets.
    for d, r in match:
        interval = itree_ds_idx[r]
        assert len(interval) == 1
        j = interval.pop().data
        interval = itree_pos_base[r]
        assert len(interval) == 1
        base = interval.pop().data
        if (i, j) not in table:
            table[(i, j)] = set()
        table[(i, j)].add((d, r - base))
        assert r - base >= 0

gs_idxs = None
# Fill table of alignment scores.

def find_alignments_table(
    datasets,
    knn=KNN,
    approx=APPROX,
    verbose=VERBOSE,
    prenormalized=False,
    geosketch=False,
    geosketch_max=20000,
    metric="manhattan",
):
    if not prenormalized:
        datasets = [normalize(ds, axis=1) for ds in datasets]

    if geosketch:
        # Only match cells in geometric sketches.
        raise NotImplementedError

    table = {}
    for i in range(len(datasets)):
        if len(datasets[:i]) > 0:
            fill_table(
                table,
                i,
                datasets[i],
                datasets[:i],
                knn=knn,
                approx=approx,
                metric=metric,
            )
        if len(datasets[i + 1 :]) > 0:
            fill_table(
                table,
                i,
                datasets[i],
                datasets[i + 1 :],
                knn=knn,
                base_ds=i + 1,
                approx=approx,
                metric=metric,
            )

    # Count all mutual nearest neighbors between datasets.
    matches = {}
    table1 = {}
    if verbose > 1:
        table_print = np.zeros((len(datasets), len(datasets)))
    for i in range(len(datasets)):
        for j in range(len(datasets)):
            if i >= j:
                continue
            if (i, j) not in table or (j, i) not in table:
                continue
            match_ij = table[(i, j)]
            match_ji = set([(b, a) for a, b in table[(j, i)]])
            matches[(i, j)] = match_ij & match_ji

            table1[(i, j)] = max(
                float(len(set([idx for idx, _ in matches[(i, j)]]))) / datasets[i].shape[0],
                float(len(set([idx for _, idx in matches[(i, j)]]))) / datasets[j].shape[0],
            )
            if verbose > 1:
                table_print[i, j] += table1[(i, j)]

            if geosketch:
                # Translate matches within geometric sketches to original indices.
                matches_mnn = matches[(i, j)]
                matches[(i, j)] = [(gs_idxs[i][a], gs_idxs[j][b]) for a, b in matches_mnn]

    if verbose > 1:
        print(table_print)
        return table1, table_print, matches
    else:
        return table1, None, matches

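# Illustrative sketch (added for documentation, not part of the original
# module) of the alignment score computed in `table1` above: the larger of
# the two fractions of cells participating in mutual nearest neighbors. The
# values and the helper name are hypothetical.
def _example_alignment_score():
    matches_ij = {(0, 5), (1, 7), (2, 5)}  # MNN pairs (cell in i, cell in j)
    n_i, n_j = 10, 20
    score = max(
        len({a for a, _ in matches_ij}) / n_i,  # 3 / 10
        len({b for _, b in matches_ij}) / n_j,  # 2 / 20
    )
    print(score)  # 0.3
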
# Find the matching pairs of cells between datasets.

def find_alignments(
    datasets,
    knn=KNN,
    approx=APPROX,
    verbose=VERBOSE,
    alpha=ALPHA,
    prenormalized=False,
    geosketch=False,
    geosketch_max=20000,
    metric="manhattan",
):
    table1, _, matches = find_alignments_table(
        datasets,
        knn=knn,
        approx=approx,
        verbose=verbose,
        prenormalized=prenormalized,
        geosketch=geosketch,
        geosketch_max=geosketch_max,
        metric=metric,
    )

    alignments = [
        (i, j)
        for (i, j), val in reversed(sorted(table1.items(), key=operator.itemgetter(1)))
        if val > alpha
    ]

    return alignments, matches

# Find connections between datasets to identify panoramas.

def connect(datasets, knn=KNN, approx=APPROX, alpha=ALPHA, verbose=VERBOSE, metric="manhattan"):
    # Find alignments.
    alignments, _ = find_alignments(
        datasets,
        knn=knn,
        approx=approx,
        alpha=alpha,
        verbose=verbose,
        metric=metric,
    )
    if verbose:
        print(alignments)

    panoramas = []
    connected = set()
    for i, j in alignments:
        # See if datasets are involved in any current panoramas.
        panoramas_i = [panoramas[p] for p in range(len(panoramas)) if i in panoramas[p]]
        assert len(panoramas_i) <= 1
        panoramas_j = [panoramas[p] for p in range(len(panoramas)) if j in panoramas[p]]
        assert len(panoramas_j) <= 1

        if len(panoramas_i) == 0 and len(panoramas_j) == 0:
            panoramas.append([i])
            panoramas_i = [panoramas[-1]]

        if len(panoramas_i) == 0:
            panoramas_j[0].append(i)
        elif len(panoramas_j) == 0:
            panoramas_i[0].append(j)
        elif panoramas_i[0] != panoramas_j[0]:
            panoramas_i[0] += panoramas_j[0]
            panoramas.remove(panoramas_j[0])

        connected.add(i)
        connected.add(j)

    for i in range(len(datasets)):
        if i not in connected:
            panoramas.append([i])

    return panoramas


def handle_zeros_in_scale(scale, copy=True):
    """Makes sure that whenever scale is zero, we handle it correctly.

    This happens in most scalers when we have constant features.
    Adapted from sklearn.preprocessing.data
    """
    # If we are fitting on 1D arrays, scale might be a scalar.
    if np.isscalar(scale):
        if scale == 0.0:
            scale = 1.0
        return scale
    elif isinstance(scale, np.ndarray):
        if copy:
            # New array to avoid side-effects.
            scale = scale.copy()
        scale[scale == 0.0] = 1.0
        return scale

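# Illustrative sketch (added for documentation, not part of the original
# module): zero entries in a scale/denominator vector become one so later
# divisions are safe. The helper name is hypothetical.
def _example_handle_zeros_in_scale():
    print(handle_zeros_in_scale(np.array([2.0, 0.0, 5.0])))  # [2. 1. 5.]
    print(handle_zeros_in_scale(0.0))  # 1.0
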
# To reduce memory usage, split bias computation into batches.

def batch_bias(curr_ds, match_ds, bias, batch_size=None, sigma=SIGMA):
    if batch_size is None:
        weights = rbf_kernel(curr_ds, match_ds, gamma=0.5 * sigma)
        weights = normalize(weights, axis=1, norm="l1")
        avg_bias = np.dot(weights, bias)
        return avg_bias

    base = 0
    avg_bias = np.zeros(curr_ds.shape)
    denom = np.zeros(curr_ds.shape[0])
    while base < match_ds.shape[0]:
        batch_idx = range(base, min(base + batch_size, match_ds.shape[0]))
        weights = rbf_kernel(curr_ds, match_ds[batch_idx, :], gamma=0.5 * sigma)
        avg_bias += np.dot(weights, bias[batch_idx, :])
        denom += np.sum(weights, axis=1)
        base += batch_size

    denom = handle_zeros_in_scale(denom, copy=False)
    avg_bias /= denom[:, np.newaxis]

    return avg_bias

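# Illustrative sketch (added for documentation, not part of the original
# module): each row of the result is an RBF-weighted average of the bias
# (translation) vectors, so cells near many matched cells are corrected the
# most. Toy data; the helper name is hypothetical.
def _example_batch_bias():
    curr_ds = np.random.rand(6, 3)
    match_ds = np.random.rand(4, 3)
    bias = np.random.rand(4, 3)
    print(batch_bias(curr_ds, match_ds, bias, sigma=15).shape)  # (6, 3)
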
# Compute nonlinear translation vectors between a dataset
# and a reference.
def transform(curr_ds, curr_ref, ds_ind, ref_ind, sigma=SIGMA, cn=False, batch_size=None):
    # Compute the matching.
    match_ds = curr_ds[ds_ind, :]
    match_ref = curr_ref[ref_ind, :]
    bias = match_ref - match_ds
    if cn:
        match_ds = match_ds.toarray()
        curr_ds = curr_ds.toarray()
        bias = bias.toarray()

    with warnings.catch_warnings():
        warnings.filterwarnings("error")
        try:
            avg_bias = batch_bias(curr_ds, match_ds, bias, sigma=sigma, batch_size=batch_size)
        except RuntimeWarning:
            sys.stderr.write(
                "WARNING: Oversmoothing detected, refusing to batch "
                "correct, consider lowering sigma value.\n"
            )
            return csr_matrix(curr_ds.shape, dtype=float)
        except MemoryError:
            if batch_size is None:
                sys.stderr.write(
                    "WARNING: Out of memory, consider turning on "
                    "batched computation with batch_size parameter.\n"
                )
            else:
                sys.stderr.write(
                    "WARNING: Out of memory, consider lowering "
                    "the batch_size parameter.\n"
                )
            return csr_matrix(curr_ds.shape, dtype=float)

    if cn:
        avg_bias = csr_matrix(avg_bias)

    return avg_bias

# Finds alignments between datasets and uses them to construct
# panoramas. "Merges" datasets by correcting gene expression
# values.
def assemble(
    datasets,
    verbose=VERBOSE,
    knn=KNN,
    sigma=SIGMA,
    approx=APPROX,
    alpha=ALPHA,
    expr_datasets=None,
    ds_names=None,
    batch_size=None,
    geosketch=False,
    geosketch_max=20000,
    alignments=None,
    matches=None,
    metric="manhattan",
):
    if len(datasets) == 1:
        return datasets

    if alignments is None and matches is None:
        alignments, matches = find_alignments(
            datasets,
            knn=knn,
            approx=approx,
            alpha=alpha,
            verbose=verbose,
            geosketch=geosketch,
            geosketch_max=geosketch_max,
            metric=metric,
        )

    ds_assembled = {}
    panoramas = []
    for i, j in alignments:
        if verbose:
            if ds_names is None:
                print("Processing datasets {}".format((i, j)))
            else:
                print("Processing datasets {} <=> {}".format(ds_names[i], ds_names[j]))

        # Only consider a dataset a fixed amount of times.
        if i not in ds_assembled:
            ds_assembled[i] = 0
        ds_assembled[i] += 1
        if j not in ds_assembled:
            ds_assembled[j] = 0
        ds_assembled[j] += 1
        if ds_assembled[i] > 3 and ds_assembled[j] > 3:
            continue

        # See if datasets are involved in any current panoramas.
        panoramas_i = [panoramas[p] for p in range(len(panoramas)) if i in panoramas[p]]
        assert len(panoramas_i) <= 1
        panoramas_j = [panoramas[p] for p in range(len(panoramas)) if j in panoramas[p]]
        assert len(panoramas_j) <= 1

        if len(panoramas_i) == 0 and len(panoramas_j) == 0:
            if datasets[i].shape[0] < datasets[j].shape[0]:
                i, j = j, i
            panoramas.append([i])
            panoramas_i = [panoramas[-1]]

        # Map dataset i to panorama j.
        if len(panoramas_i) == 0:
            curr_ds = datasets[i]
            curr_ref = np.concatenate([datasets[p] for p in panoramas_j[0]])

            match = []
            base = 0
            for p in panoramas_j[0]:
                if i < p and (i, p) in matches:
                    match.extend([(a, b + base) for a, b in matches[(i, p)]])
                elif i > p and (p, i) in matches:
                    match.extend([(b, a + base) for a, b in matches[(p, i)]])
                base += datasets[p].shape[0]

            ds_ind = [a for a, _ in match]
            ref_ind = [b for _, b in match]

            bias = transform(
                curr_ds, curr_ref, ds_ind, ref_ind, sigma=sigma, batch_size=batch_size
            )
            datasets[i] = curr_ds + bias

            if expr_datasets:
                curr_ds = expr_datasets[i]
                curr_ref = vstack([expr_datasets[p] for p in panoramas_j[0]])
                bias = transform(
                    curr_ds,
                    curr_ref,
                    ds_ind,
                    ref_ind,
                    sigma=sigma,
                    cn=True,
                    batch_size=batch_size,
                )
                expr_datasets[i] = curr_ds + bias

            panoramas_j[0].append(i)

        # Map dataset j to panorama i.
        elif len(panoramas_j) == 0:
            curr_ds = datasets[j]
            curr_ref = np.concatenate([datasets[p] for p in panoramas_i[0]])

            match = []
            base = 0
            for p in panoramas_i[0]:
                if j < p and (j, p) in matches:
                    match.extend([(a, b + base) for a, b in matches[(j, p)]])
                elif j > p and (p, j) in matches:
                    match.extend([(b, a + base) for a, b in matches[(p, j)]])
                base += datasets[p].shape[0]

            ds_ind = [a for a, _ in match]
            ref_ind = [b for _, b in match]

            bias = transform(
                curr_ds, curr_ref, ds_ind, ref_ind, sigma=sigma, batch_size=batch_size
            )
            datasets[j] = curr_ds + bias

            if expr_datasets:
                curr_ds = expr_datasets[j]
                curr_ref = vstack([expr_datasets[p] for p in panoramas_i[0]])
                bias = transform(
                    curr_ds,
                    curr_ref,
                    ds_ind,
                    ref_ind,
                    sigma=sigma,
                    cn=True,
                    batch_size=batch_size,
                )
                expr_datasets[j] = curr_ds + bias

            panoramas_i[0].append(j)

        # Merge two panoramas together.
        else:
            curr_ds = np.concatenate([datasets[p] for p in panoramas_i[0]])
            curr_ref = np.concatenate([datasets[p] for p in panoramas_j[0]])

            # Find base indices into each panorama.
            base_i = 0
            for p in panoramas_i[0]:
                if p == i:
                    break
                base_i += datasets[p].shape[0]
            base_j = 0
            for p in panoramas_j[0]:
                if p == j:
                    break
                base_j += datasets[p].shape[0]

            # Find matching indices.
            match = []
            base = 0
            for p in panoramas_i[0]:
                if p == i and j < p and (j, p) in matches:
                    match.extend([(b + base, a + base_j) for a, b in matches[(j, p)]])
                elif p == i and j > p and (p, j) in matches:
                    match.extend([(a + base, b + base_j) for a, b in matches[(p, j)]])
                base += datasets[p].shape[0]
            base = 0
            for p in panoramas_j[0]:
                if p == j and i < p and (i, p) in matches:
                    match.extend([(a + base_i, b + base) for a, b in matches[(i, p)]])
                elif p == j and i > p and (p, i) in matches:
                    match.extend([(b + base_i, a + base) for a, b in matches[(p, i)]])
                base += datasets[p].shape[0]

            ds_ind = [a for a, _ in match]
            ref_ind = [b for _, b in match]

            # Apply transformation to entire panorama.
            bias = transform(
                curr_ds, curr_ref, ds_ind, ref_ind, sigma=sigma, batch_size=batch_size
            )
            curr_ds += bias
            base = 0
            for p in panoramas_i[0]:
                n_cells = datasets[p].shape[0]
                datasets[p] = curr_ds[base : (base + n_cells), :]
                base += n_cells

            if expr_datasets is not None:
                curr_ds = vstack([expr_datasets[p] for p in panoramas_i[0]])
                curr_ref = vstack([expr_datasets[p] for p in panoramas_j[0]])
                bias = transform(
                    curr_ds,
                    curr_ref,
                    ds_ind,
                    ref_ind,
                    sigma=sigma,
                    cn=True,
                    batch_size=batch_size,
                )
                curr_ds += bias
                base = 0
                for p in panoramas_i[0]:
                    n_cells = expr_datasets[p].shape[0]
                    expr_datasets[p] = curr_ds[base : (base + n_cells), :]
                    base += n_cells

            # Merge panoramas i and j and delete one.
            if panoramas_i[0] != panoramas_j[0]:
                panoramas_i[0] += panoramas_j[0]
                panoramas.remove(panoramas_j[0])

    return datasets


def interpret_alignments(
    datasets,
    expr_datasets,
    genes,
    verbose=VERBOSE,
    knn=KNN,
    approx=APPROX,
    alpha=ALPHA,
    n_permutations=None,
    metric="manhattan",
):
    if n_permutations is None:
        # Keep as an int so it can be used as a sample size below.
        n_permutations = len(genes) * 30

    alignments, matches = find_alignments(
        datasets, knn=knn, approx=approx, alpha=alpha, verbose=verbose, metric=metric
    )

    for i, j in alignments:
        # Compute average bias vector that aligns two datasets together.
        ds_i = expr_datasets[i]
        ds_j = expr_datasets[j]
        if i < j:
            match = matches[(i, j)]
        else:
            match = matches[(j, i)]
        i_ind = [a for a, _ in match]
        j_ind = [b for _, b in match]
        avg_bias = np.absolute(np.mean(ds_j[j_ind, :] - ds_i[i_ind, :], axis=0))

        # Construct null distribution and compute p-value.
        null_bias = (
            ds_j[np.random.randint(ds_j.shape[0], size=n_permutations), :]
            - ds_i[np.random.randint(ds_i.shape[0], size=n_permutations), :]
        )
        p = (
            np.sum(
                np.greater_equal(
                    np.absolute(np.tile(avg_bias, (n_permutations, 1))),
                    np.absolute(null_bias),
                ),
                axis=0,
                dtype=float,
            )
            + 1
        ) / (n_permutations + 1)

        print(">>>> Stats for alignment {}".format((i, j)))
        for k in range(len(p)):
            print("{}\t{}".format(genes[k], p[k]))
