from concurrent.futures import ProcessPoolExecutor, as_completed
from collections import OrderedDict
import anndata
import joblib
import leidenalg
import numpy as np
import pandas as pd
from imblearn.ensemble import BalancedRandomForestClassifier
from natsort import natsorted
from scanpy.neighbors import Neighbors
from sklearn.metrics import (
confusion_matrix,
pairwise_distances,
balanced_accuracy_score,
adjusted_rand_score,
)
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.model_selection import cross_val_predict
from ..plot import categorical_scatter
[docs]def _r1_normalize(cmat):
"""
Adapted from https://github.com/SCCAF/sccaf/blob/develop/SCCAF/__init__.py
Normalize the confusion matrix based on the total number of cells in each class
x(i,j) = max(cmat(i,j)/diagnol(i),cmat(j,i)/diagnol(j))
confusion rate between i and j is defined by the maximum ratio i is confused as j or j is confused as i.
Input
cmat: the confusion matrix
return
-----
the normalized confusion matrix
"""
dmat = cmat
smat = np.diag(dmat) + 1 # in case some label has no correct prediction (0 in diag)
dim = cmat.shape[0]
xmat = np.zeros([dim, dim])
for i in range(dim):
for j in range(i + 1, dim):
xmat[i, j] = xmat[j, i] = max(dmat[i, j] / smat[j], dmat[j, i] / smat[i])
# scale matrix to 0-1
xmat = xmat / np.max(xmat)
return xmat
[docs]def _r2_normalize(cmat):
"""
Adapted from https://github.com/SCCAF/sccaf/blob/develop/SCCAF/__init__.py
Normalize the confusion matrix based on the total number of cells.
x(i,j) = max(cmat(i,j)+cmat(j,i)/N)
N is total number of cells analyzed.
Confusion rate between i and j is defined by the sum of i confused as j or j confused as i.
Then divide by total number of cells.
Input
cmat: the confusion matrix
return
-----
the normalized confusion matrix
"""
emat = np.copy(cmat)
s = np.sum(cmat)
emat = emat + emat.T
np.fill_diagonal(emat, 0)
emat = emat * 1.0 / s
# scale matrix to 0-1
emat = emat / np.max(emat)
return emat
[docs]def _leiden_runner(g, random_states, partition_type, **partition_kwargs):
"""run leiden clustering len(random_states) times with different random states,
return all clusters as a pd.DataFrame"""
results = []
for seed in random_states:
part = leidenalg.find_partition(
g, partition_type, seed=seed, **partition_kwargs
)
groups = np.array(part.membership)
groups = pd.Categorical(
values=groups.astype("U"),
categories=natsorted(np.unique(groups).astype("U")),
)
results.append(groups)
with warnings.catch_warnings():
warnings.filterwarnings('ignore')
result_df = pd.DataFrame(results, columns=random_states)
return result_df
[docs]def _split_train_test_per_group(x, y, frac, max_train, random_state):
"""Split train test for each cluster and make sure there are enough cells for train"""
y_series = pd.Series(y)
# split train test per group
train_idx = []
test_idx = []
outlier_idx = []
for cluster, sub_series in y_series.groupby(y_series):
if (cluster == -1) or (sub_series.size < 3):
outlier_idx += sub_series.index.tolist()
else:
n_train = max(1, min(max_train, int(sub_series.size * frac)))
is_train = sub_series.index.isin(
sub_series.sample(n_train, random_state=random_state).index
)
train_idx += sub_series.index[is_train].tolist()
test_idx += sub_series.index[~is_train].tolist()
x_train = x[train_idx]
y_train = y[train_idx]
x_test = x[test_idx]
y_test = y[test_idx]
return x_train, y_train, x_test, y_test
[docs]def single_supervise_evaluation(
clf, x_train, y_train, x_test, y_test, r1_norm_step=0.05, r2_norm_step=0.05
):
"""A single fit and merge cluster step"""
# fit model
clf.fit(x_train, y_train)
# calc accuracy
y_train_pred = clf.predict(x_train)
accuracy_train = balanced_accuracy_score(y_true=y_train, y_pred=y_train_pred)
print(f"Balanced accuracy on the training set: {accuracy_train:.3f}")
y_test_pred = clf.predict(x_test)
accuracy_test = balanced_accuracy_score(y_true=y_test, y_pred=y_test_pred)
print(f"Balanced accuracy on the hold-out set: {accuracy_test:.3f}")
# get confusion matrix
y_pred = clf.predict(x_test)
cmat = confusion_matrix(y_test, y_pred)
# normalize confusion matrix
r1_cmat = _r1_normalize(cmat)
r2_cmat = _r2_normalize(cmat)
m1 = np.max(r1_cmat)
if np.isnan(m1):
m1 = 1.0
m2 = np.max(r2_cmat)
cluster_map = {}
while (len(cluster_map) == 0) and (m1 > 0) and (m2 > 0):
m1 -= r1_norm_step
m2 -= r2_norm_step
# final binary matrix to calculate which clusters need to be merged
judge = np.maximum.reduce([(r1_cmat > m1), (r2_cmat > m2)])
if judge.sum() > 0:
rows, cols = np.where(judge)
edges = zip(rows.tolist(), cols.tolist())
g = nx.Graph()
g.add_edges_from(edges)
for comp in nx.connected_components(g):
to_label = comp.pop()
for remain in comp:
cluster_map[remain] = to_label
return clf, accuracy_test, cluster_map, cmat, r1_cmat, r2_cmat
[docs]class ConsensusClustering:
def __init__(
self,
model=None,
n_neighbors=25,
metric="euclidean",
min_cluster_size=10,
leiden_repeats=200,
leiden_resolution=1,
target_accuracy=0.95,
consensus_rate=0.7,
random_state=0,
train_frac=0.5,
train_max_n=500,
max_iter=50,
n_jobs=-1,
):
"""
Perform consensus clustering by multi-leiden clustering + supervised model evaluation.
Parameters
----------
model
A supervised ML model, if not provided, will use the BalancedRandomForestClassifier from imblearn
n_neighbors
K of the KNN graph
metric
metric of the KNN graph
min_cluster_size
Minimum final cluster size to report
consensus_rate
Cutoff for the initial cluster separation from multi-leiden run
leiden_repeats
Repeat leiden clustering with different random states this number of times
leiden_resolution
Resolution parameter of leiden clustering
random_state
Overall random state to assure reproducibility
train_frac
fraction of cells per cluster used in training supervised model
train_max_n
maximum number of cells per cluster used in training supervised model, this is to prevent some
large cluster dominant the training dataset. The actual cells used in training is the smaller one
of `train_max_n` and `train_frac * cluster_size`
max_iter
maximum iteration of the cluster merge process
n_jobs
number of cpus
"""
# input metrics
self.min_cluster_size = min_cluster_size
self.consensus_rate = consensus_rate # this prevents merging gradient clusters
self.leiden_repeats = leiden_repeats
self.leiden_resolution = leiden_resolution
self.random_state = random_state
self.n_jobs = n_jobs
self.n_neighbors = n_neighbors
self.knn_metric = metric
self.train_frac = train_frac
self.train_max_n = train_max_n
self.max_iter = max_iter
self.n_obs, self.n_pcs = None, None
self.X = None
self._neighbors = None
self.step_data = OrderedDict()
self.target_accuracy = target_accuracy
# multiple leiden clustering
self.leiden_result_df = None
self._multi_leiden_clusters = None
# model training and outlier rescue
self.supervise_model = model
self._label_with_leiden_outliers = None
self.label = None
self.label_proba = None
self.cv_predicted_label = None
self.final_accuracy = None
return
[docs] def add_data(self, x):
# just add X, but not doing computation
# use this for step-by-step computation
self.n_obs, self.n_pcs = x.shape
self.X = x
[docs] def fit_predict(self, x, leiden_kwds=None):
self.add_data(x)
# Construct KNN graph
print("Computing nearest neighbor graph")
self.compute_neighbors()
# repeat Leiden clustering with different random seeds
# summarize the results and determine initial clusters by hamming distance between leiden runs
print("Computing multiple clustering with different random seeds")
kwds = {}
if leiden_kwds is None:
leiden_kwds = kwds
else:
leiden_kwds.update(kwds)
self.multi_leiden_clustering(**leiden_kwds)
# merge the over clustering version by supervised learning
self.supervise_learning()
# assign outliers
self.final_evaluation()
[docs] def compute_neighbors(self):
"""Calculate KNN graph"""
# nearest neighbors graph
adata = anndata.AnnData(
X=None,
obs=pd.DataFrame([], index=[f"obs{i}" for i in range(self.n_obs)]),
var=pd.DataFrame([], index=[f"var{i}" for i in range(self.n_pcs)]),
)
adata.obsm["X_pca"] = self.X
# here neighbors should only use PCs
self._neighbors = Neighbors(adata=adata)
self._neighbors.compute_neighbors(
n_neighbors=self.n_neighbors,
knn=True,
n_pcs=self.n_pcs,
use_rep="X_pca",
method="umap",
metric=self.knn_metric,
random_state=self.random_state,
)
return
[docs] def multi_leiden_clustering(
self,
partition_type=None,
partition_kwargs=None,
use_weights=True,
n_iterations=-1,
):
"""Modified from scanpy, perform Leiden clustering multiple times with different random states"""
if self._neighbors is None:
raise ValueError(
"Run compute_neighbors first before multi_leiden_clustering"
)
# convert neighbors to igraph
g = self._neighbors.to_igraph()
# generate n different seeds for each single leiden partition
np.random.seed(self.random_state)
leiden_repeats = self.leiden_repeats
n_jobs = self.n_jobs
random_states = np.random.choice(
range(99999), size=leiden_repeats, replace=False
)
step = max(int(leiden_repeats / n_jobs), 10)
random_state_chunks = [
random_states[i: min(i + step, leiden_repeats)]
for i in range(0, leiden_repeats, step)
]
results = []
print(f"Repeating leiden clustering {leiden_repeats} times")
with ProcessPoolExecutor(max_workers=n_jobs) as executor:
future_dict = {}
for i, random_state_chunk in enumerate(random_state_chunks):
# flip to the default partition type if not over writen by the user
if partition_type is None:
partition_type = leidenalg.RBConfigurationVertexPartition
# prepare find_partition arguments as a dictionary, appending to whatever the user provided
# it needs to be this way as this allows for the accounting of a None resolution
# (in the case of a partition variant that doesn't take it on input)
if partition_kwargs is None:
partition_kwargs = {}
else:
if "seed" in partition_kwargs:
print(
"Warning: seed in the partition_kwargs will be ignored, use seed instead."
)
del partition_kwargs["seed"]
if use_weights:
partition_kwargs["weights"] = np.array(g.es["weight"]).astype(
np.float64
)
partition_kwargs["n_iterations"] = n_iterations
partition_kwargs["resolution_parameter"] = self.leiden_resolution
# clustering proper
future = executor.submit(
_leiden_runner,
g=g,
random_states=random_state_chunk,
partition_type=partition_type,
**partition_kwargs,
)
future_dict[future] = random_state_chunks
for future in as_completed(future_dict):
_ = future_dict[future]
try:
data = future.result()
results.append(data)
except Exception as exc:
print(f"_leiden_runner generated an exception: {exc}")
raise exc
total_result = pd.concat(results, axis=1, sort=True)
self.leiden_result_df = total_result
cluster_count = self.leiden_result_df.apply(lambda i: i.unique().size)
print(
f"Found {cluster_count.min()} - {cluster_count.max()} clusters, "
f"mean {cluster_count.mean():.1f}, std {cluster_count.std():.2f}"
)
# create a over-clustering version based on all the leiden runs
print("Summarizing multiple clustering results")
self._summarize_multi_leiden()
return
[docs] def _summarize_multi_leiden(self):
"""Summarize the multi_leiden results,
generate a raw cluster version simply based on the hamming distance
between cells and split cluster with cutoff (consensus_rate)"""
# data: row is leiden run, column is cell
data = self.leiden_result_df.T
# group cell into raw clusters if their hamming distance < 1 - consensus_rate
cur_cluster_id = 0
clusters = {}
while data.shape[1] > 1:
seed_cell = data.pop(data.columns[0])
distance = pairwise_distances(
X=data.T, Y=seed_cell.values[None, :], metric="hamming"
).ravel()
judge = distance < (1 - self.consensus_rate)
this_cluster_cells = [seed_cell.name] + data.columns[judge].to_list()
for cell in this_cluster_cells:
clusters[cell] = cur_cluster_id
data = data.loc[:, ~judge].copy()
cur_cluster_id += 1
if data.shape[1] == 1:
# if there is only one cell remain
clusters[data.columns[0]] = cur_cluster_id
clusters = pd.Series(clusters).sort_index()
# renumber clusters based on cluster size
counts = clusters.value_counts()
cluster_map = {c: i for i, c in enumerate(counts.index)}
clusters = clusters.map(cluster_map)
# renumber small clusters as -1
counts = clusters.value_counts()
small_clusters = counts[counts < self.min_cluster_size].index
clusters[clusters.isin(small_clusters)] = -1
print(
f"{(clusters != -1).sum()} cells assigned to {clusters.unique().size - 1} raw clusters"
)
print(f"{(clusters == -1).sum()} cells are multi-leiden outliers")
self._multi_leiden_clusters = clusters.values
return
[docs] def _create_model(self, n_estimators=1000):
"""Init default model"""
clf = BalancedRandomForestClassifier(
n_estimators=n_estimators,
criterion="gini",
max_depth=None,
min_samples_split=2,
min_samples_leaf=2,
min_weight_fraction_leaf=0.0,
max_features="auto",
max_leaf_nodes=None,
min_impurity_decrease=0.0,
bootstrap=True,
oob_score=False,
sampling_strategy="auto",
replacement=False,
n_jobs=self.n_jobs,
random_state=self.random_state,
verbose=0,
warm_start=False,
class_weight=None,
)
return clf
[docs] def supervise_learning(self):
"""Perform supervised learning and cluster merge process"""
if self._multi_leiden_clusters is None:
raise ValueError(
"Run multi_leiden_clustering first to get a "
"clustering assignment before run supervise_learning."
)
n_cluster = np.unique(
self._multi_leiden_clusters[self._multi_leiden_clusters != -1]
).size
if n_cluster == 1:
print(
"There is only one cluster except for outliers, can not train supervise model on that."
)
self.label = np.zeros(self.n_obs, dtype=int)
return
print(f"\n=== Start supervise model training and cluster merging ===")
x = self.X
cur_y = self._multi_leiden_clusters.copy()
score = None
step = 0.1
if self.supervise_model is None:
# create default model if no model provided
clf = self._create_model(n_estimators=500)
else:
clf = self.supervise_model
for cur_iter in range(1, self.max_iter + 1):
print(f"\n=== iteration {cur_iter} ===")
n_labels = np.unique(cur_y[cur_y != -1]).size
print(f"{n_labels} non-outlier labels")
if n_labels < 2:
print(f"Stop iteration because only {n_labels} cluster remain.")
break
x_train, y_train, x_test, y_test = _split_train_test_per_group(
x=x,
y=cur_y,
frac=self.train_frac,
max_train=self.train_max_n,
random_state=self.random_state
+ cur_iter, # every time train-test split got a different random state
)
(
clf,
score,
cluster_map,
cmat,
r1_cmat,
r2_cmat,
) = single_supervise_evaluation(
clf,
x_train,
y_train,
x_test,
y_test,
r1_norm_step=step,
r2_norm_step=step,
)
step = min(0.2, max(0.05, 2 * (self.target_accuracy - score)))
# save step data for plotting
self.step_data[cur_iter] = [
cur_y,
cmat,
r1_cmat,
r2_cmat,
cluster_map,
score,
step,
]
if score > self.target_accuracy:
print(
f"Stop iteration because current accuracy {score:.3f}"
f" > target accuracy {self.target_accuracy:.3f}."
)
break
# judge results
if len(cluster_map) > 0:
print(f"Merging {len(cluster_map)} clusters.")
cur_y = pd.Series(cur_y).apply(
lambda i: cluster_map[i] if i in cluster_map else i
)
# renumber labels from large to small
ordered_map = {
c: i for i, c in enumerate(cur_y[cur_y != -1].value_counts().index)
}
cur_y = (
pd.Series(cur_y)
.apply(lambda i: ordered_map[i] if i in ordered_map else i)
.values
)
else:
print("Stop iteration because there is no cluster to merge")
break
else:
print("Stop iteration because reaching maximum iteration.")
self._label_with_leiden_outliers = cur_y
self.label = cur_y
self.supervise_model = clf
self.final_accuracy = score
return
[docs] def final_evaluation(self):
"""Final evaluation of the model and assign outliers"""
print(f"\n=== Assign final labels ===")
# skip if there is only one cluster
n_cluster = len(set(self.label[self.label != -1]))
if n_cluster < 2:
print(
f"Skip final evaluation because only {n_cluster} cluster label exist."
)
# name all cluster as c0
self.label = np.zeros(self.label.size, dtype=int)
self.cv_predicted_label = [f"c{label}" for label in self.label]
self.label_proba = np.ones(self.label.size, dtype=int)
self.final_accuracy = 1
else:
# predict outliers
outlier_x = self.X[self.label == -1]
outlier_idx = np.where(self.label == -1)[0]
if len(outlier_idx) != 0:
outlier_predict = pd.Series(
self.supervise_model.predict(outlier_x), index=outlier_idx
)
for cell, pred_label in outlier_predict.items():
self.label[cell] = pred_label
print(
f"Assigned all the multi-leiden clustering outliers into clusters "
f"using the prediction model from final clustering version."
)
# final evaluation of non-outliers using cross val predict
final_predict_proba = cross_val_predict(
self.supervise_model,
self.X,
y=self.label,
method="predict_proba",
n_jobs=self.n_jobs,
verbose=0,
cv=10,
)
final_predict = pd.Series(np.argmax(final_predict_proba, axis=1))
final_cell_proba = pd.Series(np.max(final_predict_proba, axis=1))
final_acc = balanced_accuracy_score(self.label, final_predict.values)
print(f"Final ten-fold CV Accuracy on all the cells: {final_acc:.3f}")
self.cv_predicted_label = [f"c{label}" for label in final_predict]
self.label_proba = final_cell_proba.values
self.final_accuracy = final_acc
self.label = [f"c{label}" for label in self.label]
return
[docs] def save(self, output_path):
"""Save the model"""
joblib.dump(self, output_path)
[docs] def plot_leiden_cases(
self, coord_data, coord_base="umap", plot_size=3, dpi=300, plot_n_cases=4, s=3
):
"""Show some leiden runs with biggest different as measured by ARI"""
# choose some most different leiden runs by rand index
sample_cells = min(1000, self.leiden_result_df.shape[0])
sample_runs = min(30, self.leiden_result_df.shape[1])
use_df = self.leiden_result_df.sample(sample_cells).T.sample(sample_runs)
rand_index_rank = (
pd.DataFrame(
pairwise_distances(use_df, metric=adjusted_rand_score),
index=use_df.index,
columns=use_df.index,
)
.unstack()
.sort_values()
)
plot_cases = set()
for pairs in rand_index_rank[:10].index:
plot_cases.add(pairs[0])
plot_cases.add(pairs[1])
if len(plot_cases) > plot_n_cases:
break
plot_cases = list(plot_cases)[:plot_n_cases]
# plot
plot_data = coord_data.copy()
fig, axes = plt.subplots(
figsize=(plot_n_cases * plot_size, plot_size), ncols=plot_n_cases, dpi=dpi
)
for case, ax in zip(plot_cases, axes):
plot_data[f"Leiden {case}"] = self.leiden_result_df[case].values
categorical_scatter(
ax=ax, data=plot_data, coord_base=coord_base, hue=f"Leiden {case}", s=s
)
return fig, axes
[docs] def plot_before_after(self, coord_data, coord_base="umap", plot_size=3, dpi=300):
"""Plot the raw clusters from multi-leiden and final clusters after merge"""
if len(self.step_data) == 0:
print("No merge step to plot")
return
plot_data = coord_data.copy()
# initial clusters
cur_y, *_ = self.step_data[1]
fig, axes = plt.subplots(figsize=(2 * plot_size, plot_size), ncols=2, dpi=dpi)
ax = axes[0]
plot_data["cur_y"] = cur_y
_ = categorical_scatter(
data=plot_data,
ax=ax,
hue="cur_y",
coord_base=coord_base,
palette="tab20",
text_anno="cur_y",
show_legend=False,
)
ax.set(title="Initial Labels From\nMulti-leiden Clustering")
ax = axes[1]
plot_data["cur_y"] = self.label
_ = categorical_scatter(
data=plot_data,
ax=ax,
hue="cur_y",
coord_base=coord_base,
palette="tab20",
text_anno="cur_y",
show_legend=False,
)
ax.set(title="Final Labels After Merge")
return
[docs] def plot_steps(self, coord_data, coord_base="umap", plot_size=3, dpi=300):
"""Plot the supervised learning and merge steps"""
if len(self.step_data) == 0:
print("No merge step to plot")
return
plot_data = coord_data.copy()
self.plot_before_after(
coord_data, coord_base=coord_base, plot_size=plot_size, dpi=dpi
)
# initial clusters
for i, step in enumerate(sorted(self.step_data.keys())):
(
cur_y,
cmat,
r1_cmat,
r2_cmat,
cluster_map,
score,
step_size,
) = self.step_data[step]
fig, axes = plt.subplots(
figsize=(4 * plot_size, plot_size), ncols=4, dpi=dpi
)
ax = axes[0]
sns.heatmap(ax=ax, data=cmat, cbar=None, cmap="Reds")
ax.set(title="Confusion Matrix", ylabel=f"Step {step}")
ax = axes[1]
sns.heatmap(ax=ax, data=r1_cmat, cbar=None, cmap="Reds")
ax.set(title="R1 Norm.")
ax = axes[2]
sns.heatmap(ax=ax, data=r2_cmat, cbar=None, cmap="Reds")
ax.set(title="R2 Norm.")
ax = axes[3]
if len(cluster_map) > 0:
involved_clusters = set(
list(cluster_map.keys()) + list(cluster_map.values())
)
cur_y = pd.Series(cur_y, index=plot_data.index)
cur_y = cur_y.apply(lambda i: cluster_map[i] if i in cluster_map else i)
# if not involved, mark as -1
cur_y = cur_y.apply(lambda i: i if i in involved_clusters else -1)
plot_data["cur_y"] = cur_y
n_color = cur_y.unique().size
colors = list(sns.color_palette("tab10", n_color))
cmap = {
c: colors.pop() if c != -1 else (0.9, 0.9, 0.9)
for c in cur_y.unique()
}
categorical_scatter(
ax=ax,
data=plot_data,
coord_base=coord_base,
hue="cur_y",
palette=cmap,
s=3,
)
ax.set(title=f"Step {step} Cells After Merge")
else:
ax.axis("off")
return
[docs] def plot_merge_process(self, plot_size=3):
"""Plot the change of accuracy during merge"""
if len(self.step_data) == 0:
print("No merge step to plot")
return
plot_data = []
for step, (cur_y, *_, score, _) in self.step_data.items():
cur_n_cluster = len(set(cur_y)) - 1
plot_data.append(
{"Step": step, "# of clusters": cur_n_cluster, "Score": score}
)
plot_data = pd.DataFrame(plot_data)
fig, axes = plt.subplots(
figsize=(plot_size * 2, plot_size),
ncols=2,
dpi=300,
constrained_layout=True,
)
ax = axes[0]
sns.lineplot(ax=ax, data=plot_data, x="Step", y="# of clusters", color="gray")
ax = axes[1]
sns.lineplot(ax=ax, data=plot_data, x="Step", y="Score", color="gray")
ax.axhline(y=self.target_accuracy, color="red", linewidth=0.8, linestyle="--")
ax.set(
ylim=(plot_data["Score"].min() - 0.03, min(1, self.target_accuracy + 0.01))
)
return
[docs]def select_confusion_pairs(true_label, predicted_label, ratio_cutoff=0.001):
"""
Select cluster pairs that are confusing (ratio_cutoff) between true and predicted labels
Parameters
----------
true_label : true cell labels
predicted_label : predicted cell labels
ratio_cutoff : ratio of clusters cutoff to define confusion
Returns
-------
confused_pairs :
list of cluster pair tuples
"""
labels = pd.DataFrame({"true": true_label, "pred": predicted_label})
confusion_matrix = (
labels.groupby("true")["pred"].value_counts().unstack().fillna(0).astype(int)
)
row_sum = confusion_matrix.sum(axis=1)
row_norm = (confusion_matrix / row_sum.values[:, None]).unstack()
row_pairs = row_norm[row_norm > ratio_cutoff].reset_index().iloc[:, :2]
col_sum = confusion_matrix.sum(axis=0)
col_norm = (confusion_matrix / col_sum.values[None, :]).unstack()
col_pairs = col_norm[col_norm > ratio_cutoff].reset_index().iloc[:, :2]
include_pairs = set()
for _, s in pd.concat([row_pairs, col_pairs]).iterrows():
a, b = s.sort_values()
if a == b:
continue
include_pairs.add((a, b))
return list(include_pairs)