from decimal import Decimal
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.neighbors import LocalOutlierFactor
def _density_based_sample(
    data: pd.DataFrame, coords: list, portion=None, size=None, seed=None
):
    """Down-sample ``data`` with density-dependent weights.

    Rows lying in dense regions of the ``coords`` space are less likely to be
    kept, which prevents over-plotting in dense regions and decreases
    plotting time.

    Parameters
    ----------
    data
        Table to sample from; it must already contain every column in ``coords``.
    coords
        Column names of the coordinates used to estimate the local density.
    portion
        Fraction of rows to keep. Only used when ``size`` is None.
    size
        Number of rows to keep. Takes precedence over ``portion``.
    seed
        Optional seed that makes the selection reproducible.

    Returns
    -------
    pd.DataFrame
        The selected rows of ``data``. When the requested size is not smaller
        than the number of rows, a copy of the complete table is returned.

    Raises
    ------
    ValueError
        If neither ``portion`` nor ``size`` is provided.
    """
    # coords should already exist in data, get them by column names list
    data_coords = data[coords]
    n_rows = data_coords.index.size

    # validate the request before running the (expensive) density estimation
    if size is None:
        if portion is None:
            raise ValueError("Either portion or size should be provided.")
        size = int(n_rows * portion)

    # Nothing to down-sample. Weighted sampling without replacement could not
    # serve this request anyway, because the densest row gets a weight of 0.
    if size >= n_rows:
        return data.copy()

    clf = LocalOutlierFactor(
        n_neighbors=20,
        algorithm="auto",
        leaf_size=30,
        metric="minkowski",
        p=2,
        metric_params=None,
        contamination=0.1,
    )
    clf.fit(data_coords)

    # original score is negative, the larger the denser
    density_score = clf.negative_outlier_factor_
    delta = density_score.max() - density_score.min()
    if np.isfinite(delta) and delta > 0:
        # density score to probability: the denser the less probability to be picked up
        probability_score = 1 - (density_score - density_score.min()) / delta
        probability_score = np.sqrt(probability_score)
    else:
        # every row has the same score (or the scores are not finite):
        # fall back to uniform weights instead of dividing by zero
        probability_score = np.ones(n_rows, dtype=float)
    probability_score = probability_score / probability_score.sum()

    # A private legacy RandomState is used for a given seed so that the
    # process-wide numpy random state is left untouched.
    rng = np.random if seed is None else np.random.RandomState(seed)
    selected_cell_index = rng.choice(
        data_coords.index, size=size, replace=False, p=probability_score
    )  # choice data based on density weights

    # return the down sampled data
    return data.reindex(selected_cell_index)
def _translate_coord_name(coord_name):
    """Turn a coordinate name into a label: upper-case, underscores shown as spaces."""
    words = coord_name.upper().split("_")
    return " ".join(words)
def _make_tiny_axis_label(ax, x, y, arrow_kws=None, fontsize=5):
    """Swap the regular axis decorations of ``ax`` for a tiny corner marker.

    Ticks and axis labels are cleared, ``sns.despine`` is called with
    ``left=True, bottom=True``, and two short arrows with text labels are
    drawn near the lower-left corner. All positions are given in axes
    coordinates (``transform=ax.transAxes``), i.e. they assume a [0, 1] range.

    Parameters
    ----------
    ax
        Axes to decorate.
    x, y
        Coordinate names; each is passed through ``_translate_coord_name``
        before being drawn next to the horizontal / vertical arrow.
    arrow_kws
        Optional overrides for the default arrow style
        (``width=0.003, linewidth=0, color="black"``).
    fontsize
        Font size of both labels.
    """
    # clean the axes decorations
    ax.set(xticks=[], yticks=[], xlabel=None, ylabel=None)
    sns.despine(ax=ax, left=True, bottom=True)

    arrow_style = dict(width=0.003, linewidth=0, color="black")
    if arrow_kws is not None:
        arrow_style.update(arrow_kws)

    corner = 0.06
    arrow_length = 0.06
    # vertical arrow first, then the horizontal one, both starting at the corner
    for dx, dy in ((0, arrow_length), (arrow_length, 0)):
        ax.arrow(corner, corner, dx, dy, **arrow_style, transform=ax.transAxes)

    # label of the horizontal arrow
    x_font = dict(
        fontsize=fontsize, horizontalalignment="left", verticalalignment="center"
    )
    ax.text(
        corner,
        0.03,
        _translate_coord_name(x),
        fontdict=x_font,
        transform=ax.transAxes,
    )

    # label of the vertical arrow, rotated by 90 degrees
    y_font = dict(
        fontsize=fontsize,
        rotation=90,
        rotation_mode="anchor",
        horizontalalignment="left",
        verticalalignment="center",
    )
    ax.text(
        0.03,
        corner,
        _translate_coord_name(y),
        fontdict=y_font,
        transform=ax.transAxes,
    )
def zoom_min_max(vmin, vmax, scale):
    """Scale the interval ``[vmin, vmax]`` by ``scale`` around its centre.

    The width becomes ``width * scale``; the change in width is split evenly
    between both ends. Returns the new ``(vmin, vmax)`` tuple.
    """
    span = vmax - vmin
    # half of the width change is applied to each end
    half_change = (span * scale - span) / 2
    lower = vmin - half_change
    upper = vmax + half_change
    return lower, upper
def zoom_ax(ax, zoom_scale, on="both"):
    """Zoom the limits of ``ax`` by ``zoom_scale`` using ``zoom_min_max``.

    Parameters
    ----------
    ax
        Axes whose limits are read and updated.
    zoom_scale
        Scale factor forwarded to ``zoom_min_max``.
    on
        Case-insensitive selector: ``"both"`` updates both axes, otherwise the
        x limits are updated when the string contains ``"x"`` and the y limits
        when it contains ``"y"``.
    """
    target = on.lower()
    zoom_all = target == "both"

    # both zoomed limits are computed up front from the current limits
    new_xlim = zoom_min_max(*ax.get_xlim(), zoom_scale)
    new_ylim = zoom_min_max(*ax.get_ylim(), zoom_scale)

    if zoom_all or "x" in target:
        ax.set_xlim(new_xlim)
    if zoom_all or "y" in target:
        ax.set_ylim(new_ylim)
def add_ax_box(ax, expend=0, **patch_kws):
    """Draw a rectangular outline around ``ax``.

    Parameters
    ----------
    ax
        Axes that receives the rectangle patch.
    expend
        Margin, in axes coordinates, by which the box extends beyond the
        axes area on every side. 0 (default) outlines the axes exactly.
    **patch_kws
        Overrides for the default patch style
        (``linewidth=1, edgecolor="k", facecolor="none"``), passed to
        ``matplotlib.patches.Rectangle``.

    Returns
    -------
    The same ``ax``, to allow chaining.

    NOTE(review): matplotlib normally clips patches to the axes area, so the
    part of an expanded box outside the axes may need ``clip_on=False`` in
    ``patch_kws`` to be visible - confirm against the callers.
    """
    import matplotlib.patches as patches

    _patch_kws = dict(linewidth=1, edgecolor="k", facecolor="none")
    _patch_kws.update(patch_kws)

    # The lower-left corner moves out by ``expend``; width and height have to
    # grow by twice that amount so the upper-right corner moves out as well
    # and the box stays centred on the axes.
    side = 1 + 2 * expend
    rect = patches.Rectangle(
        (0 - expend, 0 - expend),
        side,
        side,
        transform=ax.transAxes,
        **_patch_kws,
    )
    # Add the patch to the Axes
    ax.add_patch(rect)
    return ax
def tight_hue_range(hue_data, portion):
    """Select the narrowest value range that covers ``portion`` of the data.

    The finite values are summarised by their 0%, 1%, ..., 100% quantiles.
    Every window spanning ``portion`` of that quantile grid is compared and
    the one with the smallest value range is returned (the left-most one on
    ties).

    Parameters
    ----------
    hue_data
        Numeric values (e.g. a pandas Series or an array). NaN and infinite
        entries are ignored.
    portion
        Fraction of the data the range has to cover. It is rounded to whole
        percent and limited to the interval [0.01, 1].

    Returns
    -------
    (vmin, vmax)
        Lower and upper bound as floats; ``(nan, nan)`` when ``hue_data``
        contains no finite value.
    """
    values = np.asarray(hue_data, dtype=float)
    values = values[np.isfinite(values)]
    if values.size == 0:
        nan = float("nan")
        return nan, nan

    n_steps = 100
    # 101 grid points, so that both the minimum (0%) and the maximum (100%)
    # can be part of the selected range
    quantiles = np.quantile(values, np.linspace(0, 1, n_steps + 1))

    # round instead of truncating: 0.29 * 100 is 28.999999999999996 in floats
    window = int(round(portion * n_steps))
    window = min(max(window, 1), n_steps)

    # quantiles are non-decreasing, so the value range of a window is simply
    # its right edge minus its left edge
    widths = quantiles[window:] - quantiles[:-window]
    left = int(np.argmin(widths))

    return float(quantiles[left]), float(quantiles[left + window])