From bf80e805e7cbf73bff4e4b75df0904a0afb55b79 Mon Sep 17 00:00:00 2001 From: bastien-mva <bastien.batardiere@gmail.com> Date: Mon, 3 Apr 2023 09:20:46 +0200 Subject: [PATCH 1/2] add vizualization with circles of confidence. --- pyPLNmodels/VEM.py | 22 ++++++++++++++++------ pyPLNmodels/_utils.py | 29 +++++++++++++++++++++++++++++ tests/utils.py | 21 ++++++++++++++------- 3 files changed, 59 insertions(+), 13 deletions(-) diff --git a/pyPLNmodels/VEM.py b/pyPLNmodels/VEM.py index 0e210990..2dcbcc98 100644 --- a/pyPLNmodels/VEM.py +++ b/pyPLNmodels/VEM.py @@ -22,6 +22,7 @@ from ._utils import ( check_parameters_shape, extract_cov_offsets_offsetsformula, nice_string_of_dict, + plot_ellipse, ) if torch.cuda.is_available(): @@ -663,7 +664,9 @@ class _PLNPCA(_PLN): def latent_variables(self): return torch.matmul(self._M, self._C.T).detach() - def get_projected_latent_variables(self, nb_dim): + def get_projected_latent_variables(self, nb_dim=None): + if nb_dim is None: + nb_dim = self._rank if nb_dim > self._rank: raise AttributeError( f"The number of dimension {nb_dim} is larger than the rank {self._rank}" @@ -671,7 +674,9 @@ class _PLNPCA(_PLN): ortho_C = torch.linalg.qr(self._C, "reduced")[0] return torch.mm(self.latent_variables, ortho_C[:, :nb_dim]).detach() - def get_pca_projected_latent_variables(self, nb_dim): + def get_pca_projected_latent_variables(self, nb_dim=None): + if nb_dim is None: + nb_dim = self.rank pca = PCA(n_components=nb_dim) return pca.fit_transform(self.latent_variables.cpu()) @@ -689,12 +694,17 @@ class _PLNPCA(_PLN): return self._C def viz(self, ax=None, color=None, label=None, label_of_colors=None): + if self._rank != 2: + raise RuntimeError("Can not perform visualization for rank != 2.") if ax is None: ax = plt.gca() - proj_variables = self.get_projected_latent_variables(nb_dim=2) - x = proj_variables[:, 0].cpu() - y = proj_variables[:, 1].cpu() - sns.scatterplot(x=x, y=y, hue=color, ax=ax) + proj_variables = self.get_projected_latent_variables() + xs = proj_variables[:, 0].cpu().numpy() + ys = proj_variables[:, 1].cpu().numpy() + sns.scatterplot(x=xs, y=ys, hue=color, ax=ax) + covariances = torch.diag_embed(self._S**2).detach() + for i in range(covariances.shape[0]): + plot_ellipse(xs[i], ys[i], cov=covariances[i], ax=ax) return ax diff --git a/pyPLNmodels/_utils.py b/pyPLNmodels/_utils.py index b2b60638..6255aedf 100644 --- a/pyPLNmodels/_utils.py +++ b/pyPLNmodels/_utils.py @@ -6,6 +6,9 @@ import numpy as np import torch import torch.linalg as TLA import pandas as pd +from matplotlib.patches import Ellipse +import matplotlib.transforms as transforms + torch.set_default_dtype(torch.float64) @@ -388,3 +391,29 @@ def nice_string_of_dict(dictionnary): return_string += f"{str(element):>10}" return_string += "\n" return return_string + + +def plot_ellipse(mean_x, mean_y, cov, ax): + pearson = cov[0, 1] / np.sqrt(cov[0, 0] * cov[1, 1]) + ell_radius_x = np.sqrt(1 + pearson) + ell_radius_y = np.sqrt(1 - pearson) + ellipse = Ellipse( + (0, 0), + width=ell_radius_x * 2, + height=ell_radius_y * 2, + linestyle="--", + alpha=0.1, + ) + + scale_x = np.sqrt(cov[0, 0]) + scale_y = np.sqrt(cov[1, 1]) + transf = ( + transforms.Affine2D() + .rotate_deg(45) + .scale(scale_x, scale_y) + .translate(mean_x, mean_y) + ) + + ellipse.set_transform(transf + ax.transData) + ax.add_patch(ellipse) + return pearson diff --git a/tests/utils.py b/tests/utils.py index 51bd0d49..ea97306f 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -23,7 +23,7 @@ def get_simulated_data(): return Y, covariates, O, true_Sigma, true_beta -def get_real_data(take_oaks=True, max_class=5, max_n=200, max_dim=100): +def get_real_data(take_oaks=True, max_class=5, max_n=500, max_dim=20): if take_oaks is True: Y = pd.read_csv("../example_data/real_data/oaks_counts.csv") n, p = Y.shape @@ -32,21 +32,28 @@ def get_real_data(take_oaks=True, max_class=5, max_n=200, max_dim=100): return Y, covariates, O else: data = scanpy.read_h5ad( - "../example_data/real_data/2k_cell_per_study_10studies.h5ad" + "example_data/real_data/2k_cell_per_study_10studies.h5ad" ) Y = data.X.toarray()[:max_n] - GT = data.obs["standard_true_celltype_v5"][:max_n] + GT_name = data.obs["standard_true_celltype_v5"][:max_n] le = LabelEncoder() - GT = le.fit_transform(GT) + GT = le.fit_transform(GT_name) filter = GT < max_class - GT = GT[filter] - Y = Y[filter] + unique, index = np.unique(GT, return_counts=True) + enough_elem = index > 15 + classes_with_enough_elem = unique[enough_elem] + filter_bis = np.isin(GT, classes_with_enough_elem) + mask = filter * filter_bis + GT = GT[mask] + GT_name = GT_name[mask] + Y = Y[mask] + GT = le.fit_transform(GT) not_only_zeros = np.sum(Y, axis=0) > 0 Y = Y[:, not_only_zeros] var = np.var(Y, axis=0) most_variables = np.argsort(var)[-max_dim:] Y = Y[:, most_variables] - return Y, GT + return Y, GT, list(GT_name.values.__array__()) def MSE(t): -- GitLab From 47ffa5dd036388ba4d6a4ffe9506c88ceee052de Mon Sep 17 00:00:00 2001 From: bastien-mva <bastien.batardiere@gmail.com> Date: Mon, 3 Apr 2023 09:24:23 +0200 Subject: [PATCH 2/2] change versions --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 9b07566b..19511c94 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- from setuptools import setup, find_packages -VERSION = "0.0.33" +VERSION = "0.0.34" with open("README.md", "r") as fh: long_description = fh.read() -- GitLab