From bf80e805e7cbf73bff4e4b75df0904a0afb55b79 Mon Sep 17 00:00:00 2001
From: bastien-mva <bastien.batardiere@gmail.com>
Date: Mon, 3 Apr 2023 09:20:46 +0200
Subject: [PATCH 1/2] add vizualization with circles of confidence.

---
 pyPLNmodels/VEM.py    | 22 ++++++++++++++++------
 pyPLNmodels/_utils.py | 29 +++++++++++++++++++++++++++++
 tests/utils.py        | 21 ++++++++++++++-------
 3 files changed, 59 insertions(+), 13 deletions(-)

diff --git a/pyPLNmodels/VEM.py b/pyPLNmodels/VEM.py
index 0e210990..2dcbcc98 100644
--- a/pyPLNmodels/VEM.py
+++ b/pyPLNmodels/VEM.py
@@ -22,6 +22,7 @@ from ._utils import (
     check_parameters_shape,
     extract_cov_offsets_offsetsformula,
     nice_string_of_dict,
+    plot_ellipse,
 )
 
 if torch.cuda.is_available():
@@ -663,7 +664,9 @@ class _PLNPCA(_PLN):
     def latent_variables(self):
         return torch.matmul(self._M, self._C.T).detach()
 
-    def get_projected_latent_variables(self, nb_dim):
+    def get_projected_latent_variables(self, nb_dim=None):
+        if nb_dim is None:
+            nb_dim = self._rank
         if nb_dim > self._rank:
             raise AttributeError(
                 f"The number of dimension {nb_dim} is larger than the rank {self._rank}"
@@ -671,7 +674,9 @@ class _PLNPCA(_PLN):
         ortho_C = torch.linalg.qr(self._C, "reduced")[0]
         return torch.mm(self.latent_variables, ortho_C[:, :nb_dim]).detach()
 
-    def get_pca_projected_latent_variables(self, nb_dim):
+    def get_pca_projected_latent_variables(self, nb_dim=None):
+        if nb_dim is None:
+            nb_dim = self.rank
         pca = PCA(n_components=nb_dim)
         return pca.fit_transform(self.latent_variables.cpu())
 
@@ -689,12 +694,17 @@ class _PLNPCA(_PLN):
         return self._C
 
     def viz(self, ax=None, color=None, label=None, label_of_colors=None):
+        if self._rank != 2:
+            raise RuntimeError("Can not perform visualization for rank != 2.")
         if ax is None:
             ax = plt.gca()
-        proj_variables = self.get_projected_latent_variables(nb_dim=2)
-        x = proj_variables[:, 0].cpu()
-        y = proj_variables[:, 1].cpu()
-        sns.scatterplot(x=x, y=y, hue=color, ax=ax)
+        proj_variables = self.get_projected_latent_variables()
+        xs = proj_variables[:, 0].cpu().numpy()
+        ys = proj_variables[:, 1].cpu().numpy()
+        sns.scatterplot(x=xs, y=ys, hue=color, ax=ax)
+        covariances = torch.diag_embed(self._S**2).detach()
+        for i in range(covariances.shape[0]):
+            plot_ellipse(xs[i], ys[i], cov=covariances[i], ax=ax)
         return ax
 
 
diff --git a/pyPLNmodels/_utils.py b/pyPLNmodels/_utils.py
index b2b60638..6255aedf 100644
--- a/pyPLNmodels/_utils.py
+++ b/pyPLNmodels/_utils.py
@@ -6,6 +6,9 @@ import numpy as np
 import torch
 import torch.linalg as TLA
 import pandas as pd
+from matplotlib.patches import Ellipse
+import matplotlib.transforms as transforms
+
 
 torch.set_default_dtype(torch.float64)
 
@@ -388,3 +391,29 @@ def nice_string_of_dict(dictionnary):
             return_string += f"{str(element):>10}"
         return_string += "\n"
     return return_string
+
+
+def plot_ellipse(mean_x, mean_y, cov, ax):
+    pearson = cov[0, 1] / np.sqrt(cov[0, 0] * cov[1, 1])
+    ell_radius_x = np.sqrt(1 + pearson)
+    ell_radius_y = np.sqrt(1 - pearson)
+    ellipse = Ellipse(
+        (0, 0),
+        width=ell_radius_x * 2,
+        height=ell_radius_y * 2,
+        linestyle="--",
+        alpha=0.1,
+    )
+
+    scale_x = np.sqrt(cov[0, 0])
+    scale_y = np.sqrt(cov[1, 1])
+    transf = (
+        transforms.Affine2D()
+        .rotate_deg(45)
+        .scale(scale_x, scale_y)
+        .translate(mean_x, mean_y)
+    )
+
+    ellipse.set_transform(transf + ax.transData)
+    ax.add_patch(ellipse)
+    return pearson
diff --git a/tests/utils.py b/tests/utils.py
index 51bd0d49..ea97306f 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -23,7 +23,7 @@ def get_simulated_data():
     return Y, covariates, O, true_Sigma, true_beta
 
 
-def get_real_data(take_oaks=True, max_class=5, max_n=200, max_dim=100):
+def get_real_data(take_oaks=True, max_class=5, max_n=500, max_dim=20):
     if take_oaks is True:
         Y = pd.read_csv("../example_data/real_data/oaks_counts.csv")
         n, p = Y.shape
@@ -32,21 +32,28 @@ def get_real_data(take_oaks=True, max_class=5, max_n=200, max_dim=100):
         return Y, covariates, O
     else:
         data = scanpy.read_h5ad(
-            "../example_data/real_data/2k_cell_per_study_10studies.h5ad"
+            "example_data/real_data/2k_cell_per_study_10studies.h5ad"
         )
         Y = data.X.toarray()[:max_n]
-        GT = data.obs["standard_true_celltype_v5"][:max_n]
+        GT_name = data.obs["standard_true_celltype_v5"][:max_n]
         le = LabelEncoder()
-        GT = le.fit_transform(GT)
+        GT = le.fit_transform(GT_name)
         filter = GT < max_class
-        GT = GT[filter]
-        Y = Y[filter]
+        unique, index = np.unique(GT, return_counts=True)
+        enough_elem = index > 15
+        classes_with_enough_elem = unique[enough_elem]
+        filter_bis = np.isin(GT, classes_with_enough_elem)
+        mask = filter * filter_bis
+        GT = GT[mask]
+        GT_name = GT_name[mask]
+        Y = Y[mask]
+        GT = le.fit_transform(GT)
         not_only_zeros = np.sum(Y, axis=0) > 0
         Y = Y[:, not_only_zeros]
         var = np.var(Y, axis=0)
         most_variables = np.argsort(var)[-max_dim:]
         Y = Y[:, most_variables]
-        return Y, GT
+        return Y, GT, list(GT_name.values.__array__())
 
 
 def MSE(t):
-- 
GitLab


From 47ffa5dd036388ba4d6a4ffe9506c88ceee052de Mon Sep 17 00:00:00 2001
From: bastien-mva <bastien.batardiere@gmail.com>
Date: Mon, 3 Apr 2023 09:24:23 +0200
Subject: [PATCH 2/2] change versions

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 9b07566b..19511c94 100644
--- a/setup.py
+++ b/setup.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 from setuptools import setup, find_packages
 
-VERSION = "0.0.33"
+VERSION = "0.0.34"
 
 with open("README.md", "r") as fh:
     long_description = fh.read()
-- 
GitLab