{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "\n# Mixture Model\nThis notebook contains the code for a simple implementation of the Leaspy Mixture model on synthetic data.\nBefore implementing the model take a look at the relevant mathematical framework in the user guide.\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The following imports are required libraries for numerical computation, data manipulation, and visualization.\n\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import os\n\nimport matplotlib.pyplot as plt\nimport numpy as np\nimport pandas as pd\nimport torch\n\nimport leaspy\nfrom leaspy.io.data import Data, Dataset" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This toy example is part of a simulation study, carried out by Sofia Kaisaridi that will be included in\nan article to be submitted in a biostatistics journal. The dataset contains 1000 individuals each with 6 visits and 6 scores.\n\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "leaspy_root = os.path.dirname(leaspy.__file__)\n\ndata_path = os.path.join(leaspy_root, \"datasets/data/simulated_data_for_mixture.csv\")\n\nall_data = pd.read_csv(data_path, sep=\";\", decimal=\",\")\nall_data[\"ID\"] = all_data[\"ID\"].ffill()\nall_data = all_data.set_index([\"ID\", \"TIME\"])\nall_data.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We load the Mixture Model from the leaspy library and transform the dataset in a leaspy-compatible form with the built-in functions.\n\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "from leaspy.models import LogisticMultivariateMixtureModel\n\nleaspy_data = Data.from_dataframe(all_data)\nleaspy_dataset = Dataset(leaspy_data)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Then we fit a model with 3 clusters and 2 sources. Note that we have an extra argument `n_clusters` than the\nstandard model that has to be specified in order for the mixture model to run.\n\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "model = LogisticMultivariateMixtureModel(\n name=\"multi\",\n source_dimension=2,\n dimension=6,\n n_clusters=3,\n obs_models=\"gaussian-diagonal\",\n)\n\nmodel.fit(leaspy_dataset, \"mcmc_saem\", seed=1312, n_iter=100, progress_bar=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "First we take a look in the population parameters.\nWith the mixture model we obtain separate values for the `tau_mean`, `xi_mean` and the `sources_mean` for each cluster,\nas well as the cluster probabilities (`probs`).\n\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "print(model.parameters)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Then we can also retrieve the individual parameters and the posteriors probabilities of cluster membership.\nWe then consider that the cluster label is the cluster with the biggest probability.\n\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "from torch.distributions import Normal\n\n\ndef get_ip(df_leaspy, model):\n \"\"\"\n leaspy_data : the dataframe with the correct indexing\n leaspy_mixture : the leaspy object after the fit\n \"\"\"\n ip = pd.DataFrame(df_leaspy.index.get_level_values(\"ID\").unique(), columns=[\"ID\"])\n\n ip[[\"tau\"]] = model.state[\"tau\"]\n ip[[\"xi\"]] = model.state[\"xi\"]\n ip[[\"sources_0\"]] = model.state[\"sources\"][:, 0].cpu().numpy().reshape(-1, 1)\n ip[[\"sources_1\"]] = model.state[\"sources\"][:, 1].cpu().numpy().reshape(-1, 1)\n\n params = model.parameters\n\n probs = params[\"probs\"]\n\n # Number of individuals and clusters\n n = len(ip)\n k = len(probs)\n\n means = {\n \"tau\": params[\"tau_mean\"], # shape: (2,)\n \"xi\": params[\"xi_mean\"],\n \"sources_0\": params[\"sources_mean\"][0, :],\n \"sources_1\": params[\"sources_mean\"][1, :],\n }\n stds = {\n \"tau\": params[\"tau_std\"],\n \"xi\": params[\"xi_std\"],\n \"sources_0\": torch.tensor(1),\n \"sources_1\": torch.tensor(1),\n }\n\n stds[\"sources_0\"] = stds[\"sources_0\"].repeat(k)\n stds[\"sources_1\"] = stds[\"sources_1\"].repeat(k)\n\n values = {\n \"tau\": torch.tensor(ip[\"tau\"].values),\n \"xi\": torch.tensor(ip[\"xi\"].values),\n \"sources_0\": torch.tensor(ip[\"sources_0\"].values),\n \"sources_1\": torch.tensor(ip[\"sources_1\"].values),\n }\n\n # Compute log-likelihoods for each variable\n log_likelihoods = torch.zeros((n, k))\n\n for var in [\"tau\", \"xi\", \"sources_0\", \"sources_1\"]:\n x = torch.tensor(ip[var].values)\n\n for cluster in range(k):\n dist = Normal(means[var][cluster], stds[var][cluster])\n log_likelihoods[:, cluster] += dist.log_prob(x)\n\n # Add log-priors\n log_priors = torch.log(probs)\n log_posteriors = log_likelihoods + log_priors\n\n # Normalize using logsumexp\n log_sum = torch.logsumexp(log_posteriors, dim=1, keepdim=True)\n responsibilities = torch.exp(log_posteriors - log_sum)\n\n for i in range(responsibilities.shape[1]):\n ip[f\"prob_cluster_{i}\"] = responsibilities[:, i].numpy()\n\n # Automatically find all probability columns\n prob_cols = [col for col in ip.columns if col.startswith(\"prob_cluster_\")]\n\n # Assign the most likely cluster\n ip[\"cluster_label\"] = ip[prob_cols].values.argmax(axis=1)\n\n return ip\n\n\nip = get_ip(all_data, model)\nip.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We produce the population progression plots. We separate the model\nparameters for each cluster and we store them to a dedicated item.\n\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "from leaspy.io.outputs import IndividualParameters\n\nparameters = model.parameters\nnumber_of_sources = 2\n\n# cluster 0\nmean_xi = parameters[\"xi_mean\"].numpy()[0]\nmean_tau = parameters[\"tau_mean\"].numpy()[0]\nmean_source = parameters[\"sources_mean\"].numpy()[0, 0].tolist()\nmean_sources = [mean_source] * number_of_sources\n\nparameters_0 = {\"xi\": mean_xi, \"tau\": mean_tau, \"sources\": mean_sources}\n\nip_0 = IndividualParameters()\nip_0.add_individual_parameters(\"average\", parameters_0)\n\n# cluster 1\nmean_xi = parameters[\"xi_mean\"].numpy()[1]\nmean_tau = parameters[\"tau_mean\"].numpy()[1]\nmean_source = parameters[\"sources_mean\"].numpy()[0, 1].tolist()\nmean_sources = [mean_source] * number_of_sources\n\nparameters_1 = {\"xi\": mean_xi, \"tau\": mean_tau, \"sources\": mean_sources}\n\nip_1 = IndividualParameters()\nip_1.add_individual_parameters(\"average\", parameters_1)\n\n# cluster 2\nmean_xi = parameters[\"xi_mean\"].numpy()[2]\nmean_tau = parameters[\"tau_mean\"].numpy()[2]\nmean_source = parameters[\"sources_mean\"].numpy()[0, 2].tolist()\nnumber_of_sources = 2\nmean_sources = [mean_source] * number_of_sources\n\nparameters_2 = {\"xi\": mean_xi, \"tau\": mean_tau, \"sources\": mean_sources}\n\nip_2 = IndividualParameters()\nip_2.add_individual_parameters(\"average\", parameters_2)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We separate the scores and we plot two graphs for clarity.\nEach cluster is represented with a different colour. Each score is\nrepresented with a different linestyle. In the first graph we have the\nfirst 3 scores and in the second graph we have the remaining 3 scores.\nIn each graph we plot all the clusters.\n\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "from matplotlib.lines import Line2D\n\ntimepoints = np.linspace(15, 115, 105)\nvalues_0 = model.estimate({\"average\": timepoints}, ip_0)\nvalues_1 = model.estimate({\"average\": timepoints}, ip_1)\nvalues_2 = model.estimate({\"average\": timepoints}, ip_2)\n\n# A. Scores 1-3\n\nplt.figure(figsize=(7, 5))\nplt.ylim(0, 1)\nlines = [\n \"-\",\n \"--\",\n \":\",\n]\n## cluster 0\nfor ls, name, val in zip(lines, model.features, values_0[\"average\"][:, 0:3].T):\n plt.plot(timepoints, val, label=f\"cluster_0_{name}\", c=\"red\", linewidth=3, ls=ls)\n\n## cluster 1\nfor ls, name, val in zip(lines, model.features, values_1[\"average\"][:, 0:3].T):\n plt.plot(timepoints, val, label=f\"cluster_1_{name}\", c=\"blue\", linewidth=3, ls=ls)\n\n## cluster 2\nfor ls, name, val in zip(lines, model.features, values_2[\"average\"][:, 0:3].T):\n plt.plot(timepoints, val, label=f\"cluster_2_{name}\", c=\"green\", linewidth=3, ls=ls)\n\n## legends\ncluster_legend = [\n Line2D([0], [0], color=\"red\", linewidth=3, label=\"cluster_0\"),\n Line2D([0], [0], color=\"blue\", linewidth=3, label=\"cluster_1\"),\n Line2D([0], [0], color=\"green\", linewidth=3, label=\"cluster_2\"),\n]\nlegend1 = plt.legend(handles=cluster_legend, loc=\"upper left\", prop={\"size\": 12})\n\ncolor_legend = [\n Line2D([0], [0], linestyle=lines[0], color=\"black\", linewidth=3, label=\"score_1\"),\n Line2D([0], [0], linestyle=lines[1], color=\"black\", linewidth=3, label=\"score_2\"),\n Line2D([0], [0], linestyle=lines[2], color=\"black\", linewidth=3, label=\"score_3\"),\n]\nlegend2 = plt.legend(\n handles=color_legend, loc=\"center left\", title_fontsize=14, prop={\"size\": 12}\n)\n\nplt.gca().add_artist(legend1)\n\nplt.xlim(min(timepoints), max(timepoints))\nplt.xlabel(\"Reparametrized age\", fontsize=14)\nplt.ylabel(\"Normalized feature value\", fontsize=14)\nplt.title(\"scores 1-3\")\nplt.suptitle(\"Population progression\", fontsize=18)\nplt.tight_layout()\nplt.show()\n\n# B. Scores 4-6\n\nplt.figure(figsize=(7, 5))\nplt.ylim(0, 1)\nlines = [\n \"-\",\n \"--\",\n \":\",\n]\n## cluster 0\nfor ls, name, val in zip(lines, model.features, values_0[\"average\"][:, 3:6].T):\n plt.plot(timepoints, val, label=f\"cluster_0_{name}\", c=\"red\", linewidth=3, ls=ls)\n\n## cluster 1\nfor ls, name, val in zip(lines, model.features, values_1[\"average\"][:, 3:6].T):\n plt.plot(timepoints, val, label=f\"cluster_1_{name}\", c=\"blue\", linewidth=3, ls=ls)\n\n## cluster 2\nfor ls, name, val in zip(lines, model.features, values_2[\"average\"][:, 3:6].T):\n plt.plot(timepoints, val, label=f\"cluster_2_{name}\", c=\"green\", linewidth=3, ls=ls)\n\n## legends\ncluster_legend = [\n Line2D([0], [0], color=\"red\", linewidth=3, label=\"cluster_0\"),\n Line2D([0], [0], color=\"blue\", linewidth=3, label=\"cluster_1\"),\n Line2D([0], [0], color=\"green\", linewidth=3, label=\"cluster_2\"),\n]\nlegend1 = plt.legend(handles=cluster_legend, loc=\"upper left\", prop={\"size\": 12})\n\ncolor_legend = [\n Line2D([0], [0], linestyle=lines[0], color=\"black\", linewidth=3, label=\"score_4\"),\n Line2D([0], [0], linestyle=lines[1], color=\"black\", linewidth=3, label=\"score_5\"),\n Line2D([0], [0], linestyle=lines[2], color=\"black\", linewidth=3, label=\"score_6\"),\n]\nlegend2 = plt.legend(\n handles=color_legend, loc=\"lower right\", title_fontsize=14, prop={\"size\": 12}\n)\n\nplt.gca().add_artist(legend1)\n\nplt.xlim(min(timepoints), max(timepoints))\nplt.xlabel(\"Reparametrized age\", fontsize=14)\nplt.ylabel(\"Normalized feature value\", fontsize=14)\nplt.title(\"scores 4-6\")\nplt.suptitle(\"Population progression\", fontsize=18)\nplt.tight_layout()\nplt.show()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.12" } }, "nbformat": 4, "nbformat_minor": 0 }