{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "%matplotlib inline"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "\n# Above-average features in Diabetes\n\nExplore above-average attributes in the Diabetes dataset (Efron et al., 2004).\n\nHere we take some features correlated with disease progression, and look at the\ndistribution of that disease progression value when each of these features is\nabove average (in the code below, above the feature's median).\n\nThe most correlated features are:\n\n  - bmi: body mass index\n  - bp: average blood pressure\n  - s4: tch, total cholesterol / HDL\n  - s5: ltg, possibly log of serum triglycerides level\n  - s6: glu, blood sugar level\n\nThis kind of dataset analysis may not be a practical use of UpSet, but helps\nto illustrate the `UpSet.add_catplot` feature.\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "import pandas as pd\nfrom sklearn.datasets import load_diabetes\nfrom matplotlib import pyplot as plt\nfrom upsetplot import UpSet\n\n# Load the dataset's feature matrix into a DataFrame, one column per feature\ndiabetes = load_diabetes()\ndiabetes_df = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)\n\n# Rank features by Spearman correlation with the target (disease progression).\n# sort_values() is ascending, so the last five entries are the five features\n# with the largest signed correlation.\ncorrels = diabetes_df.corrwith(pd.Series(diabetes.target),\n                               method='spearman').sort_values()\ntop_features = correls.index[-5:]\n\n# Boolean indicator per top feature: is the value above that feature's median?\n# The '>' suffix keeps indicator names distinct from the raw feature columns.\ndiabetes_above_avg = (\n    (diabetes_df > diabetes_df.median(axis=0))[top_features]\n    .rename(columns=lambda x: x + '>')\n)\n\n# Append the indicator columns, make them the index of diabetes_df, and expose\n# the target (disease progression) as a 'progression' column.\ndiabetes_df = (\n    pd.concat([diabetes_df, diabetes_above_avg], axis=1)\n    .set_index(list(diabetes_above_avg.columns))\n    .assign(progression=diabetes.target)\n)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "# UpSet plot it! (default, horizontal orientation)\nupset = UpSet(diabetes_df, subset_size='count', intersection_plot_elements=3)\n# Add a strip-plot panel for the target and another for the bmi feature\nupset.add_catplot(value='progression', kind='strip', color='blue')\nupset.add_catplot(value='bmi', kind='strip', color='black')\nupset.plot()\nplt.title(\"UpSet with catplots, for orientation='horizontal'\")\nplt.show()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "# Same figure once more, this time laid out with orientation='vertical'\nupset = UpSet(\n    diabetes_df,\n    orientation='vertical',\n    subset_size='count',\n    intersection_plot_elements=3,\n)\n# Strip-plot panels: first the target, then the bmi feature\nupset.add_catplot(kind='strip', value='progression', color='blue')\nupset.add_catplot(kind='strip', value='bmi', color='black')\nupset.plot()\nplt.title(\"UpSet with catplots, for orientation='vertical'\")\nplt.show()"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.9.15"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}