{
  "nbformat_minor": 0, 
  "nbformat": 4, 
  "cells": [
    {
      "execution_count": null, 
      "cell_type": "code", 
      "source": [
        "%matplotlib inline"
      ], 
      "outputs": [], 
      "metadata": {
        "collapsed": false
      }
    }, 
    {
      "source": [
        "\n# Above-average features in Boston\n\n\nExplore above-average neighborhood characteristics in the Boston dataset.\n\nHere we take some features correlated with house price, and look at the\ndistribution of median house price when each of these features is above\naverage (here, above its median).\n\nThe most correlated features are:\n\n- **ZN**: proportion of residential land zoned for lots over 25,000 sq.ft.\n- **CHAS**: Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)\n- **RM**: average number of rooms per dwelling\n- **DIS**: weighted distances to five Boston employment centres\n- **B**: $1000(B_k - 0.63)^2$, where $B_k$ is the proportion of Black residents by town\n\nThis kind of dataset analysis may not be a practical use of UpSet, but helps\nto illustrate the `UpSet.add_catplot` feature.\n\n"
      ], 
      "cell_type": "markdown", 
      "metadata": {}
    }, 
    {
      "execution_count": null, 
      "cell_type": "code", 
      "source": [
        "import pandas as pd\n",
        "from sklearn.datasets import load_boston\n",
        "from matplotlib import pyplot as plt\n",
        "from upsetplot import UpSet\n",
        "\n",
        "# Build a DataFrame with one column per Boston feature\n",
        "boston = load_boston()\n",
        "boston_df = pd.DataFrame(boston.data, columns=boston.feature_names)\n",
        "\n",
        "# Rank features by Spearman correlation with the target (median house\n",
        "# value), ascending, and keep the last five, i.e. the highest-ranked\n",
        "target = pd.Series(boston.target)\n",
        "correls = boston_df.corrwith(target, method='spearman').sort_values()\n",
        "top_features = correls.index[-5:]\n",
        "\n",
        "# Boolean mask: is each top feature above its column median?\n",
        "# Mask columns carry the feature name plus a '>' suffix\n",
        "boston_above_avg = (\n",
        "    boston_df.gt(boston_df.median(axis=0))[top_features]\n",
        "    .rename(columns=lambda name: name + '>')\n",
        ")\n",
        "\n",
        "# Move the mask columns into the index of boston_df, then add the target\n",
        "# (median house value) as the `median_value` column\n",
        "boston_df = (\n",
        "    pd.concat([boston_df, boston_above_avg], axis=1)\n",
        "    .set_index(list(boston_above_avg.columns))\n",
        "    .assign(median_value=boston.target)\n",
        ")"
      ], 
      "outputs": [], 
      "metadata": {
        "collapsed": false
      }
    }, 
    {
      "execution_count": null, 
      "cell_type": "code", 
      "source": [
        "# UpSet plot it! (default orientation)\n",
        "upset = UpSet(boston_df, subset_size='count', intersection_plot_elements=3)\n",
        "\n",
        "# Register one strip-type catplot per value column, in this order\n",
        "for column, colour in [('median_value', 'blue'), ('AGE', 'black')]:\n",
        "    upset.add_catplot(value=column, kind='strip', color=colour)\n",
        "\n",
        "upset.plot()\n",
        "plt.title(\"UpSet with catplots, for orientation='horizontal'\")\n",
        "plt.show()"
      ], 
      "outputs": [], 
      "metadata": {
        "collapsed": false
      }
    }, 
    {
      "execution_count": null, 
      "cell_type": "code", 
      "source": [
        "# And again in vertical orientation\n",
        "\n",
        "vertical_options = dict(subset_size='count', intersection_plot_elements=3,\n",
        "                        orientation='vertical')\n",
        "upset = UpSet(boston_df, **vertical_options)\n",
        "\n",
        "# Same two strip-type catplots as before: median house value, then AGE\n",
        "for column, colour in [('median_value', 'blue'), ('AGE', 'black')]:\n",
        "    upset.add_catplot(value=column, kind='strip', color=colour)\n",
        "\n",
        "upset.plot()\n",
        "plt.title(\"UpSet with catplots, for orientation='vertical'\")\n",
        "plt.show()"
      ], 
      "outputs": [], 
      "metadata": {
        "collapsed": false
      }
    }
  ], 
  "metadata": {
    "kernelspec": {
      "display_name": "Python 2", 
      "name": "python2", 
      "language": "python"
    }, 
    "language_info": {
      "mimetype": "text/x-python", 
      "nbconvert_exporter": "python", 
      "name": "python", 
      "file_extension": ".py", 
      "version": "2.7.18", 
      "pygments_lexer": "ipython2", 
      "codemirror_mode": {
        "version": 2, 
        "name": "ipython"
      }
    }
  }
}