{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "b2a47150",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-06-04T02:32:23.876611Z",
     "iopub.status.busy": "2021-06-04T02:32:23.876164Z",
     "iopub.status.idle": "2021-06-04T02:32:24.619239Z",
     "shell.execute_reply": "2021-06-04T02:32:24.618546Z"
    }
   },
   "outputs": [],
   "source": [
    "from functools import partial\n",
    "from rpy2.ipython import html\n",
    "html.html_rdataframe=partial(html.html_rdataframe, table_class=\"docutils\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "16d6d4df",
   "metadata": {},
   "source": [
    "# dplyr in Python\n",
    "\n",
    "We need 2 things for this:\n",
    "\n",
    "1- A data frame (using one of R's demo datasets)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "ac3b7497",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-06-04T02:32:24.623430Z",
     "iopub.status.busy": "2021-06-04T02:32:24.623030Z",
     "iopub.status.idle": "2021-06-04T02:32:24.659234Z",
     "shell.execute_reply": "2021-06-04T02:32:24.658333Z"
    }
   },
   "outputs": [],
   "source": [
    "from rpy2.robjects.packages import importr, data\n",
    "datasets = importr('datasets')\n",
    "mtcars_env = data(datasets).fetch('mtcars')\n",
    "mtcars = mtcars_env['mtcars']"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f918a2d4",
   "metadata": {},
   "source": [
    "In addition to that, and because this tutorial is in a notebook,\n",
    "we initialize HTML rendering for R objects (pretty display of\n",
    "R data frames)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "c1dd64e9",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-06-04T02:32:24.666289Z",
     "iopub.status.busy": "2021-06-04T02:32:24.665235Z",
     "iopub.status.idle": "2021-06-04T02:32:24.666872Z",
     "shell.execute_reply": "2021-06-04T02:32:24.667247Z"
    }
   },
   "outputs": [],
   "source": [
    "import rpy2.ipython.html\n",
    "rpy2.ipython.html.init_printing()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "871c03cd",
   "metadata": {},
   "source": [
    "2- dplyr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "83c97d95",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-06-04T02:32:24.670782Z",
     "iopub.status.busy": "2021-06-04T02:32:24.670104Z",
     "iopub.status.idle": "2021-06-04T02:32:26.480006Z",
     "shell.execute_reply": "2021-06-04T02:32:26.479089Z"
    }
   },
   "outputs": [],
   "source": [
    "from rpy2.robjects.lib.dplyr import DataFrame\n",
    "from rpy2.robjects import rl"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ee7809ae",
   "metadata": {},
   "source": [
    "With this we have the choice of chaining (D3-style)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "f73a0d0f",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-06-04T02:32:26.484773Z",
     "iopub.status.busy": "2021-06-04T02:32:26.484204Z",
     "iopub.status.idle": "2021-06-04T02:32:26.512402Z",
     "shell.execute_reply": "2021-06-04T02:32:26.511857Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       "<emph>DataFrame</emph> with 2 rows and\n",
       "  2 columns:\n",
       "<table class=\"docutils\">\n",
       "  <thead>\n",
       "    <tr class=\"rpy2_names\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>gear</th>\n",
       "      <th>mean_ptw</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td class=\"rpy2_rowname\">0</td>\n",
       "        <td class=\"rpy2_names\">1</td>\n",
       "      <td>4.0</td>\n",
       "      <td>1237.1266499803169</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td class=\"rpy2_rowname\">1</td>\n",
       "        <td class=\"rpy2_names\">2</td>\n",
       "      <td>5.0</td>\n",
       "      <td>2574.0331639315027</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>"
      ],
      "text/plain": [
       "<rpy2.robjects.lib.dplyr.DataFrame object at 0x7fd307957c80> [RTYPES.VECSXP]\n",
       "R classes: ('tbl_df', 'tbl', 'data.frame')\n",
       "[FloatSexpVector, FloatSexpVector]\n",
       "  gear: <class 'rpy2.rinterface.FloatSexpVector'>\n",
       "  <rpy2.rinterface.FloatSexpVector object at 0x7fd30bf99e80> [RTYPES.REALSXP]\n",
       "  mean_ptw: <class 'rpy2.rinterface.FloatSexpVector'>\n",
       "  <rpy2.rinterface.FloatSexpVector object at 0x7fd30bf9a500> [RTYPES.REALSXP]"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataf = (\n",
    "    DataFrame(mtcars)\n",
    "    .filter(rl('gear>3'))\n",
    "    .mutate(powertoweight=rl('hp*36/wt'))\n",
    "    .group_by(rl('gear'))\n",
    "    .summarize(mean_ptw=rl('mean(powertoweight)'))\n",
    ")\n",
    "\n",
    "dataf"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "93867a72",
   "metadata": {},
   "source": [
    "or with pipes (magrittr style)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "f80e005b",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-06-04T02:32:26.517519Z",
     "iopub.status.busy": "2021-06-04T02:32:26.516872Z",
     "iopub.status.idle": "2021-06-04T02:32:26.519758Z",
     "shell.execute_reply": "2021-06-04T02:32:26.519301Z"
    }
   },
   "outputs": [],
   "source": [
    "# currently no longer working\n",
    "from rpy2.robjects.lib.dplyr import (filter,\n",
    "                                     mutate,\n",
    "                                     group_by,\n",
    "                                     summarize)\n",
    "\n",
    "if False:\n",
    "    dataf = (DataFrame(mtcars) >>\n",
    "             filter(rl('gear>3')) >>\n",
    "             mutate(powertoweight=rl('hp*36/wt')) >>\n",
    "             group_by(rl('gear')) >>\n",
    "             summarize(mean_ptw=rl('mean(powertoweight)')))\n",
    "\n",
    "    dataf"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "476a688f",
   "metadata": {},
   "source": [
    "The function `rl` creates unevaluated R language objects, which\n",
    "are then consummed by the `dplyr` function, just like it would be\n",
    "happening when using `dplyr` in R itself. This means that\n",
    "when writing `mean(powertoweight)` the R function `mean()` is used.\n",
    "\n",
    "Using a Python function is not too difficult though. We can just\n",
    "call Python back from R. To achieve this we simply\n",
    "use the decorator `rternalize`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "33ffa6f0",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-06-04T02:32:26.524669Z",
     "iopub.status.busy": "2021-06-04T02:32:26.524091Z",
     "iopub.status.idle": "2021-06-04T02:32:26.569879Z",
     "shell.execute_reply": "2021-06-04T02:32:26.570276Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       "<emph>DataFrame</emph> with 2 rows and\n",
       "  3 columns:\n",
       "<table class=\"docutils\">\n",
       "  <thead>\n",
       "    <tr class=\"rpy2_names\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>gear</th>\n",
       "      <th>mean_ptw</th>\n",
       "      <th>mean_np_ptw</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td class=\"rpy2_rowname\">0</td>\n",
       "        <td class=\"rpy2_names\">1</td>\n",
       "      <td>4.0</td>\n",
       "      <td>1237.1266499803169</td>\n",
       "      <td>1237.1266499803169</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td class=\"rpy2_rowname\">1</td>\n",
       "        <td class=\"rpy2_names\">2</td>\n",
       "      <td>5.0</td>\n",
       "      <td>2574.0331639315027</td>\n",
       "      <td>2574.0331639315027</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>"
      ],
      "text/plain": [
       "<rpy2.robjects.lib.dplyr.DataFrame object at 0x7fd307847900> [RTYPES.VECSXP]\n",
       "R classes: ('tbl_df', 'tbl', 'data.frame')\n",
       "[FloatSexpVector, FloatSexpVector, FloatSexpVector]\n",
       "  gear: <class 'rpy2.rinterface.FloatSexpVector'>\n",
       "  <rpy2.rinterface.FloatSexpVector object at 0x7fd30bf9a280> [RTYPES.REALSXP]\n",
       "  mean_ptw: <class 'rpy2.rinterface.FloatSexpVector'>\n",
       "  <rpy2.rinterface.FloatSexpVector object at 0x7fd307847580> [RTYPES.REALSXP]\n",
       "  mean_np_ptw: <class 'rpy2.rinterface.FloatSexpVector'>\n",
       "  <rpy2.rinterface.FloatSexpVector object at 0x7fd3078f18c0> [RTYPES.REALSXP]"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Define a python function, and make\n",
    "# it a function R can use through `rternalize`\n",
    "from rpy2.rinterface import rternalize\n",
    "@rternalize\n",
    "def mean_np(x):\n",
    "    import statistics\n",
    "    return statistics.mean(x)\n",
    "\n",
    "# Bind that function to a symbol in R's\n",
    "# global environment\n",
    "from rpy2.robjects import globalenv\n",
    "globalenv['mean_np'] = mean_np\n",
    "\n",
    "# Write a dplyr chain of operations,\n",
    "# using our Python function `mean_np`\n",
    "dataf = (\n",
    "    DataFrame(mtcars)\n",
    "    .filter(rl('gear>3'))\n",
    "    .mutate(powertoweight=rl('hp*36/wt'))\n",
    "    .group_by(rl('gear'))\n",
    "    .summarize(mean_ptw=rl('mean(powertoweight)'),\n",
    "               mean_np_ptw=rl('mean_np(powertoweight)'))\n",
    ")\n",
    "\n",
    "dataf"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "60ee6cce",
   "metadata": {},
   "source": [
    "It is also possible to carry this out without having to\n",
    "place the custom function in R's global environment, although\n",
    "this is not straightforward."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "81da10ff",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-06-04T02:32:26.574632Z",
     "iopub.status.busy": "2021-06-04T02:32:26.574152Z",
     "iopub.status.idle": "2021-06-04T02:32:26.587960Z",
     "shell.execute_reply": "2021-06-04T02:32:26.588297Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       "<emph>DataFrame</emph> with 2 rows and\n",
       "  3 columns:\n",
       "<table class=\"docutils\">\n",
       "  <thead>\n",
       "    <tr class=\"rpy2_names\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>gear</th>\n",
       "      <th>mean_ptw</th>\n",
       "      <th>mean_np_ptw</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td class=\"rpy2_rowname\">0</td>\n",
       "        <td class=\"rpy2_names\">1</td>\n",
       "      <td>4.0</td>\n",
       "      <td>1237.1266499803169</td>\n",
       "      <td>1237.1266499803169</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td class=\"rpy2_rowname\">1</td>\n",
       "        <td class=\"rpy2_names\">2</td>\n",
       "      <td>5.0</td>\n",
       "      <td>2574.0331639315027</td>\n",
       "      <td>2574.0331639315027</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>"
      ],
      "text/plain": [
       "<rpy2.robjects.lib.dplyr.DataFrame object at 0x7fd30bf9a4c0> [RTYPES.VECSXP]\n",
       "R classes: ('tbl_df', 'tbl', 'data.frame')\n",
       "[FloatSexpVector, FloatSexpVector, FloatSexpVector]\n",
       "  gear: <class 'rpy2.rinterface.FloatSexpVector'>\n",
       "  <rpy2.rinterface.FloatSexpVector object at 0x7fd3078dec80> [RTYPES.REALSXP]\n",
       "  mean_ptw: <class 'rpy2.rinterface.FloatSexpVector'>\n",
       "  <rpy2.rinterface.FloatSexpVector object at 0x7fd3079e9f80> [RTYPES.REALSXP]\n",
       "  mean_np_ptw: <class 'rpy2.rinterface.FloatSexpVector'>\n",
       "  <rpy2.rinterface.FloatSexpVector object at 0x7fd3079e9d80> [RTYPES.REALSXP]"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# First we delete our Python callback in globalenv to\n",
    "# ensure that we are picking up our callback in our\n",
    "# specific environment rather than this one.\n",
    "del(globalenv['mean_np'])\n",
    "\n",
    "from rpy2.robjects import Environment\n",
    "my_env = Environment()\n",
    "my_env['mean_np'] = mean_np\n",
    "\n",
    "\n",
    "# Create an rlang \"quosure\" object within\n",
    "# a given environment. We use the R package\n",
    "# rlang used by dplyr.\n",
    "from rpy2.robjects.lib.dplyr import rlang\n",
    "\n",
    "myquo = rlang.quo.rcall(\n",
    "    [(None, rl('mean_np(rlang::enexpr(powertoweight))'))],\n",
    "    environment=my_env\n",
    ")\n",
    "\n",
    "dataf = (\n",
    "    DataFrame(mtcars)\n",
    "    .filter(rl('gear>3'))\n",
    "    .mutate(powertoweight=rl('hp*36/wt'))\n",
    "    .group_by(rl('gear'))\n",
    "    .summarize(\n",
    "        mean_ptw=rl('mean(powertoweight)'),\n",
    "        mean_np_ptw=myquo)\n",
    ")\n",
    "\n",
    "dataf"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3331a4df",
   "metadata": {},
   "source": [
    "**note**: rpy2's interface to dplyr is implementing a fix to the (non-?)issue 1323\n",
    "(https://github.com/hadley/dplyr/issues/1323)\n",
    "\n",
    "The seamless translation of transformations to SQL whenever the\n",
    "data are in a table can be used directly. Since we are lifting\n",
    "the original implementation of `dplyr`, it *just works*."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "abbe6f19",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-06-04T02:32:26.593309Z",
     "iopub.status.busy": "2021-06-04T02:32:26.592741Z",
     "iopub.status.idle": "2021-06-04T02:32:27.069644Z",
     "shell.execute_reply": "2021-06-04T02:32:27.069180Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "# Source:   lazy query [?? x 2]\n",
      "# Database: sqlite 3.35.5 [/tmp/tmpts8edonp]\n",
      "   gear mean_ptw\n",
      "  <dbl>    <dbl>\n",
      "1     4    1237.\n",
      "2     5    2574.\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from rpy2.robjects.lib.dplyr import dplyr\n",
    "# in-memory SQLite database broken in dplyr's src_sqlite\n",
    "# db = dplyr.src_sqlite(\":memory:\")\n",
    "import tempfile\n",
    "with tempfile.NamedTemporaryFile() as db_fh:\n",
    "    db = dplyr.src_sqlite(db_fh.name)\n",
    "    # copy the table to that database\n",
    "    dataf_db = DataFrame(mtcars).copy_to(db, name=\"mtcars\")\n",
    "    res = (dataf_db\n",
    "           .filter(rl('gear>3'))\n",
    "           .mutate(powertoweight=rl('hp*36/wt'))\n",
    "           .group_by(rl('gear'))\n",
    "           .summarize(mean_ptw=rl('mean(powertoweight)')))\n",
    "    print(res)\n",
    "# "
   ]
  },
  {
   "cell_type": "markdown",
   "id": "88143d26",
   "metadata": {},
   "source": [
    "Since we are manipulating R objects, anything available to R is also available\n",
    "to us. If we want to see the SQL code generated that's:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "771c05b7",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-06-04T02:32:27.073748Z",
     "iopub.status.busy": "2021-06-04T02:32:27.073189Z",
     "iopub.status.idle": "2021-06-04T02:32:27.127949Z",
     "shell.execute_reply": "2021-06-04T02:32:27.127616Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<SQL>\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "SELECT `gear`, AVG(`powertoweight`) AS `mean_ptw`\n",
      "FROM (SELECT `mpg`, `cyl`, `disp`, `hp`, `drat`, `wt`, `qsec`, `vs`, `am`, `gear`, `carb`, `hp` * 36.0 / `wt` AS `powertoweight`\n",
      "FROM `mtcars`\n",
      "WHERE (`gear` > 3.0))\n",
      "GROUP BY `gear`\n"
     ]
    }
   ],
   "source": [
    "silent = dplyr.show_query(res)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4799cc94",
   "metadata": {},
   "source": [
    "The conversion rules in rpy2 make the above easily applicable to pandas data frames,\n",
    "completing the \"lexical loan\" of the dplyr vocabulary from R."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "6b885ec4",
   "metadata": {
    "attributes": {
     "classes": [
      "python "
     ],
     "id": ""
    },
    "execution": {
     "iopub.execute_input": "2021-06-04T02:32:27.131720Z",
     "iopub.status.busy": "2021-06-04T02:32:27.131270Z",
     "iopub.status.idle": "2021-06-04T02:32:27.137004Z",
     "shell.execute_reply": "2021-06-04T02:32:27.136519Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n"
     ]
    }
   ],
   "source": [
    "from rpy2.robjects import pandas2ri\n",
    "from rpy2.robjects import default_converter\n",
    "from rpy2.robjects.conversion import localconverter\n",
    "\n",
    "# Using a conversion context in which the pandas conversion is\n",
    "# added to the default conversion rules, the rpy2 object\n",
    "# `mtcars` (an R data frame) is converted to a pandas data frame.\n",
    "with localconverter(default_converter + pandas2ri.converter) as cv:\n",
    "    pd_mtcars = mtcars_env['mtcars']\n",
    "print(type(pd_mtcars))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1f4271ac",
   "metadata": {},
   "source": [
    "Using a local converter lets us also go from the pandas data frame\n",
    "to our dplyr-augmented R data frame and use the dplyr transformations\n",
    "on it."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "f2e143a9",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-06-04T02:32:27.141475Z",
     "iopub.status.busy": "2021-06-04T02:32:27.141058Z",
     "iopub.status.idle": "2021-06-04T02:32:27.183087Z",
     "shell.execute_reply": "2021-06-04T02:32:27.183630Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       "<emph>DataFrame</emph> with 1 rows and\n",
       "  1 columns:\n",
       "<table class=\"docutils\">\n",
       "  <thead>\n",
       "    <tr class=\"rpy2_names\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>mean_ptw</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td class=\"rpy2_rowname\">0</td>\n",
       "        <td class=\"rpy2_names\">1</td>\n",
       "      <td>1632.0477884748632</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>"
      ],
      "text/plain": [
       "<rpy2.robjects.lib.dplyr.DataFrame object at 0x7fd3079e9a80> [RTYPES.VECSXP]\n",
       "R classes: ('data.frame',)\n",
       "[FloatSexpVector]\n",
       "  mean_ptw: <class 'rpy2.rinterface.FloatSexpVector'>\n",
       "  <rpy2.rinterface.FloatSexpVector object at 0x7fd3078eac00> [RTYPES.REALSXP]"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "with localconverter(default_converter + pandas2ri.converter) as cv:\n",
    "    dataf = (DataFrame(pd_mtcars)\n",
    "             .filter(rl('gear>=3'))\n",
    "             .mutate(powertoweight=rl('hp*36/wt'))\n",
    "             .group_by(rl('gear'))\n",
    "             .summarize(mean_ptw=rl('mean(powertoweight)')))\n",
    "\n",
    "dataf"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cb978d81",
   "metadata": {},
   "source": [
    "**Reuse. Get things done. Don't reimplement.**"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}