{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "e08e75d6",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-03-30T23:54:27.739977Z",
     "iopub.status.busy": "2024-03-30T23:54:27.739784Z",
     "iopub.status.idle": "2024-03-30T23:54:28.613506Z",
     "shell.execute_reply": "2024-03-30T23:54:28.612770Z"
    }
   },
   "outputs": [],
   "source": [
    "from functools import partial\n",
    "from rpy2.ipython import html\n",
    "html.html_rdataframe=partial(html.html_rdataframe, table_class=\"docutils\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1ffedfd5",
   "metadata": {},
   "source": [
    "# dplyr in Python\n",
    "\n",
    "We need 2 things for this:\n",
    "\n",
    "1- A data frame (using one of R's demo datasets)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "9c4e0aa6",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-03-30T23:54:28.616675Z",
     "iopub.status.busy": "2024-03-30T23:54:28.616179Z",
     "iopub.status.idle": "2024-03-30T23:54:28.647601Z",
     "shell.execute_reply": "2024-03-30T23:54:28.646965Z"
    }
   },
   "outputs": [],
   "source": [
    "from rpy2.robjects.packages import importr, data\n",
    "datasets = importr('datasets')\n",
    "mtcars_env = data(datasets).fetch('mtcars')\n",
    "mtcars = mtcars_env['mtcars']"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c929ed46",
   "metadata": {},
   "source": [
    "In addition to that, and because this tutorial is in a notebook,\n",
    "we initialize HTML rendering for R objects (pretty display of\n",
    "R data frames)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "e6adcd15",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-03-30T23:54:28.650195Z",
     "iopub.status.busy": "2024-03-30T23:54:28.649826Z",
     "iopub.status.idle": "2024-03-30T23:54:28.654275Z",
     "shell.execute_reply": "2024-03-30T23:54:28.653650Z"
    }
   },
   "outputs": [],
   "source": [
    "import rpy2.ipython.html\n",
    "rpy2.ipython.html.init_printing()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f1c69f13",
   "metadata": {},
   "source": [
    "2- dplyr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "f59fa61c",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-03-30T23:54:28.656803Z",
     "iopub.status.busy": "2024-03-30T23:54:28.656432Z",
     "iopub.status.idle": "2024-03-30T23:54:30.203123Z",
     "shell.execute_reply": "2024-03-30T23:54:30.202381Z"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/hostedtoolcache/Python/3.10.14/x64/lib/python3.10/site-packages/rpy2/robjects/lib/dplyr.py:27: UserWarning: This was designed against dplyr versions starting with 1.0 but you have 1.1.4\n",
      "  warnings.warn(\n"
     ]
    }
   ],
   "source": [
    "from rpy2.robjects.lib.dplyr import DataFrame\n",
    "from rpy2.robjects import rl"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "60f72946",
   "metadata": {},
   "source": [
    "With this we have the choice of chaining (D3-style)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "4bc6a027",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-03-30T23:54:30.205739Z",
     "iopub.status.busy": "2024-03-30T23:54:30.205338Z",
     "iopub.status.idle": "2024-03-30T23:54:30.223063Z",
     "shell.execute_reply": "2024-03-30T23:54:30.222427Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       "<emph>DataFrame</emph> with 2 rows and\n",
       "  2 columns:\n",
       "<table class=\"docutils\">\n",
       "  <thead>\n",
       "    <tr class=\"rpy2_names\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>gear</th>\n",
       "      <th>mean_ptw</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td class=\"rpy2_rowname\">0</td>\n",
       "        <td class=\"rpy2_names\">1</td>\n",
       "      <td>4.0</td>\n",
       "      <td>1237.1266499803169</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td class=\"rpy2_rowname\">1</td>\n",
       "        <td class=\"rpy2_names\">2</td>\n",
       "      <td>5.0</td>\n",
       "      <td>2574.0331639315027</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>"
      ],
      "text/plain": [
       "<rpy2.robjects.lib.dplyr.DataFrame object at 0x7f190c11fb40> [RTYPES.VECSXP]\n",
       "R classes: ('tbl_df', 'tbl', 'data.frame')\n",
       "[FloatSexpVector, FloatSexpVector]\n",
       "  gear: <class 'rpy2.rinterface.FloatSexpVector'>\n",
       "  <rpy2.rinterface.FloatSexpVector object at 0x7f190c223340> [RTYPES.REALSXP]\n",
       "  mean_ptw: <class 'rpy2.rinterface.FloatSexpVector'>\n",
       "  <rpy2.rinterface.FloatSexpVector object at 0x7f190c1900c0> [RTYPES.REALSXP]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataf = (\n",
    "    DataFrame(mtcars)\n",
    "    .filter(rl('gear>3'))\n",
    "    .mutate(powertoweight=rl('hp*36/wt'))\n",
    "    .group_by(rl('gear'))\n",
    "    .summarize(mean_ptw=rl('mean(powertoweight)'))\n",
    ")\n",
    "\n",
    "dataf"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c7fcb835",
   "metadata": {},
   "source": [
    "or with pipes (magrittr style)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "2ad48720",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-03-30T23:54:30.225494Z",
     "iopub.status.busy": "2024-03-30T23:54:30.225291Z",
     "iopub.status.idle": "2024-03-30T23:54:30.228999Z",
     "shell.execute_reply": "2024-03-30T23:54:30.228388Z"
    }
   },
   "outputs": [],
   "source": [
    "# currently no longer working\n",
    "from rpy2.robjects.lib.dplyr import (filter,\n",
    "                                     mutate,\n",
    "                                     group_by,\n",
    "                                     summarize)\n",
    "\n",
    "if False:\n",
    "    dataf = (DataFrame(mtcars) >>\n",
    "             filter(rl('gear>3')) >>\n",
    "             mutate(powertoweight=rl('hp*36/wt')) >>\n",
    "             group_by(rl('gear')) >>\n",
    "             summarize(mean_ptw=rl('mean(powertoweight)')))\n",
    "\n",
    "    dataf"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e5dc2a5c",
   "metadata": {},
   "source": [
    "The function `rl` creates unevaluated R language objects, which\n",
    "are then consummed by the `dplyr` function, just like it would be\n",
    "happening when using `dplyr` in R itself. This means that\n",
    "when writing `mean(powertoweight)` the R function `mean()` is used.\n",
    "\n",
    "Using a Python function is not too difficult though. We can just\n",
    "call Python back from R. To achieve this we simply\n",
    "use the decorator `rternalize`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "7f3a1627",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-03-30T23:54:30.231220Z",
     "iopub.status.busy": "2024-03-30T23:54:30.231022Z",
     "iopub.status.idle": "2024-03-30T23:54:30.273115Z",
     "shell.execute_reply": "2024-03-30T23:54:30.272453Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       "<emph>DataFrame</emph> with 2 rows and\n",
       "  3 columns:\n",
       "<table class=\"docutils\">\n",
       "  <thead>\n",
       "    <tr class=\"rpy2_names\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>gear</th>\n",
       "      <th>mean_ptw</th>\n",
       "      <th>mean_np_ptw</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td class=\"rpy2_rowname\">0</td>\n",
       "        <td class=\"rpy2_names\">1</td>\n",
       "      <td>4.0</td>\n",
       "      <td>1237.1266499803169</td>\n",
       "      <td>1237.1266499803169</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td class=\"rpy2_rowname\">1</td>\n",
       "        <td class=\"rpy2_names\">2</td>\n",
       "      <td>5.0</td>\n",
       "      <td>2574.0331639315027</td>\n",
       "      <td>2574.0331639315027</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>"
      ],
      "text/plain": [
       "<rpy2.robjects.lib.dplyr.DataFrame object at 0x7f190c113900> [RTYPES.VECSXP]\n",
       "R classes: ('tbl_df', 'tbl', 'data.frame')\n",
       "[FloatSexpVector, FloatSexpVector, FloatSexpVector]\n",
       "  gear: <class 'rpy2.rinterface.FloatSexpVector'>\n",
       "  <rpy2.rinterface.FloatSexpVector object at 0x7f190c193580> [RTYPES.REALSXP]\n",
       "  mean_ptw: <class 'rpy2.rinterface.FloatSexpVector'>\n",
       "  <rpy2.rinterface.FloatSexpVector object at 0x7f196b85cfc0> [RTYPES.REALSXP]\n",
       "  mean_np_ptw: <class 'rpy2.rinterface.FloatSexpVector'>\n",
       "  <rpy2.rinterface.FloatSexpVector object at 0x7f190c193580> [RTYPES.REALSXP]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Define a python function, and make\n",
    "# it a function R can use through `rternalize`\n",
    "from rpy2.rinterface import rternalize\n",
    "@rternalize(signature=False)\n",
    "def mean_np(x):\n",
    "    import statistics\n",
    "    return statistics.mean(x)\n",
    "\n",
    "# Bind that function to a symbol in R's\n",
    "# global environment\n",
    "from rpy2.robjects import globalenv\n",
    "globalenv['mean_np'] = mean_np\n",
    "\n",
    "# Write a dplyr chain of operations,\n",
    "# using our Python function `mean_np`\n",
    "dataf = (\n",
    "    DataFrame(mtcars)\n",
    "    .filter(rl('gear>3'))\n",
    "    .mutate(powertoweight=rl('hp*36/wt'))\n",
    "    .group_by(rl('gear'))\n",
    "    .summarize(mean_ptw=rl('mean(powertoweight)'),\n",
    "               mean_np_ptw=rl('mean_np(powertoweight)'))\n",
    ")\n",
    "\n",
    "dataf"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "18210706",
   "metadata": {},
   "source": [
    "It is also possible to carry this out without having to\n",
    "place the custom function in R's global environment, although\n",
    "this is not straightforward."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "f1dd1bb1",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-03-30T23:54:30.275626Z",
     "iopub.status.busy": "2024-03-30T23:54:30.275130Z",
     "iopub.status.idle": "2024-03-30T23:54:30.289228Z",
     "shell.execute_reply": "2024-03-30T23:54:30.288589Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       "<emph>DataFrame</emph> with 2 rows and\n",
       "  3 columns:\n",
       "<table class=\"docutils\">\n",
       "  <thead>\n",
       "    <tr class=\"rpy2_names\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>gear</th>\n",
       "      <th>mean_ptw</th>\n",
       "      <th>mean_np_ptw</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td class=\"rpy2_rowname\">0</td>\n",
       "        <td class=\"rpy2_names\">1</td>\n",
       "      <td>4.0</td>\n",
       "      <td>1237.1266499803169</td>\n",
       "      <td>1237.1266499803169</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td class=\"rpy2_rowname\">1</td>\n",
       "        <td class=\"rpy2_names\">2</td>\n",
       "      <td>5.0</td>\n",
       "      <td>2574.0331639315027</td>\n",
       "      <td>2574.0331639315027</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>"
      ],
      "text/plain": [
       "<rpy2.robjects.lib.dplyr.DataFrame object at 0x7f190c3669c0> [RTYPES.VECSXP]\n",
       "R classes: ('tbl_df', 'tbl', 'data.frame')\n",
       "[FloatSexpVector, FloatSexpVector, FloatSexpVector]\n",
       "  gear: <class 'rpy2.rinterface.FloatSexpVector'>\n",
       "  <rpy2.rinterface.FloatSexpVector object at 0x7f19116671c0> [RTYPES.REALSXP]\n",
       "  mean_ptw: <class 'rpy2.rinterface.FloatSexpVector'>\n",
       "  <rpy2.rinterface.FloatSexpVector object at 0x7f1911667900> [RTYPES.REALSXP]\n",
       "  mean_np_ptw: <class 'rpy2.rinterface.FloatSexpVector'>\n",
       "  <rpy2.rinterface.FloatSexpVector object at 0x7f19116671c0> [RTYPES.REALSXP]"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# First we delete our Python callback in globalenv to\n",
    "# ensure that we are picking up our callback in our\n",
    "# specific environment rather than this one.\n",
    "del(globalenv['mean_np'])\n",
    "\n",
    "from rpy2.robjects import Environment\n",
    "my_env = Environment()\n",
    "my_env['mean_np'] = mean_np\n",
    "\n",
    "\n",
    "# Create an rlang \"quosure\" object within\n",
    "# a given environment. We use the R package\n",
    "# rlang used by dplyr.\n",
    "from rpy2.robjects.lib.dplyr import rlang\n",
    "\n",
    "myquo = rlang.quo.rcall(\n",
    "    [(None, rl('mean_np(rlang::enexpr(powertoweight))'))],\n",
    "    environment=my_env\n",
    ")\n",
    "\n",
    "dataf = (\n",
    "    DataFrame(mtcars)\n",
    "    .filter(rl('gear>3'))\n",
    "    .mutate(powertoweight=rl('hp*36/wt'))\n",
    "    .group_by(rl('gear'))\n",
    "    .summarize(\n",
    "        mean_ptw=rl('mean(powertoweight)'),\n",
    "        mean_np_ptw=myquo)\n",
    ")\n",
    "\n",
    "dataf"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c548762e",
   "metadata": {},
   "source": [
    "**note**: rpy2's interface to dplyr is implementing a fix to the (non-?)issue 1323\n",
    "(https://github.com/hadley/dplyr/issues/1323)\n",
    "\n",
    "The seamless translation of transformations to SQL whenever the\n",
    "data are in a table can be used directly. Since we are lifting\n",
    "the original implementation of `dplyr`, it *just works*."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "4615cec7",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-03-30T23:54:30.291673Z",
     "iopub.status.busy": "2024-03-30T23:54:30.291286Z",
     "iopub.status.idle": "2024-03-30T23:54:30.845698Z",
     "shell.execute_reply": "2024-03-30T23:54:30.845065Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "# Source:   SQL [2 x 2]\n",
      "# Database: sqlite 3.45.0 [/tmp/tmpcojvog4z]\n",
      "   gear mean_ptw\n",
      "  <dbl>    <dbl>\n",
      "1     4    1237.\n",
      "2     5    2574.\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from rpy2.robjects.lib.dplyr import dplyr\n",
    "# in-memory SQLite database broken in dplyr's src_sqlite\n",
    "# db = dplyr.src_sqlite(\":memory:\")\n",
    "import tempfile\n",
    "with tempfile.NamedTemporaryFile() as db_fh:\n",
    "    db = dplyr.src_sqlite(db_fh.name)\n",
    "    # copy the table to that database\n",
    "    dataf_db = DataFrame(mtcars).copy_to(db, name=\"mtcars\")\n",
    "    res = (dataf_db\n",
    "           .filter(rl('gear>3'))\n",
    "           .mutate(powertoweight=rl('hp*36/wt'))\n",
    "           .group_by(rl('gear'))\n",
    "           .summarize(mean_ptw=rl('mean(powertoweight)')))\n",
    "    print(res)\n",
    "# "
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6dbcfae6",
   "metadata": {},
   "source": [
    "Since we are manipulating R objects, anything available to R is also available\n",
    "to us. If we want to see the SQL code generated that's:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "f343f7bd",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-03-30T23:54:30.848384Z",
     "iopub.status.busy": "2024-03-30T23:54:30.847910Z",
     "iopub.status.idle": "2024-03-30T23:54:30.910690Z",
     "shell.execute_reply": "2024-03-30T23:54:30.910015Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<SQL>\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "SELECT `gear`, AVG(`powertoweight`) AS `mean_ptw`\n",
      "FROM (\n",
      "  SELECT `mtcars`.*, (`hp` * 36.0) / `wt` AS `powertoweight`\n",
      "  FROM `mtcars`\n",
      "  WHERE (`gear` > 3.0)\n",
      ") AS `q01`\n",
      "GROUP BY `gear`\n"
     ]
    }
   ],
   "source": [
    "silent = dplyr.show_query(res)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2ee2104e",
   "metadata": {},
   "source": [
    "The conversion rules in rpy2 make the above easily applicable to pandas data frames,\n",
    "completing the \"lexical loan\" of the dplyr vocabulary from R."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "b4ec36f9",
   "metadata": {
    "attributes": {
     "classes": [
      "python "
     ],
     "id": ""
    },
    "execution": {
     "iopub.execute_input": "2024-03-30T23:54:30.913106Z",
     "iopub.status.busy": "2024-03-30T23:54:30.912903Z",
     "iopub.status.idle": "2024-03-30T23:54:30.919391Z",
     "shell.execute_reply": "2024-03-30T23:54:30.918836Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n"
     ]
    }
   ],
   "source": [
    "from rpy2.robjects import pandas2ri\n",
    "from rpy2.robjects import default_converter\n",
    "\n",
    "# Using a conversion context in which the pandas conversion is\n",
    "# added to the default conversion rules, the rpy2 object\n",
    "# `mtcars` (an R data frame) is converted to a pandas data frame.\n",
    "with (default_converter + pandas2ri.converter).context() as cv:\n",
    "    pd_mtcars = mtcars_env['mtcars']\n",
    "print(type(pd_mtcars))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b67356a8",
   "metadata": {},
   "source": [
    "Using a local conversion context lets us also go from the pandas data frame\n",
    "to our dplyr-augmented R data frame and use the dplyr transformations\n",
    "on it."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "1380e294",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-03-30T23:54:30.921577Z",
     "iopub.status.busy": "2024-03-30T23:54:30.921384Z",
     "iopub.status.idle": "2024-03-30T23:54:30.956557Z",
     "shell.execute_reply": "2024-03-30T23:54:30.955923Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       "<emph>DataFrame</emph> with 1 rows and\n",
       "  1 columns:\n",
       "<table class=\"docutils\">\n",
       "  <thead>\n",
       "    <tr class=\"rpy2_names\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>mean_ptw</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td class=\"rpy2_rowname\">0</td>\n",
       "        <td class=\"rpy2_names\">1</td>\n",
       "      <td>1632.0477884748632</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>"
      ],
      "text/plain": [
       "<rpy2.robjects.lib.dplyr.DataFrame object at 0x7f190c25ae80> [RTYPES.VECSXP]\n",
       "R classes: ('data.frame',)\n",
       "[FloatSexpVector]\n",
       "  mean_ptw: <class 'rpy2.rinterface.FloatSexpVector'>\n",
       "  <rpy2.rinterface.FloatSexpVector object at 0x7f19116629c0> [RTYPES.REALSXP]"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "with (default_converter + pandas2ri.converter).context() as cv:\n",
    "    dataf = (DataFrame(pd_mtcars)\n",
    "             .filter(rl('gear>=3'))\n",
    "             .mutate(powertoweight=rl('hp*36/wt'))\n",
    "             .group_by(rl('gear'))\n",
    "             .summarize(mean_ptw=rl('mean(powertoweight)')))\n",
    "\n",
    "dataf"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3942f811",
   "metadata": {},
   "source": [
    "**Reuse. Get things done. Don't reimplement.**"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}