In [1]:
from functools import partial
from rpy2.ipython import html
html.html_rdataframe=partial(html.html_rdataframe, table_class="docutils")
/opt/hostedtoolcache/Python/3.10.17/x64/lib/python3.10/site-packages/rpy2/rinterface/__init__.py:1185: UserWarning: Environment variable "LD_LIBRARY_PATH" redefined by R and overriding existing variable. Current: "/opt/R/4.5.0/lib/R/lib:/usr/local/lib:/usr/lib/x86_64-linux-gnu:/usr/lib/jvm/temurin-17-jdk-amd64/lib/server:/opt/hostedtoolcache/Python/3.10.17/x64/lib:/opt/hostedtoolcache/Python/3.10.17/x64/lib", R: "/opt/R/4.5.0/lib/R/lib:/usr/local/lib:/usr/lib/x86_64-linux-gnu:/usr/lib/jvm/temurin-17-jdk-amd64/lib/server:/opt/R/4.5.0/lib/R/lib:/usr/local/lib:/usr/lib/x86_64-linux-gnu:/usr/lib/jvm/temurin-17-jdk-amd64/lib/server:/opt/hostedtoolcache/Python/3.10.17/x64/lib:/opt/hostedtoolcache/Python/3.10.17/x64/lib"
  warnings.warn(
/opt/hostedtoolcache/Python/3.10.17/x64/lib/python3.10/site-packages/rpy2/rinterface/__init__.py:1185: UserWarning: Environment variable "PWD" redefined by R and overriding existing variable. Current: "/home/runner/work/rpy2/rpy2/doc", R: "/home/runner/work/rpy2/rpy2/doc/notebooks"
  warnings.warn(
/opt/hostedtoolcache/Python/3.10.17/x64/lib/python3.10/site-packages/rpy2/rinterface/__init__.py:1185: UserWarning: Environment variable "LD_LIBRARY_PATH" redefined by R and overriding existing variable. Current: "/opt/R/4.5.0/lib/R/lib:/usr/local/lib:/usr/lib/x86_64-linux-gnu:/usr/lib/jvm/temurin-17-jdk-amd64/lib/server:/opt/R/4.5.0/lib/R/lib:/usr/local/lib:/usr/lib/x86_64-linux-gnu:/usr/lib/jvm/temurin-17-jdk-amd64/lib/server:/opt/hostedtoolcache/Python/3.10.17/x64/lib:/opt/hostedtoolcache/Python/3.10.17/x64/lib", R: "/opt/R/4.5.0/lib/R/lib:/usr/local/lib:/usr/lib/x86_64-linux-gnu:/usr/lib/jvm/temurin-17-jdk-amd64/lib/server:/opt/R/4.5.0/lib/R/lib:/usr/local/lib:/usr/lib/x86_64-linux-gnu:/usr/lib/jvm/temurin-17-jdk-amd64/lib/server:/opt/R/4.5.0/lib/R/lib:/usr/local/lib:/usr/lib/x86_64-linux-gnu:/usr/lib/jvm/temurin-17-jdk-amd64/lib/server:/opt/hostedtoolcache/Python/3.10.17/x64/lib:/opt/hostedtoolcache/Python/3.10.17/x64/lib"
  warnings.warn(
/opt/hostedtoolcache/Python/3.10.17/x64/lib/python3.10/site-packages/rpy2/rinterface/__init__.py:1185: UserWarning: Environment variable "R_SESSION_TMPDIR" redefined by R and overriding existing variable. Current: "/tmp/Rtmp2r0D3H", R: "/tmp/RtmpBedMOF"
  warnings.warn(

dplyr in Python¶

We need 2 things for this:

1- A data frame (using one of R's demo datasets).

In [2]:
from rpy2.robjects.packages import importr, data
datasets = importr('datasets')
mtcars_env = data(datasets).fetch('mtcars')
mtcars = mtcars_env['mtcars']

In addition to that, and because this tutorial is in a notebook, we initialize HTML rendering for R objects (pretty display of R data frames).

In [3]:
import rpy2.ipython.html
rpy2.ipython.html.init_printing()

2- dplyr

In [4]:
from rpy2.robjects.lib.dplyr import DataFrame
from rpy2.robjects import rl

With this we have the choice of chaining (D3-style)

In [5]:
dataf = (
    DataFrame(mtcars)
    .filter(rl('gear>3'))
    .mutate(powertoweight=rl('hp*36/wt'))
    .group_by(rl('gear'))
    .summarize(mean_ptw=rl('mean(powertoweight)'))
)

dataf
Out[5]:
DataFrame with 2 rows and 2 columns:
gear mean_ptw
0 1 4.0 1237.1266499803169
1 2 5.0 2574.0331639315027

or with pipes (magrittr style).

In [6]:
# currently no longer working
from rpy2.robjects.lib.dplyr import (filter,
                                     mutate,
                                     group_by,
                                     summarize)

if False:
    dataf = (DataFrame(mtcars) >>
             filter(rl('gear>3')) >>
             mutate(powertoweight=rl('hp*36/wt')) >>
             group_by(rl('gear')) >>
             summarize(mean_ptw=rl('mean(powertoweight)')))

    dataf

The function rl creates unevaluated R language objects, which are then consummed by the dplyr function, just like it would be happening when using dplyr in R itself. This means that when writing mean(powertoweight) the R function mean() is used.

Using a Python function is not too difficult though. We can just call Python back from R. To achieve this we simply use the decorator rternalize.

In [7]:
# Define a python function, and make
# it a function R can use through `rternalize`
from rpy2.rinterface import rternalize
@rternalize(signature=False)
def mean_np(x):
    import statistics
    return statistics.mean(x)

# Bind that function to a symbol in R's
# global environment
from rpy2.robjects import globalenv
globalenv['mean_np'] = mean_np

# Write a dplyr chain of operations,
# using our Python function `mean_np`
dataf = (
    DataFrame(mtcars)
    .filter(rl('gear>3'))
    .mutate(powertoweight=rl('hp*36/wt'))
    .group_by(rl('gear'))
    .summarize(mean_ptw=rl('mean(powertoweight)'),
               mean_np_ptw=rl('mean_np(powertoweight)'))
)

dataf
Out[7]:
DataFrame with 2 rows and 3 columns:
gear mean_ptw mean_np_ptw
0 1 4.0 1237.1266499803169 1237.1266499803169
1 2 5.0 2574.0331639315027 2574.0331639315027

It is also possible to carry this out without having to place the custom function in R's global environment, although this is not straightforward.

In [8]:
# First we delete our Python callback in globalenv to
# ensure that we are picking up our callback in our
# specific environment rather than this one.
del(globalenv['mean_np'])

from rpy2.robjects import Environment
my_env = Environment()
my_env['mean_np'] = mean_np


# Create an rlang "quosure" object within
# a given environment. We use the R package
# rlang used by dplyr.
from rpy2.robjects.lib.dplyr import rlang

myquo = rlang.quo.rcall(
    [(None, rl('mean_np(rlang::enexpr(powertoweight))'))],
    environment=my_env
)

dataf = (
    DataFrame(mtcars)
    .filter(rl('gear>3'))
    .mutate(powertoweight=rl('hp*36/wt'))
    .group_by(rl('gear'))
    .summarize(
        mean_ptw=rl('mean(powertoweight)'),
        mean_np_ptw=myquo)
)

dataf
Out[8]:
DataFrame with 2 rows and 3 columns:
gear mean_ptw mean_np_ptw
0 1 4.0 1237.1266499803169 1237.1266499803169
1 2 5.0 2574.0331639315027 2574.0331639315027

note: rpy2's interface to dplyr is implementing a fix to the (non-?)issue 1323 (https://github.com/hadley/dplyr/issues/1323)

The seamless translation of transformations to SQL whenever the data are in a table can be used directly. Since we are lifting the original implementation of dplyr, it just works.

In [9]:
from rpy2.robjects.lib.dplyr import dplyr
# in-memory SQLite database broken in dplyr's src_sqlite
# db = dplyr.src_sqlite(":memory:")
import tempfile
with tempfile.NamedTemporaryFile() as db_fh:
    db = dplyr.src_sqlite(db_fh.name)
    # copy the table to that database
    dataf_db = DataFrame(mtcars).copy_to(db, name="mtcars")
    res = (dataf_db
           .filter(rl('gear>3'))
           .mutate(powertoweight=rl('hp*36/wt'))
           .group_by(rl('gear'))
           .summarize(mean_ptw=rl('mean(powertoweight)')))
    print(res)
# 
# Source:   SQL [?? x 2]
# Database: sqlite 3.49.1 [/tmp/tmp8qyxtji0]
   gear mean_ptw
  <dbl>    <dbl>
1     4    1237.
2     5    2574.

Since we are manipulating R objects, anything available to R is also available to us. If we want to see the SQL code generated that's:

In [10]:
silent = dplyr.show_query(res)
<SQL>
SELECT `gear`, AVG(`powertoweight`) AS `mean_ptw`
FROM (
  SELECT `mtcars`.*, (`hp` * 36.0) / `wt` AS `powertoweight`
  FROM `mtcars`
  WHERE (`gear` > 3.0)
) AS `q01`
GROUP BY `gear`

The conversion rules in rpy2 make the above easily applicable to pandas data frames, completing the "lexical loan" of the dplyr vocabulary from R.

In [11]:
from rpy2.robjects import pandas2ri
from rpy2.robjects import default_converter

# Using a conversion context in which the pandas conversion is
# added to the default conversion rules, the rpy2 object
# `mtcars` (an R data frame) is converted to a pandas data frame.
with (default_converter + pandas2ri.converter).context() as cv:
    pd_mtcars = mtcars_env['mtcars']
print(type(pd_mtcars))
<class 'pandas.core.frame.DataFrame'>

Using a local conversion context lets us also go from the pandas data frame to our dplyr-augmented R data frame and use the dplyr transformations on it.

In [12]:
with (default_converter + pandas2ri.converter).context() as cv:
    dataf = (DataFrame(pd_mtcars)
             .filter(rl('gear>=3'))
             .mutate(powertoweight=rl('hp*36/wt'))
             .group_by(rl('gear'))
             .summarize(mean_ptw=rl('mean(powertoweight)')))

dataf
Out[12]:
DataFrame with 1 rows and 1 columns:
mean_ptw
0 1 1632.0477884748632

Reuse. Get things done. Don't reimplement.