{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "cf4ca99b",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-03-30T23:54:49.170676Z",
     "iopub.status.busy": "2024-03-30T23:54:49.170479Z",
     "iopub.status.idle": "2024-03-30T23:54:50.004998Z",
     "shell.execute_reply": "2024-03-30T23:54:50.004402Z"
    }
   },
   "outputs": [],
   "source": [
    "from functools import partial\n",
    "from rpy2.ipython import html\n",
    "html.html_rdataframe=partial(html.html_rdataframe, table_class=\"docutils\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cada70d9",
   "metadata": {},
   "source": [
    "# `R` and `pandas` data frames\n",
    "\n",
    "R `data.frame` and :class:`pandas.DataFrame` objects share a lot of\n",
    "conceptual similarities, and :mod:`pandas` chose to use the class name\n",
    "`DataFrame` after R objects.\n",
    "\n",
    "In a nutshell, both are sequences of vectors (or arrays) of consistent\n",
    "length or size for the first dimension (the \"number of rows\").\n",
    "if coming from the database world, an other way to look at them is\n",
    "column-oriented data tables, or data table API.\n",
    "\n",
    "rpy2 is providing an interface between Python and R, and a convenience\n",
    "conversion layer between :class:`rpy2.robjects.vectors.DataFrame` and\n",
    ":class:`pandas.DataFrame` objects, implemented in\n",
    ":mod:`rpy2.robjects.pandas2ri`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "4802c53e",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-03-30T23:54:50.008088Z",
     "iopub.status.busy": "2024-03-30T23:54:50.007632Z",
     "iopub.status.idle": "2024-03-30T23:54:50.010846Z",
     "shell.execute_reply": "2024-03-30T23:54:50.010255Z"
    }
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import rpy2.robjects as ro\n",
    "from rpy2.robjects.packages import importr \n",
    "from rpy2.robjects import pandas2ri"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6bcb182a",
   "metadata": {},
   "source": [
    "## From `pandas` to `R`\n",
    "\n",
    "Pandas data frame:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "b22bd528",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-03-30T23:54:50.013204Z",
     "iopub.status.busy": "2024-03-30T23:54:50.013003Z",
     "iopub.status.idle": "2024-03-30T23:54:50.022804Z",
     "shell.execute_reply": "2024-03-30T23:54:50.022165Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>int_values</th>\n",
       "      <th>str_values</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>abc</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>def</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>ghi</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   int_values str_values\n",
       "0           1        abc\n",
       "1           2        def\n",
       "2           3        ghi"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd_df = pd.DataFrame({'int_values': [1,2,3],\n",
    "                      'str_values': ['abc', 'def', 'ghi']})\n",
    "\n",
    "pd_df"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "99546d14",
   "metadata": {},
   "source": [
    "R data frame converted from a `pandas` data frame:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "5de0533f",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-03-30T23:54:50.025258Z",
     "iopub.status.busy": "2024-03-30T23:54:50.024869Z",
     "iopub.status.idle": "2024-03-30T23:54:50.033275Z",
     "shell.execute_reply": "2024-03-30T23:54:50.032700Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       "        <span>R/rpy2 DataFrame (3 x 2)</span>\n",
       "        <table>\n",
       "          <thead>\n",
       "            <tr>\n",
       "              \n",
       "              <th>int_values</th>\n",
       "              \n",
       "              <th>str_values</th>\n",
       "              \n",
       "            </tr>\n",
       "          </thead>\n",
       "          <tbody>\n",
       "          \n",
       "          <tr>\n",
       "            \n",
       "            <td>\n",
       "              ...\n",
       "            </td>\n",
       "            \n",
       "            <td>\n",
       "              ...\n",
       "            </td>\n",
       "            \n",
       "          </tr>\n",
       "          \n",
       "          </tbody>\n",
       "        </table>\n",
       "    "
      ],
      "text/plain": [
       "<rpy2.robjects.vectors.DataFrame object at 0x7f0472afcc40> [RTYPES.VECSXP]\n",
       "R classes: ('data.frame',)\n",
       "[IntSexpVector, StrSexpVector]\n",
       "  int_values: <class 'rpy2.rinterface.IntSexpVector'>\n",
       "  <rpy2.rinterface.IntSexpVector object at 0x7f0472b04600> [RTYPES.INTSXP]\n",
       "  str_values: <class 'rpy2.rinterface_lib.sexp.StrSexpVector'>\n",
       "  <rpy2.rinterface_lib.sexp.StrSexpVector object at 0x7f04c868d3c0> [RTYPES.STRSXP]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "with (ro.default_converter + pandas2ri.converter).context():\n",
    "  r_from_pd_df = ro.conversion.get_conversion().py2rpy(pd_df)\n",
    "\n",
    "r_from_pd_df"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a4972228",
   "metadata": {},
   "source": [
    "The conversion is automatically happening when calling R functions.\n",
    "For example, when calling the R function `base::summary`:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "b7cc1f8b",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-03-30T23:54:50.035825Z",
     "iopub.status.busy": "2024-03-30T23:54:50.035358Z",
     "iopub.status.idle": "2024-03-30T23:54:50.562034Z",
     "shell.execute_reply": "2024-03-30T23:54:50.561343Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "   int_values   str_values       \n",
      " Min.   :1.0   Length:3          \n",
      " 1st Qu.:1.5   Class :character  \n",
      " Median :2.0   Mode  :character  \n",
      " Mean   :2.0                     \n",
      " 3rd Qu.:2.5                     \n",
      " Max.   :3.0                     \n",
      "\n"
     ]
    }
   ],
   "source": [
    "base = importr('base')\n",
    "\n",
    "with (ro.default_converter + pandas2ri.converter).context():\n",
    "  df_summary = base.summary(pd_df)\n",
    "print(df_summary)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9fee2eae",
   "metadata": {},
   "source": [
    "Note that a `ContextManager` is used to limit the scope of the\n",
    "conversion. Without it, rpy2 will not know how to convert a pandas\n",
    "data frame:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "3a482704",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-03-30T23:54:50.564698Z",
     "iopub.status.busy": "2024-03-30T23:54:50.564191Z",
     "iopub.status.idle": "2024-03-30T23:54:50.568647Z",
     "shell.execute_reply": "2024-03-30T23:54:50.567967Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "NotImplementedError:\n",
      "Conversion 'py2rpy' not defined for objects of type '<class 'pandas.core.frame.DataFrame'>'\n"
     ]
    }
   ],
   "source": [
    "try:\n",
    "  df_summary = base.summary(pd_df)\n",
    "except NotImplementedError as nie:\n",
    "  print('NotImplementedError:')\n",
    "  print(nie)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d39c09c4",
   "metadata": {},
   "source": [
    "## From `R` to `pandas`\n",
    "\n",
    "Starting from an R data frame this time:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "d7f48405",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-03-30T23:54:50.571038Z",
     "iopub.status.busy": "2024-03-30T23:54:50.570842Z",
     "iopub.status.idle": "2024-03-30T23:54:50.577068Z",
     "shell.execute_reply": "2024-03-30T23:54:50.576428Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       "        <span>R/rpy2 DataFrame (3 x 2)</span>\n",
       "        <table>\n",
       "          <thead>\n",
       "            <tr>\n",
       "              \n",
       "              <th>int_values</th>\n",
       "              \n",
       "              <th>str_values</th>\n",
       "              \n",
       "            </tr>\n",
       "          </thead>\n",
       "          <tbody>\n",
       "          \n",
       "          <tr>\n",
       "            \n",
       "            <td>\n",
       "              ...\n",
       "            </td>\n",
       "            \n",
       "            <td>\n",
       "              ...\n",
       "            </td>\n",
       "            \n",
       "          </tr>\n",
       "          \n",
       "          </tbody>\n",
       "        </table>\n",
       "    "
      ],
      "text/plain": [
       "<rpy2.robjects.vectors.DataFrame object at 0x7f047299e140> [RTYPES.VECSXP]\n",
       "R classes: ('data.frame',)\n",
       "[IntSexpVector, StrSexpVector]\n",
       "  int_values: <class 'rpy2.rinterface.IntSexpVector'>\n",
       "  <rpy2.rinterface.IntSexpVector object at 0x7f04729a04c0> [RTYPES.INTSXP]\n",
       "  str_values: <class 'rpy2.rinterface_lib.sexp.StrSexpVector'>\n",
       "  <rpy2.rinterface_lib.sexp.StrSexpVector object at 0x7f047298ca80> [RTYPES.STRSXP]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "r_df = ro.DataFrame({'int_values': ro.IntVector([1,2,3]),\n",
    "                     'str_values': ro.StrVector(['abc', 'def', 'ghi'])})\n",
    "\n",
    "r_df"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "de3f6fda",
   "metadata": {},
   "source": [
    "It can be converted to a pandas data frame using the same converter:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "50ead97f",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-03-30T23:54:50.579395Z",
     "iopub.status.busy": "2024-03-30T23:54:50.579007Z",
     "iopub.status.idle": "2024-03-30T23:54:50.587152Z",
     "shell.execute_reply": "2024-03-30T23:54:50.586572Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>int_values</th>\n",
       "      <th>str_values</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>abc</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>def</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>ghi</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   int_values str_values\n",
       "1           1        abc\n",
       "2           2        def\n",
       "3           3        ghi"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "with (ro.default_converter + pandas2ri.converter).context():\n",
    "  pd_from_r_df = ro.conversion.get_conversion().rpy2py(r_df)\n",
    "\n",
    "pd_from_r_df"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7b71d58a",
   "metadata": {},
   "source": [
    "## Date and time objects"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "e17cf2de",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-03-30T23:54:50.589511Z",
     "iopub.status.busy": "2024-03-30T23:54:50.589127Z",
     "iopub.status.idle": "2024-03-30T23:54:50.595679Z",
     "shell.execute_reply": "2024-03-30T23:54:50.595033Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Timestamp</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2017-01-01 00:00:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2017-01-01 00:00:01</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2017-01-01 00:00:02</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2017-01-01 00:00:03</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2017-01-01 00:00:04</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>2017-01-01 00:00:05</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>2017-01-01 00:00:06</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>2017-01-01 00:00:07</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>2017-01-01 00:00:08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>2017-01-01 00:00:09</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            Timestamp\n",
       "0 2017-01-01 00:00:00\n",
       "1 2017-01-01 00:00:01\n",
       "2 2017-01-01 00:00:02\n",
       "3 2017-01-01 00:00:03\n",
       "4 2017-01-01 00:00:04\n",
       "5 2017-01-01 00:00:05\n",
       "6 2017-01-01 00:00:06\n",
       "7 2017-01-01 00:00:07\n",
       "8 2017-01-01 00:00:08\n",
       "9 2017-01-01 00:00:09"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd_df = pd.DataFrame({\n",
    "    'Timestamp': pd.date_range('2017-01-01 00:00:00', periods=10, freq='s')\n",
    "    })\n",
    "    \n",
    "pd_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "5180cb77",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-03-30T23:54:50.598024Z",
     "iopub.status.busy": "2024-03-30T23:54:50.597657Z",
     "iopub.status.idle": "2024-03-30T23:54:50.604998Z",
     "shell.execute_reply": "2024-03-30T23:54:50.604365Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       "        <span>R/rpy2 DataFrame (10 x 1)</span>\n",
       "        <table>\n",
       "          <thead>\n",
       "            <tr>\n",
       "              \n",
       "              <th>Timestamp</th>\n",
       "              \n",
       "            </tr>\n",
       "          </thead>\n",
       "          <tbody>\n",
       "          \n",
       "          <tr>\n",
       "            \n",
       "            <td>\n",
       "              ...\n",
       "            </td>\n",
       "            \n",
       "          </tr>\n",
       "          \n",
       "          </tbody>\n",
       "        </table>\n",
       "    "
      ],
      "text/plain": [
       "<rpy2.robjects.vectors.DataFrame object at 0x7f04729a1180> [RTYPES.VECSXP]\n",
       "R classes: ('data.frame',)\n",
       "[FloatSexpVector]\n",
       "  Timestamp: <class 'rpy2.rinterface.FloatSexpVector'>\n",
       "  <rpy2.rinterface.FloatSexpVector object at 0x7f0472ae7bc0> [RTYPES.REALSXP]"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "with (ro.default_converter + pandas2ri.converter).context():\n",
    "  r_from_pd_df = ro.conversion.py2rpy(pd_df)\n",
    "\n",
    "r_from_pd_df"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "282a5fbd",
   "metadata": {},
   "source": [
    "The timezone used for conversion is the system's default timezone unless\n",
    "`rpy2.robjects.vectors.default_timezone` is specified...\n",
    "or unless the time zone is specified in the original time object:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "d74e4b23",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-03-30T23:54:50.607247Z",
     "iopub.status.busy": "2024-03-30T23:54:50.607053Z",
     "iopub.status.idle": "2024-03-30T23:54:50.614493Z",
     "shell.execute_reply": "2024-03-30T23:54:50.613862Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       "        <span>R/rpy2 DataFrame (10 x 1)</span>\n",
       "        <table>\n",
       "          <thead>\n",
       "            <tr>\n",
       "              \n",
       "              <th>Timestamp</th>\n",
       "              \n",
       "            </tr>\n",
       "          </thead>\n",
       "          <tbody>\n",
       "          \n",
       "          <tr>\n",
       "            \n",
       "            <td>\n",
       "              ...\n",
       "            </td>\n",
       "            \n",
       "          </tr>\n",
       "          \n",
       "          </tbody>\n",
       "        </table>\n",
       "    "
      ],
      "text/plain": [
       "<rpy2.robjects.vectors.DataFrame object at 0x7f04729854c0> [RTYPES.VECSXP]\n",
       "R classes: ('data.frame',)\n",
       "[FloatSexpVector]\n",
       "  Timestamp: <class 'rpy2.rinterface.FloatSexpVector'>\n",
       "  <rpy2.rinterface.FloatSexpVector object at 0x7f04728fff00> [RTYPES.REALSXP]"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd_tz_df = pd.DataFrame({\n",
    "    'Timestamp': pd.date_range('2017-01-01 00:00:00', periods=10, freq='s',\n",
    "                               tz='UTC')\n",
    "    })\n",
    "    \n",
    "with (ro.default_converter + pandas2ri.converter).context():\n",
    "  r_from_pd_tz_df = ro.conversion.py2rpy(pd_tz_df)\n",
    "\n",
    "r_from_pd_tz_df"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}