Commit 103de8ea authored by Andrea Dotti's avatar Andrea Dotti

Add pandas slides

Few additional slides on pandas dataframes
parent 90fca0d3
......@@ -1921,8 +1921,2286 @@
},
{
"cell_type": "markdown",
"metadata": {},
"source": []
"metadata": {
"slideshow": {
"slide_type": "subslide"
}
},
"source": [
"## Pandas\n",
"Pandas is a high-performance, high-level library that provides tools for data analysis. \n",
"It relies on the concept of a DataFrame: a structured collection of data organized in records. This is the same concept as ROOT's `NTuple`, which you are already familiar with. \n",
"The name `DataFrame` comes from R's `data.frame`.\n",
"<img src=\"dataframe.jpg\" style=\"width:50%\">"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"ExecuteTime": {
"end_time": "2019-05-22T05:22:59.271351Z",
"start_time": "2019-05-22T05:22:58.329324Z"
},
"slideshow": {
"slide_type": "subslide"
}
},
"outputs": [
{
"data": {
"text/plain": [
"0 1.0\n",
"1 2.0\n",
"2 3.0\n",
"3 NaN\n",
"4 5.0\n",
"dtype: float64"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"s = pd.Series( [1., 2., 3., np.nan, 5. ])\n",
"s"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"ExecuteTime": {
"end_time": "2019-05-22T05:30:35.292763Z",
"start_time": "2019-05-22T05:30:35.282802Z"
},
"slideshow": {
"slide_type": "fragment"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Col1</th>\n",
" <th>Col2</th>\n",
" <th>Col3</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1.0</td>\n",
" <td>a</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2.0</td>\n",
" <td>b</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3.0</td>\n",
" <td>c</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4.0</td>\n",
" <td>d</td>\n",
" <td>True</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Col1 Col2 Col3\n",
"0 1.0 a True\n",
"1 2.0 b False\n",
"2 3.0 c True\n",
"3 4.0 d True"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.DataFrame( \n",
" {\n",
" 'Col1': [1.,2.,3.,4.],\n",
" 'Col2': [\"a\",\"b\",\"c\",\"d\"],\n",
" 'Col3': [True, False, True, True]\n",
" }\n",
")\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"ExecuteTime": {
"end_time": "2019-05-22T05:30:35.706138Z",
"start_time": "2019-05-22T05:30:35.701816Z"
},
"slideshow": {
"slide_type": "subslide"
}
},
"outputs": [
{
"data": {
"text/plain": [
"Col1 float64\n",
"Col2 object\n",
"Col3 bool\n",
"dtype: object"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.dtypes"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"ExecuteTime": {
"end_time": "2019-05-22T05:30:36.109721Z",
"start_time": "2019-05-22T05:30:36.106099Z"
},
"slideshow": {
"slide_type": "-"
}
},
"outputs": [
{
"data": {
"text/plain": [
"Index(['Col1', 'Col2', 'Col3'], dtype='object')"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.columns"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"ExecuteTime": {
"end_time": "2019-05-22T05:30:36.496734Z",
"start_time": "2019-05-22T05:30:36.493469Z"
},
"slideshow": {
"slide_type": "-"
}
},
"outputs": [
{
"data": {
"text/plain": [
"RangeIndex(start=0, stop=4, step=1)"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.index"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "subslide"
}
},
"source": [
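"### View data\n",
"Note: in the example below, column `B` is computed as `2**x` on NumPy 64-bit integers, which silently overflow for `x >= 63`; this is why `tail()` shows `0` and `describe()` reports a negative minimum."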
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"ExecuteTime": {
"end_time": "2019-05-22T05:32:42.342884Z",
"start_time": "2019-05-22T05:32:42.333534Z"
},
"slideshow": {
"slide_type": "-"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>A</th>\n",
" <th>B</th>\n",
" <th>C</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>a</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>a</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0</td>\n",
" <td>4</td>\n",
" <td>a</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>9</td>\n",
" <td>8</td>\n",
" <td>a</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>3</td>\n",
" <td>16</td>\n",
" <td>a</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" A B C\n",
"0 4 1 a\n",
"1 2 2 a\n",
"2 0 4 a\n",
"3 9 8 a\n",
"4 3 16 a"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"\n",
"df = pd.DataFrame( {'A':np.random.randint(0,10,100), 'B': [2**x for x in np.arange(100)], 'C':\"a\"})\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"ExecuteTime": {
"end_time": "2019-05-22T05:32:43.172086Z",
"start_time": "2019-05-22T05:32:43.164644Z"
},
"slideshow": {
"slide_type": "-"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>A</th>\n",
" <th>B</th>\n",
" <th>C</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>98</th>\n",
" <td>5</td>\n",
" <td>0</td>\n",
" <td>a</td>\n",
" </tr>\n",
" <tr>\n",
" <th>99</th>\n",
" <td>9</td>\n",
" <td>0</td>\n",
" <td>a</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" A B C\n",
"98 5 0 a\n",
"99 9 0 a"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.tail(2)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"ExecuteTime": {
"end_time": "2019-05-22T05:32:43.603458Z",
"start_time": "2019-05-22T05:32:43.589088Z"
},
"slideshow": {
"slide_type": "subslide"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>A</th>\n",
" <th>B</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>100.000000</td>\n",
" <td>1.000000e+02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>4.420000</td>\n",
" <td>-2.560000e+00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>2.985419</td>\n",
" <td>1.070389e+18</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>0.000000</td>\n",
" <td>-9.223372e+18</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>2.000000</td>\n",
" <td>0.000000e+00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>5.000000</td>\n",
" <td>6.144000e+03</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>7.000000</td>\n",
" <td>1.717987e+11</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>9.000000</td>\n",
" <td>4.611686e+18</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" A B\n",
"count 100.000000 1.000000e+02\n",
"mean 4.420000 -2.560000e+00\n",
"std 2.985419 1.070389e+18\n",
"min 0.000000 -9.223372e+18\n",
"25% 2.000000 0.000000e+00\n",
"50% 5.000000 6.144000e+03\n",
"75% 7.000000 1.717987e+11\n",
"max 9.000000 4.611686e+18"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.describe()"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "subslide"
}
},
"source": [
"### Select data"
]
},
{
"cell_type": "code",
"execution_count": 86,
"metadata": {
"ExecuteTime": {
"end_time": "2019-05-22T06:03:05.623182Z",
"start_time": "2019-05-22T06:03:05.611815Z"
},
"slideshow": {
"slide_type": "-"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>A</th>\n",
" <th>B</th>\n",
" <th>C</th>\n",
" <th>D</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>2019-05-27</th>\n",
" <td>0.944420</td>\n",
" <td>0.075201</td>\n",
" <td>0.167932</td>\n",
" <td>0.017186</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2019-05-28</th>\n",
" <td>0.245307</td>\n",
" <td>0.577804</td>\n",
" <td>0.132167</td>\n",
" <td>0.372844</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2019-05-29</th>\n",
" <td>0.459021</td>\n",
" <td>0.087459</td>\n",
" <td>0.647909</td>\n",
" <td>0.963480</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2019-05-30</th>\n",
" <td>0.244232</td>\n",
" <td>0.261606</td>\n",
" <td>0.109693</td>\n",
" <td>0.494399</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2019-05-31</th>\n",
" <td>0.575183</td>\n",
" <td>0.584652</td>\n",
" <td>0.113913</td>\n",
" <td>0.117457</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2019-06-01</th>\n",
" <td>0.190005</td>\n",
" <td>0.692712</td>\n",
" <td>0.404453</td>\n",
" <td>0.995082</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2019-06-02</th>\n",
" <td>0.931300</td>\n",
" <td>0.489561</td>\n",
" <td>0.193387</td>\n",
" <td>0.327648</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" A B C D\n",
"2019-05-27 0.944420 0.075201 0.167932 0.017186\n",
"2019-05-28 0.245307 0.577804 0.132167 0.372844\n",
"2019-05-29 0.459021 0.087459 0.647909 0.963480\n",
"2019-05-30 0.244232 0.261606 0.109693 0.494399\n",
"2019-05-31 0.575183 0.584652 0.113913 0.117457\n",
"2019-06-01 0.190005 0.692712 0.404453 0.995082\n",
"2019-06-02 0.931300 0.489561 0.193387 0.327648"
]
},
"execution_count": 86,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dates = pd.date_range('20190527',periods=7)\n",
"df = pd.DataFrame( np.random.rand(7,4), index=dates, columns=['A','B','C','D'])\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {
"ExecuteTime": {
"end_time": "2019-05-22T05:41:36.738799Z",
"start_time": "2019-05-22T05:41:36.733900Z"
},
"slideshow": {
"slide_type": "fragment"
}
},
"outputs": [
{
"data": {
"text/plain": [
"2019-05-27 0.632271\n",
"2019-05-28 0.691208\n",
"2019-05-29 0.603331\n",
"2019-05-30 0.043723\n",
"2019-05-31 0.552101\n",
"2019-06-01 0.330455\n",
"2019-06-02 0.841736\n",
"Freq: D, Name: A, dtype: float64"
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['A'] # or df.A"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {
"ExecuteTime": {
"end_time": "2019-05-22T05:41:37.220332Z",
"start_time": "2019-05-22T05:41:37.212416Z"
},
"slideshow": {
"slide_type": "subslide"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",