Python-100-Days/Day76-90/code/8-pandas分组聚合操作.ipynb

878 lines
22 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# Grouping, aggregation and pivoting\n",
"# Records often share similar attribute values, so they can be grouped\n",
"\n",
"import numpy as np\n",
"\n",
"import pandas as pd\n",
"\n",
"from pandas import Series,DataFrame"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Hand</th>\n",
" <th>Smoke</th>\n",
" <th>sex</th>\n",
" <th>weight</th>\n",
" <th>IQ</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>right</td>\n",
" <td>yes</td>\n",
" <td>male</td>\n",
" <td>80</td>\n",
" <td>100</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>left</td>\n",
" <td>yes</td>\n",
" <td>female</td>\n",
" <td>50</td>\n",
" <td>120</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>left</td>\n",
" <td>no</td>\n",
" <td>female</td>\n",
" <td>48</td>\n",
" <td>90</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>right</td>\n",
" <td>no</td>\n",
" <td>male</td>\n",
" <td>75</td>\n",
" <td>130</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>right</td>\n",
" <td>yes</td>\n",
" <td>male</td>\n",
" <td>68</td>\n",
" <td>140</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>right</td>\n",
" <td>no</td>\n",
" <td>male</td>\n",
" <td>100</td>\n",
" <td>80</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>right</td>\n",
" <td>no</td>\n",
" <td>female</td>\n",
" <td>40</td>\n",
" <td>94</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>right</td>\n",
" <td>no</td>\n",
" <td>female</td>\n",
" <td>90</td>\n",
" <td>110</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>left</td>\n",
" <td>no</td>\n",
" <td>male</td>\n",
" <td>88</td>\n",
" <td>100</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>right</td>\n",
" <td>yes</td>\n",
" <td>female</td>\n",
" <td>76</td>\n",
" <td>160</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Hand Smoke sex weight IQ\n",
"0 right yes male 80 100\n",
"1 left yes female 50 120\n",
"2 left no female 48 90\n",
"3 right no male 75 130\n",
"4 right yes male 68 140\n",
"5 right no male 100 80\n",
"6 right no female 40 94\n",
"7 right no female 90 110\n",
"8 left no male 88 100\n",
"9 right yes female 76 160"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 走右手习惯,是否抽烟,性别,对体重,智商,有一定影响\n",
"\n",
"df = DataFrame({'Hand':['right','left','left','right','right','right','right','right','left','right'],\n",
" 'Smoke':['yes','yes','no','no','yes','no','no','no','no','yes'],\n",
" 'sex':['male','female','female','male','male','male','female','female','male','female'],\n",
" 'weight':[80,50,48,75,68,100,40,90,88,76],\n",
" 'IQ':[100,120,90,130,140,80,94,110,100,160]})\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Group and aggregate to look for patterns under a given condition"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>weight</th>\n",
" <th>IQ</th>\n",
" </tr>\n",
" <tr>\n",
" <th>Hand</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>left</th>\n",
" <td>62.0</td>\n",
" <td>103.3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>right</th>\n",
" <td>75.6</td>\n",
" <td>116.3</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" weight IQ\n",
"Hand \n",
"left 62.0 103.3\n",
"right 75.6 116.3"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Mean weight and IQ for each handedness group, rounded to one decimal place.\n",
"data = (\n",
"    df.groupby(['Hand'])[['weight', 'IQ']]\n",
"    .mean()\n",
"    .round(1)\n",
")\n",
"data"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>weight</th>\n",
" </tr>\n",
" <tr>\n",
" <th>Hand</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>left</th>\n",
" <td>62.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>right</th>\n",
" <td>75.6</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" weight\n",
"Hand \n",
"left 62.0\n",
"right 75.6"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Per-hand mean weight via the built-in groupby reduction. np.mean applied to each\n",
"# group's sub-frame is version-dependent in pandas, while .mean() always reduces per column.\n",
"df.groupby(by=['Hand'])[['weight']].mean().round(1)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"# Broadcast each hand group's mean weight back onto its member rows (one value per row of df).\n",
"# The string alias 'mean' selects pandas' own group reduction instead of a NumPy callable.\n",
"df2 = df.groupby(by=['Hand'])[['weight']].transform('mean').round(1)\n"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>weight_mean</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>75.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>62.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>62.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>75.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>75.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>75.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>75.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>75.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>62.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>75.6</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" weight_mean\n",
"0 75.6\n",
"1 62.0\n",
"2 62.0\n",
"3 75.6\n",
"4 75.6\n",
"5 75.6\n",
"6 75.6\n",
"7 75.6\n",
"8 62.0\n",
"9 75.6"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Label the broadcast column as a mean. rename() is used rather than add_suffix() so that\n",
"# re-running this cell is a no-op instead of producing 'weight_mean_mean'.\n",
"df2 = df2.rename(columns={'weight': 'weight_mean'})\n",
"df2"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Hand</th>\n",
" <th>Smoke</th>\n",
" <th>sex</th>\n",
" <th>weight</th>\n",
" <th>IQ</th>\n",
" <th>weight_mean</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>right</td>\n",
" <td>yes</td>\n",
" <td>male</td>\n",
" <td>80</td>\n",
" <td>100</td>\n",
" <td>75.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>left</td>\n",
" <td>yes</td>\n",
" <td>female</td>\n",
" <td>50</td>\n",
" <td>120</td>\n",
" <td>62.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>left</td>\n",
" <td>no</td>\n",
" <td>female</td>\n",
" <td>48</td>\n",
" <td>90</td>\n",
" <td>62.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>right</td>\n",
" <td>no</td>\n",
" <td>male</td>\n",
" <td>75</td>\n",
" <td>130</td>\n",
" <td>75.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>right</td>\n",
" <td>yes</td>\n",
" <td>male</td>\n",
" <td>68</td>\n",
" <td>140</td>\n",
" <td>75.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>right</td>\n",
" <td>no</td>\n",
" <td>male</td>\n",
" <td>100</td>\n",
" <td>80</td>\n",
" <td>75.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>right</td>\n",
" <td>no</td>\n",
" <td>female</td>\n",
" <td>40</td>\n",
" <td>94</td>\n",
" <td>75.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>right</td>\n",
" <td>no</td>\n",
" <td>female</td>\n",
" <td>90</td>\n",
" <td>110</td>\n",
" <td>75.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>left</td>\n",
" <td>no</td>\n",
" <td>male</td>\n",
" <td>88</td>\n",
" <td>100</td>\n",
" <td>62.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>right</td>\n",
" <td>yes</td>\n",
" <td>female</td>\n",
" <td>76</td>\n",
" <td>160</td>\n",
" <td>75.6</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Hand Smoke sex weight IQ weight_mean\n",
"0 right yes male 80 100 75.6\n",
"1 left yes female 50 120 62.0\n",
"2 left no female 48 90 62.0\n",
"3 right no male 75 130 75.6\n",
"4 right yes male 68 140 75.6\n",
"5 right no male 100 80 75.6\n",
"6 right no female 40 94 75.6\n",
"7 right no female 90 110 75.6\n",
"8 left no male 88 100 62.0\n",
"9 right yes female 76 160 75.6"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Attach the per-group mean to the original rows by aligning both frames on the row index.\n",
"df3 = pd.merge(df, df2, how='inner', left_index=True, right_index=True)\n",
"df3"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"Hand\n",
"left ([3, 3], [62.0, 103.3])\n",
"right ([7, 7], [75.6, 116.3])\n",
"dtype: object"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def count(x):\n",
"    \"\"\"Summarise one group: per-column non-null counts and means rounded to one decimal.\n",
"\n",
"    Returns a 2-tuple ``(counts, means)``.\n",
"    \"\"\"\n",
"    non_null = x.count()\n",
"    averages = x.mean().round(1)\n",
"    return (non_null, averages)\n",
"\n",
"df.groupby(['Hand'])[['weight', 'IQ']].apply(count)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th></th>\n",
" <th>IQ</th>\n",
" </tr>\n",
" <tr>\n",
" <th>Hand</th>\n",
" <th>sex</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th rowspan=\"2\" valign=\"top\">left</th>\n",
" <th>female</th>\n",
" <td>120</td>\n",
" </tr>\n",
" <tr>\n",
" <th>male</th>\n",
" <td>100</td>\n",
" </tr>\n",
" <tr>\n",
" <th rowspan=\"2\" valign=\"top\">right</th>\n",
" <th>female</th>\n",
" <td>160</td>\n",
" </tr>\n",
" <tr>\n",
" <th>male</th>\n",
" <td>140</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" IQ\n",
"Hand sex \n",
"left female 120\n",
" male 100\n",
"right female 160\n",
" male 140"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Highest IQ within every (hand, sex) combination.\n",
"(\n",
"    df.groupby(['Hand', 'sex'])[['IQ']]\n",
"    .max()\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000019E24051EF0>"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Select several columns from a groupby with a list. The bare tuple form g['IQ', 'weight']\n",
"# was deprecated in pandas 1.0 and raises an error from pandas 2.0 on.\n",
"data = df.groupby(by=['Hand'])[['IQ', 'weight']]\n",
"data"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead tr th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe thead tr:last-of-type th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr>\n",
" <th></th>\n",
" <th colspan=\"2\" halign=\"left\">IQ</th>\n",
" <th colspan=\"2\" halign=\"left\">weight</th>\n",
" </tr>\n",
" <tr>\n",
" <th></th>\n",
" <th>max</th>\n",
" <th>mean</th>\n",
" <th>max</th>\n",
" <th>mean</th>\n",
" </tr>\n",
" <tr>\n",
" <th>Hand</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>left</th>\n",
" <td>120</td>\n",
" <td>103.3</td>\n",
" <td>88</td>\n",
" <td>62.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>right</th>\n",
" <td>160</td>\n",
" <td>116.3</td>\n",
" <td>100</td>\n",
" <td>75.6</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" IQ weight \n",
" max mean max mean\n",
"Hand \n",
"left 120 103.3 88 62.0\n",
"right 160 116.3 100 75.6"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Apply both reductions to every selected column; the result has two-level column labels.\n",
"data.aggregate(['max', 'mean']).round(1)"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>IQ</th>\n",
" <th>weight</th>\n",
" </tr>\n",
" <tr>\n",
" <th>Hand</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>left</th>\n",
" <td>120</td>\n",
" <td>62.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>right</th>\n",
" <td>160</td>\n",
" <td>75.6</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" IQ weight\n",
"Hand \n",
"left 120 62.0\n",
"right 160 75.6"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# One reduction per column: maximum IQ and mean weight for each hand group.\n",
"data.aggregate({'IQ': 'max', 'weight': 'mean'}).round(1)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}