資料的分群(groupby)運算¶
在這個網頁中，我們介紹 pandas 中 .groupby() 的使用方式。
In [ ]:
Copied!
import pandas as pd
import numpy as np
import pandas as pd
import numpy as np
產生虛擬資料集¶
In [ ]:
Copied!
keys = np.random.choice(['A','B','C'], 20)
keys = np.random.choice(['A','B','C'], 20)
In [ ]:
Copied!
keys
keys
Out[ ]:
array(['B', 'C', 'C', 'B', 'B', 'B', 'B', 'B', 'C', 'C', 'A', 'B', 'B',
'A', 'C', 'B', 'C', 'C', 'A', 'B'], dtype='<U1')
In [ ]:
Copied!
values = np.random.randint(0, 10, 20)
values = np.random.randint(0, 10, 20)
In [ ]:
Copied!
df = pd.DataFrame(zip(keys, values))
df = pd.DataFrame(zip(keys, values))
In [ ]:
Copied!
df
df
Out[ ]:
| | 0 | 1 |
|---|---|---|
| 0 | B | 4 |
| 1 | C | 7 |
| 2 | C | 5 |
| 3 | B | 9 |
| 4 | B | 7 |
| 5 | B | 4 |
| 6 | B | 9 |
| 7 | B | 3 |
| 8 | C | 5 |
| 9 | C | 0 |
| 10 | A | 1 |
| 11 | B | 4 |
| 12 | B | 2 |
| 13 | A | 2 |
| 14 | C | 1 |
| 15 | B | 2 |
| 16 | C | 6 |
| 17 | C | 2 |
| 18 | A | 5 |
| 19 | B | 8 |
In [ ]:
Copied!
df.columns = ['keys','values']
df.columns = ['keys','values']
In [ ]:
Copied!
df
df
Out[ ]:
| | keys | values |
|---|---|---|
| 0 | B | 4 |
| 1 | C | 7 |
| 2 | C | 5 |
| 3 | B | 9 |
| 4 | B | 7 |
| 5 | B | 4 |
| 6 | B | 9 |
| 7 | B | 3 |
| 8 | C | 5 |
| 9 | C | 0 |
| 10 | A | 1 |
| 11 | B | 4 |
| 12 | B | 2 |
| 13 | A | 2 |
| 14 | C | 1 |
| 15 | B | 2 |
| 16 | C | 6 |
| 17 | C | 2 |
| 18 | A | 5 |
| 19 | B | 8 |
In [ ]:
Copied!
df.sort_values('keys')
df.sort_values('keys')
Out[ ]:
| | keys | values |
|---|---|---|
| 13 | A | 2 |
| 10 | A | 1 |
| 18 | A | 5 |
| 0 | B | 4 |
| 15 | B | 2 |
| 12 | B | 2 |
| 11 | B | 4 |
| 19 | B | 8 |
| 6 | B | 9 |
| 5 | B | 4 |
| 4 | B | 7 |
| 3 | B | 9 |
| 7 | B | 3 |
| 8 | C | 5 |
| 2 | C | 5 |
| 14 | C | 1 |
| 1 | C | 7 |
| 16 | C | 6 |
| 17 | C | 2 |
| 9 | C | 0 |
.groupby()¶
In [ ]:
Copied!
grps = df.groupby('keys')
grps = df.groupby('keys')
In [ ]:
Copied!
grps.get_group('A')
grps.get_group('A')
Out[ ]:
| | keys | values |
|---|---|---|
| 10 | A | 1 |
| 13 | A | 2 |
| 18 | A | 5 |
In [ ]:
Copied!
# Use string aggregation names: passing the builtin `sum` is deprecated in pandas >= 2.1.
grps.agg(['count', 'sum'])
# Use string aggregation names: passing the builtin `sum` is deprecated in pandas >= 2.1.
grps.agg(['count', 'sum'])
Out[ ]:
| keys | values: count | values: sum |
|---|---|---|
| A | 3 | 8 |
| B | 10 | 52 |
| C | 7 | 26 |
In [ ]:
Copied!
tuple(grps)
tuple(grps)
Out[ ]:
(('A',
keys values
10 A 1
13 A 2
18 A 5),
('B',
keys values
0 B 4
3 B 9
4 B 7
5 B 4
6 B 9
7 B 3
11 B 4
12 B 2
15 B 2
19 B 8),
('C',
keys values
1 C 7
2 C 5
8 C 5
9 C 0
14 C 1
16 C 6
17 C 2))
In [ ]:
Copied!
dict(tuple(grps))
dict(tuple(grps))
Out[ ]:
{'A': keys values
10 A 1
13 A 2
18 A 5,
'B': keys values
0 B 4
3 B 9
4 B 7
5 B 4
6 B 9
7 B 3
11 B 4
12 B 2
15 B 2
19 B 8,
'C': keys values
1 C 7
2 C 5
8 C 5
9 C 0
14 C 1
16 C 6
17 C 2}
In [ ]:
Copied!
df['keys'].unique()
df['keys'].unique()
Out[ ]:
array(['B', 'C', 'A'], dtype=object)