import numpy as np
import numpy.random as random
import scipy as sp
import pandas as pd
from pandas import Series, DataFrame
# IPython line magic (not plain Python): format floats in cell output with 3 decimal places.
%precision 3
'%.3f'
Pandas
まずは代表的なデータ構造である Series から。
Series は、各要素にラベル(インデックス)が付いた 1 次元配列のようなものと理解している。
データ型
# Build a Series from a plain list of ints; no index is given, so pandas
# assigns the default integer labels.
multiples_of_ten = list(range(0, 100, 10))
sample_data = pd.Series(multiples_of_ten)
print(sample_data)
0 0
1 10
2 20
3 30
4 40
5 50
6 60
7 70
8 80
9 90
dtype: int64
# A Series with explicit string labels 'a'..'j' instead of the default
# integer index.
row_labels = list('abcdefghij')
sample_indexed_data = pd.Series(list(range(0, 100, 10)), index=row_labels)
sample_indexed_data
a 0
b 10
c 20
d 30
e 40
f 50
g 60
h 70
i 80
j 90
dtype: int64
# A Series exposes its data and its labels as two separate attributes.
series_values = sample_indexed_data.values
series_labels = sample_indexed_data.index
print(f'values={series_values}')
print(f'indexes={series_labels}')
values=[ 0 10 20 30 40 50 60 70 80 90]
indexes=Index(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'], dtype='object')
Excel にたとえると、Series はシート内の 1 列分(行ラベル付きの 1 次元データ)に相当すると考えられそう。
そして、複数の列を束ねたシート全体(2 次元の表)に当たるものには DataFrame を使うようだ。
# Column-oriented source data: every key becomes one DataFrame column and
# every list holds that column's values, one per row.
attri_data1 = dict(
    ID=['100', '101', '102', '103', '104'],
    City=['Tokyo', 'Osaka', 'Kyoto', 'Hookaido', 'Tokyo'],
    Birth_year=[1990, 1989, 1992, 1997, 1982],
    Name=['Hiroshi', 'Akiko', 'Yuki', 'Satoru', 'Stive'],
)
attri_data_frame1 = pd.DataFrame(attri_data1)
attri_data_frame1
|
ID |
City |
Birth_year |
Name |
0 |
100 |
Tokyo |
1990 |
Hiroshi |
1 |
101 |
Osaka |
1989 |
Akiko |
2 |
102 |
Kyoto |
1992 |
Yuki |
3 |
103 |
Hookaido |
1997 |
Satoru |
4 |
104 |
Tokyo |
1982 |
Stive |
# Same data, but with explicit row labels 'a'..'e' instead of 0..4.
# NOTE(review): "drame" in the variable name looks like a typo for "frame";
# it is kept as-is here because other cells may refer to it.
row_labels = list('abcde')
attri_data_drame_index1 = pd.DataFrame(attri_data1, index=row_labels)
attri_data_drame_index1
|
ID |
City |
Birth_year |
Name |
a |
100 |
Tokyo |
1990 |
Hiroshi |
b |
101 |
Osaka |
1989 |
Akiko |
c |
102 |
Kyoto |
1992 |
Yuki |
d |
103 |
Hookaido |
1997 |
Satoru |
e |
104 |
Tokyo |
1982 |
Stive |
DataFrame 操作系
# Swap rows and columns (same as the .T shorthand).
attri_data_frame1.transpose()
|
0 |
1 |
2 |
3 |
4 |
ID |
100 |
101 |
102 |
103 |
104 |
City |
Tokyo |
Osaka |
Kyoto |
Hookaido |
Tokyo |
Birth_year |
1990 |
1989 |
1992 |
1997 |
1982 |
Name |
Hiroshi |
Akiko |
Yuki |
Satoru |
Stive |
# Keep every row but only the listed columns.
wanted_columns = ['City', 'Name']
attri_data_frame1.loc[:, wanted_columns]
|
City |
Name |
0 |
Tokyo |
Hiroshi |
1 |
Osaka |
Akiko |
2 |
Kyoto |
Yuki |
3 |
Hookaido |
Satoru |
4 |
Tokyo |
Stive |
# Boolean-mask filtering: keep only the rows whose City equals 'Tokyo'.
lives_in_tokyo = attri_data_frame1['City'] == 'Tokyo'
attri_data_frame1[lives_in_tokyo]
|
ID |
City |
Birth_year |
Name |
0 |
100 |
Tokyo |
1990 |
Hiroshi |
4 |
104 |
Tokyo |
1982 |
Stive |
# The comparison alone yields a boolean Series, one True/False per row.
attri_data_frame1['City'].eq('Tokyo')
0 True
1 False
2 False
3 False
4 True
Name: City, dtype: bool
ふと思ったが、この出力は bool 型の Series ではないか?
ということは、自分で作った bool の Series を渡しても同じように行を絞り込めるはず。
# A hand-built boolean Series works as a row mask too: only rows 0 and 1 are True.
first_two_rows = Series([True] * 2 + [False] * 3)
attri_data_frame1[first_two_rows]
|
ID |
City |
Birth_year |
Name |
0 |
100 |
Tokyo |
1990 |
Hiroshi |
1 |
101 |
Osaka |
1989 |
Akiko |
なるほど、True に対応する行だけが取り出される。
他にも、次のような条件で絞り込める。
# Keep the rows whose City is any one of the listed values.
target_cities = ['Tokyo', 'Osaka']
in_target_city = attri_data_frame1['City'].isin(target_cities)
attri_data_frame1[in_target_city]
|
ID |
City |
Birth_year |
Name |
0 |
100 |
Tokyo |
1990 |
Hiroshi |
1 |
101 |
Osaka |
1989 |
Akiko |
4 |
104 |
Tokyo |
1982 |
Stive |
# Numeric comparison mask: keep only the rows with Birth_year earlier than 1990.
born_before_1990 = attri_data_frame1['Birth_year'] < 1990
attri_data_frame1[born_before_1990]
|
ID |
City |
Birth_year |
Name |
1 |
101 |
Osaka |
1989 |
Akiko |
4 |
104 |
Tokyo |
1982 |
Stive |
データ削除、結合、集計、変更
# Return a copy of the frame without the Birth_year column; the result is
# not assigned back, so attri_data_frame1 itself keeps the column.
attri_data_frame1.drop(columns=['Birth_year'])
|
ID |
City |
Name |
0 |
100 |
Tokyo |
Hiroshi |
1 |
101 |
Osaka |
Akiko |
2 |
102 |
Kyoto |
Yuki |
3 |
103 |
Hookaido |
Satoru |
4 |
104 |
Tokyo |
Stive |
# Display the frame again: the earlier drop() result was not assigned back,
# so Birth_year is still present here.
attri_data_frame1
|
ID |
City |
Birth_year |
Name |
0 |
100 |
Tokyo |
1990 |
Hiroshi |
1 |
101 |
Osaka |
1989 |
Akiko |
2 |
102 |
Kyoto |
1992 |
Yuki |
3 |
103 |
Hookaido |
1997 |
Satoru |
4 |
104 |
Tokyo |
1982 |
Stive |
# Second table: test scores and gender keyed by ID. Only IDs 100-102 also
# appear in the first table's literal data.
attri_data2 = dict(
    ID=['100', '101', '102', '105', '107'],
    Math=[50, 43, 33, 76, 98],
    English=[90, 30, 20, 50, 30],
    Gender=['M', 'F', 'F', 'M', 'M'],
)
attri_data_frame2 = pd.DataFrame(attri_data2)
attri_data_frame2
|
ID |
Math |
English |
Gender |
0 |
100 |
50 |
90 |
M |
1 |
101 |
43 |
30 |
F |
2 |
102 |
33 |
20 |
F |
3 |
105 |
76 |
50 |
M |
4 |
107 |
98 |
30 |
M |
# Join the two tables. 'ID' is the only column they share, so spelling out
# on='ID', how='inner' matches what the pd.merge defaults do here.
attri_data_frame1.merge(attri_data_frame2, on='ID', how='inner')
|
ID |
City |
Birth_year |
Name |
Math |
English |
Gender |
0 |
100 |
Tokyo |
1990 |
Hiroshi |
50 |
90 |
M |
1 |
101 |
Osaka |
1989 |
Akiko |
43 |
30 |
F |
2 |
102 |
Kyoto |
1992 |
Yuki |
33 |
20 |
F |
なるほど、デフォルトでは共通の列(ここでは ID)をキーにした内部結合になり、両方の表に存在する ID(積集合)の行だけが残る。
# Average Math score within each Gender group.
math_by_gender = attri_data_frame2.groupby('Gender')['Math']
math_by_gender.mean()
Gender
F 38.000000
M 74.666667
Name: Math, dtype: float64
# Cell-wise membership test over the whole frame: True wherever a cell's
# value is one of the listed values.
wanted_values = ['Tokyo']
attri_data_frame1.isin(wanted_values)
|
ID |
City |
Birth_year |
Name |
0 |
False |
True |
False |
False |
1 |
False |
False |
False |
False |
2 |
False |
False |
False |
False |
3 |
False |
False |
False |
False |
4 |
False |
True |
False |
False |
# Overwrite the whole Name column with NaN so there is missing data to detect.
attri_data_frame1['Name'] = np.nan
# Cell-wise missing-value check: True marks a missing cell.
pd.isnull(attri_data_frame1)
|
ID |
City |
Birth_year |
Name |
0 |
False |
False |
False |
True |
1 |
False |
False |
False |
True |
2 |
False |
False |
False |
True |
3 |
False |
False |
False |
True |
4 |
False |
False |
False |
True |
# Count missing cells per column (each True in the mask adds 1 to the sum).
missing_mask = pd.isnull(attri_data_frame1)
missing_mask.sum()
ID 0
City 0
Birth_year 0
Name 5
dtype: int64
「あれ、こういうメソッドなかったっけ?」と思って試してみたメソッド集
# Print a summary of the frame: index range, per-column non-null counts and
# dtypes, and memory usage.
attri_data_frame1.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 4 columns):
ID 5 non-null object
City 5 non-null object
Birth_year 5 non-null int64
Name 0 non-null float64
dtypes: float64(1), int64(1), object(2)
memory usage: 240.0+ bytes
# Summarise only the object-dtype columns ('O' selects object dtype), giving
# count / unique / top / freq. Here that is ID and City; Name is float64
# after being overwritten with NaN.
attri_data_frame1.describe(include=['O'])
|
ID |
City |
count |
5 |
5 |
unique |
5 |
4 |
top |
100 |
Tokyo |
freq |
1 |
2 |
練習問題
# Practice data set; this rebinds attri_data1 / attri_data_frame1.
attri_data1 = dict(
    ID=['1', '2', '3', '4', '5'],
    Sex=['F', 'F', 'M', 'M', 'F'],
    Money=[1000, 2000, 500, 300, 700],
    Name=['Saito', 'Horie', 'Kondo', 'Kawada', 'Matsubara'],
)
attri_data_frame1 = pd.DataFrame(attri_data1)
# Keep only the rows whose Money is strictly greater than 500.
has_over_500 = attri_data_frame1['Money'] > 500
attri_data_frame1[has_over_500]
|
ID |
Sex |
Money |
Name |
0 |
1 |
F |
1000 |
Saito |
1 |
2 |
F |
2000 |
Horie |
4 |
5 |
F |
700 |
Matsubara |
# Mean Money per Sex; as_index=False keeps Sex as an ordinary column
# instead of turning it into the result's index.
sex_and_money = attri_data_frame1.loc[:, ['Sex', 'Money']]
sex_and_money.groupby('Sex', as_index=False).mean()
|
Sex |
Money |
0 |
F |
1233.333333 |
1 |
M |
400.000000 |
# Practice score table; this rebinds attri_data2 / attri_data_frame2.
attri_data2 = dict(
    ID=['3', '4', '7'],
    Math=[60, 30, 40],
    English=[80, 20, 30],
)
attri_data_frame2 = pd.DataFrame(attri_data2)
# Join the two practice tables with the pd.merge defaults: an inner join on
# their shared 'ID' column.
combined = pd.merge(attri_data_frame1, attri_data_frame2)
# 'ID' holds strings, so averaging it is meaningless: older pandas
# concatenates the values ('3' + '4' -> '34') and reports 17.0, while
# pandas 2.x raises TypeError. Restrict the mean to the numeric columns
# (Money, Math, English).
combined.mean(numeric_only=True)
ID 17.0
Money 400.0
Math 45.0
English 50.0
dtype: float64