Pandas Cheatsheet
Contents
Pandas Cheatsheet#
Pandas 는 테이블 형태의 자료를 쉽고 직관적으로 다룰 수 있는 Python 라이브러리 입니다. R, Spark 과 마찬가지로 DataFrame 형태의 데이터 조작 방식을 제공하기 때문에 기존에 다른 언어에서의 DataFrame 형태의 API를 사용했던 사람이라면 쉽게 적응하여 사용할 수 있습니다.
참조
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
print("numpy ver={}".format(np.__version__))
print("pandas ver={}".format(pd.__version__))
numpy ver=1.23.1
pandas ver=1.2.4
Series, DataFrame#
Series: 1차원 데이터(리스트 형태)를 표현하는 구조
DataFrame: 2차원 데이터(테이블 형태)를 표현하는 구조
# Series
s = pd.Series([1,3,5,np.nan,6,8])
print(s)
# DataFrame by Numpy Array
index = pd.date_range('20130101', periods=6)
print(index)
columns = ['A','B','C','D']
print(columns)
data = np.random.randn(6, 4)
print(data)
df = pd.DataFrame(data, index=index, columns=columns)
print(df)
# DataFrome by Dictonary
df2 = pd.DataFrame({
'A': 1.,
'B': pd.Timestamp('20130102'),
'C': pd.Series(1, index=list(range(4)), dtype='float32'),
'D': np.array([3] * 4, dtype='int32'),
'E': pd.Categorical(["test", "train", "test", "train"]),
'F': 'foo'
})
print(df2)
0 1.0
1 3.0
2 5.0
3 NaN
4 6.0
5 8.0
dtype: float64
DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
'2013-01-05', '2013-01-06'],
dtype='datetime64[ns]', freq='D')
['A', 'B', 'C', 'D']
[[-1.71938116 1.28631837 0.24284392 -0.93244523]
[-1.19110208 -0.35500583 0.82614074 -0.08154429]
[-0.24377431 0.63717415 -0.63062627 -0.08026694]
[-0.70823093 1.7716175 0.1022816 1.04184055]
[-1.04917555 -0.43138715 -0.75327522 0.31933443]
[-0.81170268 -1.28165698 1.15325045 -0.43975833]]
A B C D
2013-01-01 -1.719381 1.286318 0.242844 -0.932445
2013-01-02 -1.191102 -0.355006 0.826141 -0.081544
2013-01-03 -0.243774 0.637174 -0.630626 -0.080267
2013-01-04 -0.708231 1.771618 0.102282 1.041841
2013-01-05 -1.049176 -0.431387 -0.753275 0.319334
2013-01-06 -0.811703 -1.281657 1.153250 -0.439758
A B C D E F
0 1.0 2013-01-02 1.0 3 test foo
1 1.0 2013-01-02 1.0 3 train foo
2 1.0 2013-01-02 1.0 3 test foo
3 1.0 2013-01-02 1.0 3 train foo
CSV Read/Write#
# CSV
df = pd.DataFrame({
'A': 1.,
'B': pd.Timestamp('20130102'),
'C': pd.Series(1, index=list(range(4)), dtype='float32'),
'D': np.array([3] * 4, dtype='int32'),
'E': pd.Categorical(["test", "train", "test", "train"]),
'F': 'foo'
})
print(df)
path_csv = "/tmp/foo.csv"
df.to_csv(path_csv, index=False)
df = pd.read_csv(path_csv, header=0)
print(df)
A B C D E F
0 1.0 2013-01-02 1.0 3 test foo
1 1.0 2013-01-02 1.0 3 train foo
2 1.0 2013-01-02 1.0 3 test foo
3 1.0 2013-01-02 1.0 3 train foo
A B C D E F
0 1.0 2013-01-02 1.0 3 test foo
1 1.0 2013-01-02 1.0 3 train foo
2 1.0 2013-01-02 1.0 3 test foo
3 1.0 2013-01-02 1.0 3 train foo
Excel Read/Write#
# Excel
# openpyxl 패키지 설치 필요
# pip install openpyxl
df = pd.DataFrame({
'A': 1.,
'B': pd.Timestamp('20130102'),
'C': pd.Series(1, index=list(range(4)), dtype='float32'),
'D': np.array([3] * 4, dtype='int32'),
'E': pd.Categorical(["test", "train", "test", "train"]),
'F': 'foo'
})
print(df)
path_csv = "/tmp/foo.xlsx"
df.to_excel(path_csv, index=False)
df = pd.read_excel(path_csv, header=0)
print(df)
A B C D E F
0 1.0 2013-01-02 1.0 3 test foo
1 1.0 2013-01-02 1.0 3 train foo
2 1.0 2013-01-02 1.0 3 test foo
3 1.0 2013-01-02 1.0 3 train foo
A B C D E F
0 1 2013-01-02 1 3 test foo
1 1 2013-01-02 1 3 train foo
2 1 2013-01-02 1 3 test foo
3 1 2013-01-02 1 3 train foo
JSON Read/Write#
df = pd.DataFrame([['a', 'b'], ['c', 'd']], index=['row 1', 'row 2'], columns=['col 1', 'col 2'])
print(df)
path_json = "/tmp/foo.json"
# '{"row 1":{"col 1":"a","col 2":"b"},"row 2":{"col 1":"c","col 2":"d"}}'
#df.to_json(path_json, orient='index')
# '[{"col 1":"a","col 2":"b"},{"col 1":"c","col 2":"d"}]'
df.to_json(path_json, orient='records')
#df = pd.read_json(path_json, orient='index')
df = pd.read_json(path_json, orient='records')
print(df)
col 1 col 2
row 1 a b
row 2 c d
col 1 col 2
0 a b
1 c d
DataFrame 정보보기#
df = pd.DataFrame(data, index=index, columns=columns)
print(df)
# DataFrame 컬럼 정보
print(df2.dtypes)
print(df2.info())
A B C D
2013-01-01 -1.719381 1.286318 0.242844 -0.932445
2013-01-02 -1.191102 -0.355006 0.826141 -0.081544
2013-01-03 -0.243774 0.637174 -0.630626 -0.080267
2013-01-04 -0.708231 1.771618 0.102282 1.041841
2013-01-05 -1.049176 -0.431387 -0.753275 0.319334
2013-01-06 -0.811703 -1.281657 1.153250 -0.439758
A float64
B datetime64[ns]
C float32
D int32
E category
F object
dtype: object
<class 'pandas.core.frame.DataFrame'>
Int64Index: 4 entries, 0 to 3
Data columns (total 6 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 A 4 non-null float64
1 B 4 non-null datetime64[ns]
2 C 4 non-null float32
3 D 4 non-null int32
4 E 4 non-null category
5 F 4 non-null object
dtypes: category(1), datetime64[ns](1), float32(1), float64(1), int32(1), object(1)
memory usage: 288.0+ bytes
None
# 데이터 살펴보기
print(df.head())
print(df.tail(3))
# DataFrame 요소 살펴보기
print(df.shape)
print(df.index)
print(df.columns)
print(df.values)
print(type(df.values))
A B C D
2013-01-01 -1.719381 1.286318 0.242844 -0.932445
2013-01-02 -1.191102 -0.355006 0.826141 -0.081544
2013-01-03 -0.243774 0.637174 -0.630626 -0.080267
2013-01-04 -0.708231 1.771618 0.102282 1.041841
2013-01-05 -1.049176 -0.431387 -0.753275 0.319334
A B C D
2013-01-04 -0.708231 1.771618 0.102282 1.041841
2013-01-05 -1.049176 -0.431387 -0.753275 0.319334
2013-01-06 -0.811703 -1.281657 1.153250 -0.439758
(6, 4)
DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
'2013-01-05', '2013-01-06'],
dtype='datetime64[ns]', freq='D')
Index(['A', 'B', 'C', 'D'], dtype='object')
[[-1.71938116 1.28631837 0.24284392 -0.93244523]
[-1.19110208 -0.35500583 0.82614074 -0.08154429]
[-0.24377431 0.63717415 -0.63062627 -0.08026694]
[-0.70823093 1.7716175 0.1022816 1.04184055]
[-1.04917555 -0.43138715 -0.75327522 0.31933443]
[-0.81170268 -1.28165698 1.15325045 -0.43975833]]
<class 'numpy.ndarray'>
DataFrame 조작하기#
Copy#
# Dataframe 은 주소 참조를 하기 때문에 원본 데이터가 수정되지 않기를 바란다면
# copy() 함수를 통해 명시적 복사를 진행해야 함
df2 = df.copy()
# 복사 후 달라진 주소 확인
print(f"df addr={hex(id(df))}, df2 addr={hex(id(df2))}")
df addr=0x125e17820, df2 addr=0x125dbf9d0
Selection#
# 행 슬라이싱
print(df[0:2])
# 열 선택
print(df['A'].head())
# 열 다중선택
print(df[['A', 'B']].head())
# 데이터 타입으로 열 선택
df3 = pd.DataFrame({'a': [1, 2] * 3,
'b': [True, False] * 3,
'c': [1.0, 2.0] * 3})
print(df3.select_dtypes(include=['bool']))
print(df3.select_dtypes(include=['float64']))
print(df3.select_dtypes(include=['int64']))
A B C D
2013-01-01 -1.719381 1.286318 0.242844 -0.932445
2013-01-02 -1.191102 -0.355006 0.826141 -0.081544
2013-01-01 -1.719381
2013-01-02 -1.191102
2013-01-03 -0.243774
2013-01-04 -0.708231
2013-01-05 -1.049176
Freq: D, Name: A, dtype: float64
A B
2013-01-01 -1.719381 1.286318
2013-01-02 -1.191102 -0.355006
2013-01-03 -0.243774 0.637174
2013-01-04 -0.708231 1.771618
2013-01-05 -1.049176 -0.431387
b
0 True
1 False
2 True
3 False
4 True
5 False
c
0 1.0
1 2.0
2 1.0
3 2.0
4 1.0
5 2.0
a
0 1
1 2
2 1
3 2
4 1
5 2
# Label 을 이용한 선택 .loc()
# 셀 선택
print(df.loc['2013-01-01', 'A'])
# 행 선택
print(df.loc['2013-01-01'])
# 행 슬라이싱
print(df.loc['2013-01-01':'2013-01-03'])
# 열 선택
print(df.loc[:, 'A'].head())
# 열 다중 선택
print(df.loc[:, ['A', 'B']].head())
# 행, 열 다중 선택
print(df.loc['2013-01-01':'2013-01-03', ['A', 'B']])
-0.4065751958736635
A -0.406575
B 1.154904
C 0.283025
D -0.533548
Name: 2013-01-01 00:00:00, dtype: float64
A B C D
2013-01-01 -0.406575 1.154904 0.283025 -0.533548
2013-01-02 0.310146 -0.977560 0.045798 1.600883
2013-01-03 -1.454818 -0.212861 1.080810 -0.198280
2013-01-01 -0.406575
2013-01-02 0.310146
2013-01-03 -1.454818
2013-01-04 -1.109016
2013-01-05 -0.183504
Freq: D, Name: A, dtype: float64
A B
2013-01-01 -0.406575 1.154904
2013-01-02 0.310146 -0.977560
2013-01-03 -1.454818 -0.212861
2013-01-04 -1.109016 -1.420387
2013-01-05 -0.183504 -0.079999
A B
2013-01-01 -0.406575 1.154904
2013-01-02 0.310146 -0.977560
2013-01-03 -1.454818 -0.212861
# Position을 이용한 선택 .iloc()
# 셀 선택
print(df.iloc[0, 0])
# 행 선택
print(df.iloc[0])
# 행 슬라이싱
print(df.iloc[0:3])
# 열 선택
print(df.iloc[:, 0].head())
# 열 다중 선택
print(df.iloc[:, 0:2].head())
# 행, 열 다중 선택
print(df.iloc[0:3, 0:2])
print(df.iloc[[1, 3], [0, 1]])
-0.4065751958736635
A -0.406575
B 1.154904
C 0.283025
D -0.533548
Name: 2013-01-01 00:00:00, dtype: float64
A B C D
2013-01-01 -0.406575 1.154904 0.283025 -0.533548
2013-01-02 0.310146 -0.977560 0.045798 1.600883
2013-01-03 -1.454818 -0.212861 1.080810 -0.198280
2013-01-01 -0.406575
2013-01-02 0.310146
2013-01-03 -1.454818
2013-01-04 -1.109016
2013-01-05 -0.183504
Freq: D, Name: A, dtype: float64
A B
2013-01-01 -0.406575 1.154904
2013-01-02 0.310146 -0.977560
2013-01-03 -1.454818 -0.212861
2013-01-04 -1.109016 -1.420387
2013-01-05 -0.183504 -0.079999
A B
2013-01-01 -0.406575 1.154904
2013-01-02 0.310146 -0.977560
2013-01-03 -1.454818 -0.212861
A B
2013-01-02 0.310146 -0.977560
2013-01-04 -1.109016 -1.420387
# 조건을 이용한 선택
# 열 단위
print(df[df['A'] > 0][['B', 'C']])
print(df.loc[df['A'] > 0, ['B', 'C']])
# DataFrame 단위
print(df[df > 0])
# Boolean 리스트를 이용
df2 = df.copy()
df2['E'] = ['one', 'one', 'two', 'three', 'four', 'three']
index = df2['E'].isin(['two','four'])
print(index)
print(df2[df2['E'].isin(['two','four'])])
print(df2.loc[df2['E'].isin(['two','four'])])
B C
2013-01-02 -0.97756 0.045798
B C
2013-01-02 -0.97756 0.045798
A B C D
2013-01-01 NaN 1.154904 0.283025 NaN
2013-01-02 0.310146 NaN 0.045798 1.600883
2013-01-03 NaN NaN 1.080810 NaN
2013-01-04 NaN NaN NaN NaN
2013-01-05 NaN NaN NaN NaN
2013-01-06 NaN NaN NaN 0.090299
2013-01-01 False
2013-01-02 False
2013-01-03 True
2013-01-04 False
2013-01-05 True
2013-01-06 False
Freq: D, Name: E, dtype: bool
A B C D E
2013-01-03 -1.454818 -0.212861 1.080810 -0.198280 two
2013-01-05 -0.183504 -0.079999 -0.992413 -0.213163 four
A B C D E
2013-01-03 -1.454818 -0.212861 1.080810 -0.198280 two
2013-01-05 -0.183504 -0.079999 -0.992413 -0.213163 four
Setting#
# 값 설정
s1 = pd.Series([1,2,3,4,5,6], index=pd.date_range('20130102', periods=6))
df['F'] = s1
df['G'] = s1
print(df)
# 행 삭제: index 기준
df = df.drop([pd.to_datetime('2013-01-04')], axis=0)
print(df)
# # 열 삭제: 열 이름기준
df = df.drop(['G'], axis=1)
print(df)
# Label에 의한 값 설정
df.at['2013-01-02', 'F'] = 0
# Position에 의한 값 설정
df.iat[2,-1] = 1
print(df)
# 배열을 이용한 값 설정
df.loc['2013-01-02':, 'F'] = np.array([5] * (len(df) - 1))
print(df)
# 조건을 통한 값 설정
df2 = df.copy()
df2[df2 > 0] = -df2
print(df2)
A B C D F G
2013-01-01 -0.406575 1.154904 0.283025 -0.533548 NaN NaN
2013-01-02 0.310146 -0.977560 0.045798 1.600883 1.0 1.0
2013-01-03 -1.454818 -0.212861 1.080810 -0.198280 2.0 2.0
2013-01-04 -1.109016 -1.420387 -0.340734 -0.438138 3.0 3.0
2013-01-05 -0.183504 -0.079999 -0.992413 -0.213163 4.0 4.0
2013-01-06 -0.133029 -0.770604 -2.005459 0.090299 5.0 5.0
A B C D F G
2013-01-01 -0.406575 1.154904 0.283025 -0.533548 NaN NaN
2013-01-02 0.310146 -0.977560 0.045798 1.600883 1.0 1.0
2013-01-03 -1.454818 -0.212861 1.080810 -0.198280 2.0 2.0
2013-01-05 -0.183504 -0.079999 -0.992413 -0.213163 4.0 4.0
2013-01-06 -0.133029 -0.770604 -2.005459 0.090299 5.0 5.0
A B C D F
2013-01-01 -0.406575 1.154904 0.283025 -0.533548 NaN
2013-01-02 0.310146 -0.977560 0.045798 1.600883 1.0
2013-01-03 -1.454818 -0.212861 1.080810 -0.198280 2.0
2013-01-05 -0.183504 -0.079999 -0.992413 -0.213163 4.0
2013-01-06 -0.133029 -0.770604 -2.005459 0.090299 5.0
A B C D F
2013-01-01 -0.406575 1.154904 0.283025 -0.533548 NaN
2013-01-02 0.310146 -0.977560 0.045798 1.600883 0.0
2013-01-03 -1.454818 -0.212861 1.080810 -0.198280 1.0
2013-01-05 -0.183504 -0.079999 -0.992413 -0.213163 4.0
2013-01-06 -0.133029 -0.770604 -2.005459 0.090299 5.0
A B C D F
2013-01-01 -0.406575 1.154904 0.283025 -0.533548 NaN
2013-01-02 0.310146 -0.977560 0.045798 1.600883 5.0
2013-01-03 -1.454818 -0.212861 1.080810 -0.198280 5.0
2013-01-05 -0.183504 -0.079999 -0.992413 -0.213163 5.0
2013-01-06 -0.133029 -0.770604 -2.005459 0.090299 5.0
A B C D F
2013-01-01 -0.406575 -1.154904 -0.283025 -0.533548 NaN
2013-01-02 -0.310146 -0.977560 -0.045798 -1.600883 -5.0
2013-01-03 -1.454818 -0.212861 -1.080810 -0.198280 -5.0
2013-01-05 -0.183504 -0.079999 -0.992413 -0.213163 -5.0
2013-01-06 -0.133029 -0.770604 -2.005459 -0.090299 -5.0
Missing Value#
index = pd.date_range('20130101', periods=6)
df1 = df.reindex(index=index[0:4], columns=list(df.columns) + ['E'])
df1.loc[index[0]:index[1],'E'] = 1
print(df1)
# 결측치 제거하기
print(df1.dropna(how='any'))
# 결측치 채워넣기
print(df1.fillna(value=5))
# 결측치 위치보기
print(df1.isna())
print(df1.isnull()) # isna() 와 같음
# 결측치는 제외한 위치보기
print(df1.notna())
print(df1.notnull())
A B C D F E
2013-01-01 0.523658 0.548980 -0.968327 0.487522 NaN 1.0
2013-01-02 1.086258 0.904897 -0.446926 -0.504047 5.0 1.0
2013-01-03 0.236893 -0.094955 -0.280539 -0.162157 5.0 NaN
2013-01-04 0.438455 0.918198 -0.338696 -0.565120 5.0 NaN
A B C D F E
2013-01-02 1.086258 0.904897 -0.446926 -0.504047 5.0 1.0
A B C D F E
2013-01-01 0.523658 0.548980 -0.968327 0.487522 5.0 1.0
2013-01-02 1.086258 0.904897 -0.446926 -0.504047 5.0 1.0
2013-01-03 0.236893 -0.094955 -0.280539 -0.162157 5.0 5.0
2013-01-04 0.438455 0.918198 -0.338696 -0.565120 5.0 5.0
A B C D F E
2013-01-01 False False False False True False
2013-01-02 False False False False False False
2013-01-03 False False False False False True
2013-01-04 False False False False False True
A B C D F E
2013-01-01 False False False False True False
2013-01-02 False False False False False False
2013-01-03 False False False False False True
2013-01-04 False False False False False True
A B C D F E
2013-01-01 True True True True False True
2013-01-02 True True True True True True
2013-01-03 True True True True True False
2013-01-04 True True True True True False
A B C D F E
2013-01-01 True True True True False True
2013-01-02 True True True True True True
2013-01-03 True True True True True False
2013-01-04 True True True True True False
Operation#
# DataFrame 통계확인
print(df.describe())
# 열 별 통계
print(df.mean())
# 행 별 통계
print(df.mean(1))
# DataFrame 연산
s = pd.Series([1,3,5,np.nan,6,8], index=index).shift(2)
print(s)
print(df.sub(s, axis='index'))
A B C D F
count 6.000000 6.000000 6.000000 6.000000 5.0
mean 0.332269 0.574651 -0.414428 -0.299727 5.0
std 0.507378 0.623868 0.296350 0.425299 0.0
min -0.465602 -0.211383 -0.968327 -0.685344 5.0
25% 0.189688 0.066029 -0.425532 -0.549852 5.0
50% 0.337674 0.726939 -0.350023 -0.436632 5.0
75% 0.502357 0.914873 -0.295078 -0.213922 5.0
max 1.086258 1.382171 -0.090733 0.487522 5.0
A 0.332269
B 0.574651
C -0.414428
D -0.299727
F 5.000000
dtype: float64
2013-01-01 0.147958
2013-01-02 1.208036
2013-01-03 0.939848
2013-01-04 1.090567
2013-01-05 1.165111
2013-01-06 0.709387
Freq: D, dtype: float64
2013-01-01 NaN
2013-01-02 NaN
2013-01-03 1.0
2013-01-04 3.0
2013-01-05 5.0
2013-01-06 NaN
Freq: D, dtype: float64
A B C D F
2013-01-01 NaN NaN NaN NaN NaN
2013-01-02 NaN NaN NaN NaN NaN
2013-01-03 -0.763107 -1.094955 -1.280539 -1.162157 4.0
2013-01-04 -2.561545 -2.081802 -3.338696 -3.565120 2.0
2013-01-05 -4.826047 -3.617829 -5.361350 -5.369217 0.0
2013-01-06 NaN NaN NaN NaN NaN
# DataFrame Transpose(전치, 열과행치환)
print(df.T)
2013-01-01 2013-01-02 2013-01-03 2013-01-04 2013-01-05 2013-01-06
A 0.523658 1.086258 0.236893 0.438455 0.173953 -0.465602
B 0.548980 0.904897 -0.094955 0.918198 1.382171 -0.211383
C -0.968327 -0.446926 -0.280539 -0.338696 -0.361350 -0.090733
D 0.487522 -0.504047 -0.162157 -0.565120 -0.369217 -0.685344
F NaN 5.000000 5.000000 5.000000 5.000000 5.000000
# 축별로 정렬
print(df.sort_index()) # 오름차순
print(df.sort_index(axis=1, ascending=False)) #내림차순
# 값별로 정렬
print(df.sort_values(by='B'))
print(df.sort_values(by=['B', 'C'])) # 복수 컬럼
A B C D F
2013-01-01 0.523658 0.548980 -0.968327 0.487522 NaN
2013-01-02 1.086258 0.904897 -0.446926 -0.504047 5.0
2013-01-03 0.236893 -0.094955 -0.280539 -0.162157 5.0
2013-01-04 0.438455 0.918198 -0.338696 -0.565120 5.0
2013-01-05 0.173953 1.382171 -0.361350 -0.369217 5.0
2013-01-06 -0.465602 -0.211383 -0.090733 -0.685344 5.0
F D C B A
2013-01-01 NaN 0.487522 -0.968327 0.548980 0.523658
2013-01-02 5.0 -0.504047 -0.446926 0.904897 1.086258
2013-01-03 5.0 -0.162157 -0.280539 -0.094955 0.236893
2013-01-04 5.0 -0.565120 -0.338696 0.918198 0.438455
2013-01-05 5.0 -0.369217 -0.361350 1.382171 0.173953
2013-01-06 5.0 -0.685344 -0.090733 -0.211383 -0.465602
A B C D F
2013-01-06 -0.465602 -0.211383 -0.090733 -0.685344 5.0
2013-01-03 0.236893 -0.094955 -0.280539 -0.162157 5.0
2013-01-01 0.523658 0.548980 -0.968327 0.487522 NaN
2013-01-02 1.086258 0.904897 -0.446926 -0.504047 5.0
2013-01-04 0.438455 0.918198 -0.338696 -0.565120 5.0
2013-01-05 0.173953 1.382171 -0.361350 -0.369217 5.0
A B C D F
2013-01-06 -0.465602 -0.211383 -0.090733 -0.685344 5.0
2013-01-03 0.236893 -0.094955 -0.280539 -0.162157 5.0
2013-01-01 0.523658 0.548980 -0.968327 0.487522 NaN
2013-01-02 1.086258 0.904897 -0.446926 -0.504047 5.0
2013-01-04 0.438455 0.918198 -0.338696 -0.565120 5.0
2013-01-05 0.173953 1.382171 -0.361350 -0.369217 5.0
# apply: 데이터에 함수 적용
# 열별 적용
print(df.apply(np.cumsum))
print(df.apply(lambda x: x.max() - x.min()))
# 행별 적용
print(df.apply(np.cumsum, axis=1))
print(df.apply(lambda x: x.max() - x.min(), axis=1))
A B C D F
2013-01-01 0.523658 0.548980 -0.968327 0.487522 NaN
2013-01-02 1.609916 1.453877 -1.415253 -0.016525 5.0
2013-01-03 1.846809 1.358922 -1.695792 -0.178682 10.0
2013-01-04 2.285264 2.277120 -2.034487 -0.743803 15.0
2013-01-05 2.459216 3.659291 -2.395837 -1.113020 20.0
2013-01-06 1.993614 3.447907 -2.486570 -1.798364 25.0
A 1.551860
B 1.593554
C 0.877594
D 1.172866
F 0.000000
dtype: float64
A B C D F
2013-01-01 0.523658 1.072638 0.104311 0.591833 NaN
2013-01-02 1.086258 1.991155 1.544229 1.040182 6.040182
2013-01-03 0.236893 0.141937 -0.138601 -0.300759 4.699241
2013-01-04 0.438455 1.356653 1.017957 0.452837 5.452837
2013-01-05 0.173953 1.556124 1.194774 0.825556 5.825556
2013-01-06 -0.465602 -0.676985 -0.767718 -1.453063 3.546937
2013-01-01 1.517308
2013-01-02 5.504047
2013-01-03 5.280539
2013-01-04 5.565120
2013-01-05 5.369217
2013-01-06 5.685344
Freq: D, dtype: float64
# Histogramming: 데이터 값 세기
s = pd.Series(np.random.randint(0, 7, size=10))
print(s)
print(s.value_counts())
0 6
1 6
2 3
3 5
4 3
5 6
6 3
7 6
8 0
9 2
dtype: int64
6 4
3 3
0 1
2 1
5 1
dtype: int64
# 문자열 함수
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
print(s.str.lower())
0 a
1 b
2 c
3 aaba
4 baca
5 NaN
6 caba
7 dog
8 cat
dtype: object
Merge#
상세 참조자료: https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html
# Concat: 단순히 두 DataFrame을 연결만 하기 때문에 의도한 결과와 같은지 확인이 필요합니다.
df = pd.DataFrame(np.random.randn(10, 3))
a = df[:2]
b = df[3:5]
print(a)
print(b)
# 행 기반 연결
print(pd.concat([a, b]))
print(pd.concat([a, b], ignore_index=True))
# 열 기반 연결
b.columns = [3, 4, 5]
b.reset_index(inplace=True, drop=True)
print(pd.concat([a, b], axis=1))
0 1 2
0 1.312877 0.573515 -0.834520
1 -0.695523 -0.110469 0.302099
0 1 2
3 -0.264594 -1.256058 -0.873582
4 0.836012 1.234542 0.567087
0 1 2
0 1.312877 0.573515 -0.834520
1 -0.695523 -0.110469 0.302099
3 -0.264594 -1.256058 -0.873582
4 0.836012 1.234542 0.567087
0 1 2
0 1.312877 0.573515 -0.834520
1 -0.695523 -0.110469 0.302099
2 -0.264594 -1.256058 -0.873582
3 0.836012 1.234542 0.567087
0 1 2 3 4 5
0 1.312877 0.573515 -0.834520 -0.264594 -1.256058 -0.873582
1 -0.695523 -0.110469 0.302099 0.836012 1.234542 0.567087
# Append: 행 기반 concat 과 같습니다.
df = pd.DataFrame(np.random.randn(3, 3), columns=['A', 'B', 'C'])
print(df)
s = df.iloc[1]
print(df.append(s, ignore_index=True))
A B C
0 0.645724 -0.023637 0.529198
1 -1.660055 -0.120990 -0.693897
2 -0.368514 0.095203 0.262218
A B C
0 0.645724 -0.023637 0.529198
1 -1.660055 -0.120990 -0.693897
2 -0.368514 0.095203 0.262218
3 -1.660055 -0.120990 -0.693897
# Join: SQL 에서 지원하는 방식과 유사
left = pd.DataFrame({'key': ['foo', 'bar'], 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo', 'bar'], 'rval': [4, 5]})
print(left)
print(right)
print(pd.merge(left, right, on= 'key'))
key lval
0 foo 1
1 bar 2
key rval
0 foo 4
1 bar 5
key lval rval
0 foo 1 4
1 bar 2 5
# GroupBy: SQL 에서 지원되는 방식과 유사
df = pd.DataFrame({
'A' : ['foo', 'bar', 'foo', 'bar'],
'B' : ['one', 'one', 'two', 'three'],
'C' : np.random.randn(4),
'D' : np.random.randn(4)
})
print(df)
print(df.groupby('A').sum())
print(df.groupby(['A','B']).sum())
A B C D
0 foo one -0.630498 -0.304689
1 bar one 0.374077 -0.940406
2 foo two -1.610434 0.304611
3 bar three -0.281375 -0.808427
C D
A
bar 0.092702 -1.748833
foo -2.240932 -0.000078
C D
A B
bar one 0.374077 -0.940406
three -0.281375 -0.808427
foo one -0.630498 -0.304689
two -1.610434 0.304611
Reshaping#
# Stack
index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'),
('two', 'a'), ('two', 'b')])
s = pd.Series(np.arange(1.0, 5.0), index=index)
print(s)
df = s.unstack()
print(df)
print(df.stack())
# Pivot
df = pd.DataFrame({'A' : ['one', 'one', 'two', 'three'] * 3,
'B' : ['A', 'B', 'C'] * 4,
'C' : ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
'D' : np.random.randn(12),
'E' : np.random.randn(12)})
print(df)
print(pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C']))
one a 1.0
b 2.0
two a 3.0
b 4.0
dtype: float64
a b
one 1.0 2.0
two 3.0 4.0
one a 1.0
b 2.0
two a 3.0
b 4.0
dtype: float64
A B C D E
0 one A foo -0.680790 -0.787647
1 one B foo 0.602496 0.539262
2 two C foo 0.370922 2.524820
3 three A bar 0.574524 0.245856
4 one B bar 0.829092 0.172843
5 one C bar 1.191064 -0.697159
6 two A foo 1.270573 0.385478
7 three B foo -1.950049 1.068935
8 one C foo -0.617827 0.609768
9 one A bar -0.684763 0.722808
10 two B bar 2.060348 2.418646
11 three C bar -1.293355 1.708552
C bar foo
A B
one A -0.684763 -0.680790
B 0.829092 0.602496
C 1.191064 -0.617827
three A 0.574524 NaN
B NaN -1.950049
C -1.293355 NaN
two A NaN 1.270573
B 2.060348 NaN
C NaN 0.370922
Time Series#
Time Series 데이터를 다룰 때 필요한 편리한 기능을 제공합니다.
상세참조자료: https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html
import datetime
# 다양한 방법으로 DatetimeIndex 생성
dti = pd.to_datetime(['1/1/2018', np.datetime64('2018-01-01'), datetime.datetime(2018, 1, 1)])
print(dti)
dti = pd.date_range('2018-01-01', periods=2, freq='H')
print(dti)
# DateTimeIndex 로 변환
print(pd.to_datetime(pd.Series(['Jul 31, 2009', '2010-01-10', None])))
print(pd.to_datetime(['2005/11/23', '2010.12.31']))
print(pd.to_datetime([1349720105, 1349806505], unit='s')) # Epoch Timestamp Sec
print(pd.to_datetime([1349720105100, 1349720105200], unit='ms')) # Epoch Timestamp Millisec
# DateTimeIndex 로 변환시 에러 처리
try:
pd.to_datetime(['2009/07/31', 'asd'], errors='raise')
except Exception as e:
print("ERROR: {}".format(e))
pd.to_datetime(['2009/07/31', 'asd'], errors='ignore')
# Timezone 설정
dti = dti.tz_localize('UTC')
print(dti)
# Timezone 변경
dti = dti.tz_convert('Asia/Seoul')
print(dti)
DatetimeIndex(['2018-01-01', '2018-01-01', '2018-01-01'], dtype='datetime64[ns]', freq=None)
DatetimeIndex(['2018-01-01 00:00:00', '2018-01-01 01:00:00'], dtype='datetime64[ns]', freq='H')
0 2009-07-31
1 2010-01-10
2 NaT
dtype: datetime64[ns]
DatetimeIndex(['2005-11-23', '2010-12-31'], dtype='datetime64[ns]', freq=None)
DatetimeIndex(['2012-10-08 18:15:05', '2012-10-09 18:15:05'], dtype='datetime64[ns]', freq=None)
DatetimeIndex(['2012-10-08 18:15:05.100000', '2012-10-08 18:15:05.200000'], dtype='datetime64[ns]', freq=None)
ERROR: Unknown string format: asd
DatetimeIndex(['2018-01-01 00:00:00+00:00', '2018-01-01 01:00:00+00:00'], dtype='datetime64[ns, UTC]', freq='H')
DatetimeIndex(['2018-01-01 09:00:00+09:00', '2018-01-01 10:00:00+09:00'], dtype='datetime64[ns, Asia/Seoul]', freq='H')
# 시간단위 샘플링
idx = pd.date_range('2018-01-01', periods=5, freq='D')
ts = pd.Series(range(len(idx)), index=idx)
print(ts)
print(ts.resample('8H').mean())
print(ts.resample('2D').mean())
2018-01-01 0
2018-01-02 1
2018-01-03 2
2018-01-04 3
2018-01-05 4
Freq: D, dtype: int64
2018-01-01 00:00:00 0.0
2018-01-01 08:00:00 NaN
2018-01-01 16:00:00 NaN
2018-01-02 00:00:00 1.0
2018-01-02 08:00:00 NaN
2018-01-02 16:00:00 NaN
2018-01-03 00:00:00 2.0
2018-01-03 08:00:00 NaN
2018-01-03 16:00:00 NaN
2018-01-04 00:00:00 3.0
2018-01-04 08:00:00 NaN
2018-01-04 16:00:00 NaN
2018-01-05 00:00:00 4.0
Freq: 8H, dtype: float64
2018-01-01 0.5
2018-01-03 2.5
2018-01-05 4.0
Freq: 2D, dtype: float64
## Version 0.24.1 이상
print(pd.__version__)
if pd.__version__ >= "0.24.1":
# 시간 계산
friday = pd.Timestamp('2018-01-05')
print(friday.day_name())
# Add 1 day
saturday = friday + pd.Timedelta('1 day')
print(saturday.day_name())
# Add 1 business day (Friday --> Monday)
monday = friday + pd.offsets.BDay()
print(monday.day_name())
0.24.1
Friday
Saturday
Monday