Pandas Cheatsheet#

Pandas 는 테이블 형태의 자료를 쉽고 직관적으로 다룰 수 있는 Python 라이브러리 입니다. R, Spark 과 마찬가지로 DataFrame 형태의 데이터 조작 방식을 제공하기 때문에 기존에 다른 언어에서의 DataFrame 형태의 API를 사용했던 사람이라면 쉽게 적응하여 사용할 수 있습니다.

참조

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

print("numpy ver={}".format(np.__version__))
print("pandas ver={}".format(pd.__version__))
numpy ver=1.23.1
pandas ver=1.2.4

Series, DataFrame#

  • Series: 1차원 데이터(리스트 형태)를 표현하는 구조

  • DataFrame: 2차원 데이터(테이블 형태)를 표현하는 구조

# Series
s = pd.Series([1,3,5,np.nan,6,8])
print(s)

# DataFrame by Numpy Array
index = pd.date_range('20130101', periods=6)
print(index)

columns = ['A','B','C','D']
print(columns)

data = np.random.randn(6, 4)
print(data)

df = pd.DataFrame(data, index=index, columns=columns)
print(df)

# DataFrome by Dictonary
df2 = pd.DataFrame({
  'A': 1.,
  'B': pd.Timestamp('20130102'),
  'C': pd.Series(1, index=list(range(4)), dtype='float32'),
  'D': np.array([3] * 4, dtype='int32'),
  'E': pd.Categorical(["test", "train", "test", "train"]),
  'F': 'foo'
})
print(df2)
0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64
DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')
['A', 'B', 'C', 'D']
[[-1.71938116  1.28631837  0.24284392 -0.93244523]
 [-1.19110208 -0.35500583  0.82614074 -0.08154429]
 [-0.24377431  0.63717415 -0.63062627 -0.08026694]
 [-0.70823093  1.7716175   0.1022816   1.04184055]
 [-1.04917555 -0.43138715 -0.75327522  0.31933443]
 [-0.81170268 -1.28165698  1.15325045 -0.43975833]]
                   A         B         C         D
2013-01-01 -1.719381  1.286318  0.242844 -0.932445
2013-01-02 -1.191102 -0.355006  0.826141 -0.081544
2013-01-03 -0.243774  0.637174 -0.630626 -0.080267
2013-01-04 -0.708231  1.771618  0.102282  1.041841
2013-01-05 -1.049176 -0.431387 -0.753275  0.319334
2013-01-06 -0.811703 -1.281657  1.153250 -0.439758
     A          B    C  D      E    F
0  1.0 2013-01-02  1.0  3   test  foo
1  1.0 2013-01-02  1.0  3  train  foo
2  1.0 2013-01-02  1.0  3   test  foo
3  1.0 2013-01-02  1.0  3  train  foo

CSV Read/Write#

# CSV
df = pd.DataFrame({
  'A': 1.,
  'B': pd.Timestamp('20130102'),
  'C': pd.Series(1, index=list(range(4)), dtype='float32'),
  'D': np.array([3] * 4, dtype='int32'),
  'E': pd.Categorical(["test", "train", "test", "train"]),
  'F': 'foo'
})
print(df)
path_csv = "/tmp/foo.csv"
df.to_csv(path_csv, index=False)

df = pd.read_csv(path_csv, header=0)
print(df)
     A          B    C  D      E    F
0  1.0 2013-01-02  1.0  3   test  foo
1  1.0 2013-01-02  1.0  3  train  foo
2  1.0 2013-01-02  1.0  3   test  foo
3  1.0 2013-01-02  1.0  3  train  foo
     A           B    C  D      E    F
0  1.0  2013-01-02  1.0  3   test  foo
1  1.0  2013-01-02  1.0  3  train  foo
2  1.0  2013-01-02  1.0  3   test  foo
3  1.0  2013-01-02  1.0  3  train  foo

Excel Read/Write#

# Excel
# openpyxl 패키지 설치 필요
# pip install openpyxl 
df = pd.DataFrame({
  'A': 1.,
  'B': pd.Timestamp('20130102'),
  'C': pd.Series(1, index=list(range(4)), dtype='float32'),
  'D': np.array([3] * 4, dtype='int32'),
  'E': pd.Categorical(["test", "train", "test", "train"]),
  'F': 'foo'
})
print(df)
path_csv = "/tmp/foo.xlsx"
df.to_excel(path_csv, index=False)

df = pd.read_excel(path_csv, header=0)
print(df)
     A          B    C  D      E    F
0  1.0 2013-01-02  1.0  3   test  foo
1  1.0 2013-01-02  1.0  3  train  foo
2  1.0 2013-01-02  1.0  3   test  foo
3  1.0 2013-01-02  1.0  3  train  foo
   A          B  C  D      E    F
0  1 2013-01-02  1  3   test  foo
1  1 2013-01-02  1  3  train  foo
2  1 2013-01-02  1  3   test  foo
3  1 2013-01-02  1  3  train  foo

JSON Read/Write#

df = pd.DataFrame([['a', 'b'], ['c', 'd']], index=['row 1', 'row 2'], columns=['col 1', 'col 2'])
print(df)

path_json = "/tmp/foo.json"
# '{"row 1":{"col 1":"a","col 2":"b"},"row 2":{"col 1":"c","col 2":"d"}}'
#df.to_json(path_json, orient='index')
# '[{"col 1":"a","col 2":"b"},{"col 1":"c","col 2":"d"}]'
df.to_json(path_json, orient='records')

#df = pd.read_json(path_json, orient='index')
df = pd.read_json(path_json, orient='records')
print(df)
      col 1 col 2
row 1     a     b
row 2     c     d
  col 1 col 2
0     a     b
1     c     d

DataFrame 정보보기#

df = pd.DataFrame(data, index=index, columns=columns)
print(df)

# DataFrame 컬럼 정보
print(df2.dtypes)
print(df2.info())
                   A         B         C         D
2013-01-01 -1.719381  1.286318  0.242844 -0.932445
2013-01-02 -1.191102 -0.355006  0.826141 -0.081544
2013-01-03 -0.243774  0.637174 -0.630626 -0.080267
2013-01-04 -0.708231  1.771618  0.102282  1.041841
2013-01-05 -1.049176 -0.431387 -0.753275  0.319334
2013-01-06 -0.811703 -1.281657  1.153250 -0.439758
A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object
<class 'pandas.core.frame.DataFrame'>
Int64Index: 4 entries, 0 to 3
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   A       4 non-null      float64       
 1   B       4 non-null      datetime64[ns]
 2   C       4 non-null      float32       
 3   D       4 non-null      int32         
 4   E       4 non-null      category      
 5   F       4 non-null      object        
dtypes: category(1), datetime64[ns](1), float32(1), float64(1), int32(1), object(1)
memory usage: 288.0+ bytes
None
# 데이터 살펴보기
print(df.head())
print(df.tail(3))

# DataFrame 요소 살펴보기
print(df.shape)
print(df.index)
print(df.columns)
print(df.values)
print(type(df.values))
                   A         B         C         D
2013-01-01 -1.719381  1.286318  0.242844 -0.932445
2013-01-02 -1.191102 -0.355006  0.826141 -0.081544
2013-01-03 -0.243774  0.637174 -0.630626 -0.080267
2013-01-04 -0.708231  1.771618  0.102282  1.041841
2013-01-05 -1.049176 -0.431387 -0.753275  0.319334
                   A         B         C         D
2013-01-04 -0.708231  1.771618  0.102282  1.041841
2013-01-05 -1.049176 -0.431387 -0.753275  0.319334
2013-01-06 -0.811703 -1.281657  1.153250 -0.439758
(6, 4)
DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')
Index(['A', 'B', 'C', 'D'], dtype='object')
[[-1.71938116  1.28631837  0.24284392 -0.93244523]
 [-1.19110208 -0.35500583  0.82614074 -0.08154429]
 [-0.24377431  0.63717415 -0.63062627 -0.08026694]
 [-0.70823093  1.7716175   0.1022816   1.04184055]
 [-1.04917555 -0.43138715 -0.75327522  0.31933443]
 [-0.81170268 -1.28165698  1.15325045 -0.43975833]]
<class 'numpy.ndarray'>

DataFrame 조작하기#

Copy#

# Dataframe 은 주소 참조를 하기 때문에 원본 데이터가 수정되지 않기를 바란다면
# copy() 함수를 통해 명시적 복사를 진행해야 함
df2 = df.copy()

# 복사 후 달라진 주소 확인
print(f"df addr={hex(id(df))}, df2 addr={hex(id(df2))}")
df addr=0x125e17820, df2 addr=0x125dbf9d0

Selection#

# 행 슬라이싱
print(df[0:2])

# 열 선택
print(df['A'].head())

# 열 다중선택
print(df[['A', 'B']].head())

# 데이터 타입으로 열 선택
df3 = pd.DataFrame({'a': [1, 2] * 3,
                   'b': [True, False] * 3,
                   'c': [1.0, 2.0] * 3})
print(df3.select_dtypes(include=['bool']))
print(df3.select_dtypes(include=['float64']))
print(df3.select_dtypes(include=['int64']))
                   A         B         C         D
2013-01-01 -1.719381  1.286318  0.242844 -0.932445
2013-01-02 -1.191102 -0.355006  0.826141 -0.081544
2013-01-01   -1.719381
2013-01-02   -1.191102
2013-01-03   -0.243774
2013-01-04   -0.708231
2013-01-05   -1.049176
Freq: D, Name: A, dtype: float64
                   A         B
2013-01-01 -1.719381  1.286318
2013-01-02 -1.191102 -0.355006
2013-01-03 -0.243774  0.637174
2013-01-04 -0.708231  1.771618
2013-01-05 -1.049176 -0.431387
       b
0   True
1  False
2   True
3  False
4   True
5  False
     c
0  1.0
1  2.0
2  1.0
3  2.0
4  1.0
5  2.0
   a
0  1
1  2
2  1
3  2
4  1
5  2
# Label 을 이용한 선택 .loc()

# 셀 선택
print(df.loc['2013-01-01', 'A'])

# 행 선택
print(df.loc['2013-01-01'])

# 행 슬라이싱
print(df.loc['2013-01-01':'2013-01-03'])

# 열 선택
print(df.loc[:, 'A'].head())

# 열 다중 선택
print(df.loc[:, ['A', 'B']].head())

# 행, 열 다중 선택
print(df.loc['2013-01-01':'2013-01-03', ['A', 'B']])
-0.4065751958736635
A   -0.406575
B    1.154904
C    0.283025
D   -0.533548
Name: 2013-01-01 00:00:00, dtype: float64
                   A         B         C         D
2013-01-01 -0.406575  1.154904  0.283025 -0.533548
2013-01-02  0.310146 -0.977560  0.045798  1.600883
2013-01-03 -1.454818 -0.212861  1.080810 -0.198280
2013-01-01   -0.406575
2013-01-02    0.310146
2013-01-03   -1.454818
2013-01-04   -1.109016
2013-01-05   -0.183504
Freq: D, Name: A, dtype: float64
                   A         B
2013-01-01 -0.406575  1.154904
2013-01-02  0.310146 -0.977560
2013-01-03 -1.454818 -0.212861
2013-01-04 -1.109016 -1.420387
2013-01-05 -0.183504 -0.079999
                   A         B
2013-01-01 -0.406575  1.154904
2013-01-02  0.310146 -0.977560
2013-01-03 -1.454818 -0.212861
# Position을 이용한 선택 .iloc()

# 셀 선택
print(df.iloc[0, 0])

# 행 선택
print(df.iloc[0])

# 행 슬라이싱
print(df.iloc[0:3])

# 열 선택
print(df.iloc[:, 0].head())

# 열 다중 선택
print(df.iloc[:, 0:2].head())

# 행, 열 다중 선택
print(df.iloc[0:3, 0:2])

print(df.iloc[[1, 3], [0, 1]])
-0.4065751958736635
A   -0.406575
B    1.154904
C    0.283025
D   -0.533548
Name: 2013-01-01 00:00:00, dtype: float64
                   A         B         C         D
2013-01-01 -0.406575  1.154904  0.283025 -0.533548
2013-01-02  0.310146 -0.977560  0.045798  1.600883
2013-01-03 -1.454818 -0.212861  1.080810 -0.198280
2013-01-01   -0.406575
2013-01-02    0.310146
2013-01-03   -1.454818
2013-01-04   -1.109016
2013-01-05   -0.183504
Freq: D, Name: A, dtype: float64
                   A         B
2013-01-01 -0.406575  1.154904
2013-01-02  0.310146 -0.977560
2013-01-03 -1.454818 -0.212861
2013-01-04 -1.109016 -1.420387
2013-01-05 -0.183504 -0.079999
                   A         B
2013-01-01 -0.406575  1.154904
2013-01-02  0.310146 -0.977560
2013-01-03 -1.454818 -0.212861
                   A         B
2013-01-02  0.310146 -0.977560
2013-01-04 -1.109016 -1.420387
# 조건을 이용한 선택

# 열 단위
print(df[df['A'] > 0][['B', 'C']])
print(df.loc[df['A'] > 0, ['B', 'C']])

# DataFrame 단위
print(df[df > 0])

# Boolean 리스트를 이용
df2 = df.copy()
df2['E'] = ['one', 'one', 'two', 'three', 'four', 'three']
index = df2['E'].isin(['two','four'])
print(index)
print(df2[df2['E'].isin(['two','four'])])
print(df2.loc[df2['E'].isin(['two','four'])])
                  B         C
2013-01-02 -0.97756  0.045798
                  B         C
2013-01-02 -0.97756  0.045798
                   A         B         C         D
2013-01-01       NaN  1.154904  0.283025       NaN
2013-01-02  0.310146       NaN  0.045798  1.600883
2013-01-03       NaN       NaN  1.080810       NaN
2013-01-04       NaN       NaN       NaN       NaN
2013-01-05       NaN       NaN       NaN       NaN
2013-01-06       NaN       NaN       NaN  0.090299
2013-01-01    False
2013-01-02    False
2013-01-03     True
2013-01-04    False
2013-01-05     True
2013-01-06    False
Freq: D, Name: E, dtype: bool
                   A         B         C         D     E
2013-01-03 -1.454818 -0.212861  1.080810 -0.198280   two
2013-01-05 -0.183504 -0.079999 -0.992413 -0.213163  four
                   A         B         C         D     E
2013-01-03 -1.454818 -0.212861  1.080810 -0.198280   two
2013-01-05 -0.183504 -0.079999 -0.992413 -0.213163  four

Setting#

# 값 설정
s1 = pd.Series([1,2,3,4,5,6], index=pd.date_range('20130102', periods=6))
df['F'] = s1
df['G'] = s1
print(df)

# 행 삭제: index 기준
df = df.drop([pd.to_datetime('2013-01-04')], axis=0) 
print(df)

# # 열 삭제: 열 이름기준
df = df.drop(['G'], axis=1)
print(df)


# Label에 의한 값 설정
df.at['2013-01-02', 'F'] = 0


# Position에 의한 값 설정
df.iat[2,-1] = 1

print(df)

# 배열을 이용한 값 설정
df.loc['2013-01-02':, 'F'] = np.array([5] * (len(df) - 1))
print(df)

# 조건을 통한 값 설정
df2 = df.copy()
df2[df2 > 0] = -df2
print(df2)
                   A         B         C         D    F    G
2013-01-01 -0.406575  1.154904  0.283025 -0.533548  NaN  NaN
2013-01-02  0.310146 -0.977560  0.045798  1.600883  1.0  1.0
2013-01-03 -1.454818 -0.212861  1.080810 -0.198280  2.0  2.0
2013-01-04 -1.109016 -1.420387 -0.340734 -0.438138  3.0  3.0
2013-01-05 -0.183504 -0.079999 -0.992413 -0.213163  4.0  4.0
2013-01-06 -0.133029 -0.770604 -2.005459  0.090299  5.0  5.0
                   A         B         C         D    F    G
2013-01-01 -0.406575  1.154904  0.283025 -0.533548  NaN  NaN
2013-01-02  0.310146 -0.977560  0.045798  1.600883  1.0  1.0
2013-01-03 -1.454818 -0.212861  1.080810 -0.198280  2.0  2.0
2013-01-05 -0.183504 -0.079999 -0.992413 -0.213163  4.0  4.0
2013-01-06 -0.133029 -0.770604 -2.005459  0.090299  5.0  5.0
                   A         B         C         D    F
2013-01-01 -0.406575  1.154904  0.283025 -0.533548  NaN
2013-01-02  0.310146 -0.977560  0.045798  1.600883  1.0
2013-01-03 -1.454818 -0.212861  1.080810 -0.198280  2.0
2013-01-05 -0.183504 -0.079999 -0.992413 -0.213163  4.0
2013-01-06 -0.133029 -0.770604 -2.005459  0.090299  5.0
                   A         B         C         D    F
2013-01-01 -0.406575  1.154904  0.283025 -0.533548  NaN
2013-01-02  0.310146 -0.977560  0.045798  1.600883  0.0
2013-01-03 -1.454818 -0.212861  1.080810 -0.198280  1.0
2013-01-05 -0.183504 -0.079999 -0.992413 -0.213163  4.0
2013-01-06 -0.133029 -0.770604 -2.005459  0.090299  5.0
                   A         B         C         D    F
2013-01-01 -0.406575  1.154904  0.283025 -0.533548  NaN
2013-01-02  0.310146 -0.977560  0.045798  1.600883  5.0
2013-01-03 -1.454818 -0.212861  1.080810 -0.198280  5.0
2013-01-05 -0.183504 -0.079999 -0.992413 -0.213163  5.0
2013-01-06 -0.133029 -0.770604 -2.005459  0.090299  5.0
                   A         B         C         D    F
2013-01-01 -0.406575 -1.154904 -0.283025 -0.533548  NaN
2013-01-02 -0.310146 -0.977560 -0.045798 -1.600883 -5.0
2013-01-03 -1.454818 -0.212861 -1.080810 -0.198280 -5.0
2013-01-05 -0.183504 -0.079999 -0.992413 -0.213163 -5.0
2013-01-06 -0.133029 -0.770604 -2.005459 -0.090299 -5.0

Missing Value#

index = pd.date_range('20130101', periods=6)
df1 = df.reindex(index=index[0:4], columns=list(df.columns) + ['E'])
df1.loc[index[0]:index[1],'E'] = 1
print(df1)

# 결측치 제거하기
print(df1.dropna(how='any'))

# 결측치 채워넣기
print(df1.fillna(value=5))

# 결측치 위치보기
print(df1.isna())
print(df1.isnull()) # isna() 와 같음

# 결측치는 제외한 위치보기
print(df1.notna())
print(df1.notnull())
                   A         B         C         D    F    E
2013-01-01  0.523658  0.548980 -0.968327  0.487522  NaN  1.0
2013-01-02  1.086258  0.904897 -0.446926 -0.504047  5.0  1.0
2013-01-03  0.236893 -0.094955 -0.280539 -0.162157  5.0  NaN
2013-01-04  0.438455  0.918198 -0.338696 -0.565120  5.0  NaN
                   A         B         C         D    F    E
2013-01-02  1.086258  0.904897 -0.446926 -0.504047  5.0  1.0
                   A         B         C         D    F    E
2013-01-01  0.523658  0.548980 -0.968327  0.487522  5.0  1.0
2013-01-02  1.086258  0.904897 -0.446926 -0.504047  5.0  1.0
2013-01-03  0.236893 -0.094955 -0.280539 -0.162157  5.0  5.0
2013-01-04  0.438455  0.918198 -0.338696 -0.565120  5.0  5.0
                A      B      C      D      F      E
2013-01-01  False  False  False  False   True  False
2013-01-02  False  False  False  False  False  False
2013-01-03  False  False  False  False  False   True
2013-01-04  False  False  False  False  False   True
                A      B      C      D      F      E
2013-01-01  False  False  False  False   True  False
2013-01-02  False  False  False  False  False  False
2013-01-03  False  False  False  False  False   True
2013-01-04  False  False  False  False  False   True
               A     B     C     D      F      E
2013-01-01  True  True  True  True  False   True
2013-01-02  True  True  True  True   True   True
2013-01-03  True  True  True  True   True  False
2013-01-04  True  True  True  True   True  False
               A     B     C     D      F      E
2013-01-01  True  True  True  True  False   True
2013-01-02  True  True  True  True   True   True
2013-01-03  True  True  True  True   True  False
2013-01-04  True  True  True  True   True  False

Operation#

# DataFrame 통계확인
print(df.describe())

# 열 별 통계
print(df.mean())

# 행 별 통계
print(df.mean(1))

# DataFrame 연산
s = pd.Series([1,3,5,np.nan,6,8], index=index).shift(2)
print(s)
print(df.sub(s, axis='index'))
              A         B         C         D    F
count  6.000000  6.000000  6.000000  6.000000  5.0
mean   0.332269  0.574651 -0.414428 -0.299727  5.0
std    0.507378  0.623868  0.296350  0.425299  0.0
min   -0.465602 -0.211383 -0.968327 -0.685344  5.0
25%    0.189688  0.066029 -0.425532 -0.549852  5.0
50%    0.337674  0.726939 -0.350023 -0.436632  5.0
75%    0.502357  0.914873 -0.295078 -0.213922  5.0
max    1.086258  1.382171 -0.090733  0.487522  5.0
A    0.332269
B    0.574651
C   -0.414428
D   -0.299727
F    5.000000
dtype: float64
2013-01-01    0.147958
2013-01-02    1.208036
2013-01-03    0.939848
2013-01-04    1.090567
2013-01-05    1.165111
2013-01-06    0.709387
Freq: D, dtype: float64
2013-01-01    NaN
2013-01-02    NaN
2013-01-03    1.0
2013-01-04    3.0
2013-01-05    5.0
2013-01-06    NaN
Freq: D, dtype: float64
                   A         B         C         D    F
2013-01-01       NaN       NaN       NaN       NaN  NaN
2013-01-02       NaN       NaN       NaN       NaN  NaN
2013-01-03 -0.763107 -1.094955 -1.280539 -1.162157  4.0
2013-01-04 -2.561545 -2.081802 -3.338696 -3.565120  2.0
2013-01-05 -4.826047 -3.617829 -5.361350 -5.369217  0.0
2013-01-06       NaN       NaN       NaN       NaN  NaN
# DataFrame Transpose(전치, 열과행치환)
print(df.T)
   2013-01-01  2013-01-02  2013-01-03  2013-01-04  2013-01-05  2013-01-06
A    0.523658    1.086258    0.236893    0.438455    0.173953   -0.465602
B    0.548980    0.904897   -0.094955    0.918198    1.382171   -0.211383
C   -0.968327   -0.446926   -0.280539   -0.338696   -0.361350   -0.090733
D    0.487522   -0.504047   -0.162157   -0.565120   -0.369217   -0.685344
F         NaN    5.000000    5.000000    5.000000    5.000000    5.000000
# 축별로 정렬
print(df.sort_index()) # 오름차순
print(df.sort_index(axis=1, ascending=False)) #내림차순

# 값별로 정렬
print(df.sort_values(by='B'))
print(df.sort_values(by=['B', 'C'])) # 복수 컬럼
                   A         B         C         D    F
2013-01-01  0.523658  0.548980 -0.968327  0.487522  NaN
2013-01-02  1.086258  0.904897 -0.446926 -0.504047  5.0
2013-01-03  0.236893 -0.094955 -0.280539 -0.162157  5.0
2013-01-04  0.438455  0.918198 -0.338696 -0.565120  5.0
2013-01-05  0.173953  1.382171 -0.361350 -0.369217  5.0
2013-01-06 -0.465602 -0.211383 -0.090733 -0.685344  5.0
              F         D         C         B         A
2013-01-01  NaN  0.487522 -0.968327  0.548980  0.523658
2013-01-02  5.0 -0.504047 -0.446926  0.904897  1.086258
2013-01-03  5.0 -0.162157 -0.280539 -0.094955  0.236893
2013-01-04  5.0 -0.565120 -0.338696  0.918198  0.438455
2013-01-05  5.0 -0.369217 -0.361350  1.382171  0.173953
2013-01-06  5.0 -0.685344 -0.090733 -0.211383 -0.465602
                   A         B         C         D    F
2013-01-06 -0.465602 -0.211383 -0.090733 -0.685344  5.0
2013-01-03  0.236893 -0.094955 -0.280539 -0.162157  5.0
2013-01-01  0.523658  0.548980 -0.968327  0.487522  NaN
2013-01-02  1.086258  0.904897 -0.446926 -0.504047  5.0
2013-01-04  0.438455  0.918198 -0.338696 -0.565120  5.0
2013-01-05  0.173953  1.382171 -0.361350 -0.369217  5.0
                   A         B         C         D    F
2013-01-06 -0.465602 -0.211383 -0.090733 -0.685344  5.0
2013-01-03  0.236893 -0.094955 -0.280539 -0.162157  5.0
2013-01-01  0.523658  0.548980 -0.968327  0.487522  NaN
2013-01-02  1.086258  0.904897 -0.446926 -0.504047  5.0
2013-01-04  0.438455  0.918198 -0.338696 -0.565120  5.0
2013-01-05  0.173953  1.382171 -0.361350 -0.369217  5.0
# apply: 데이터에 함수 적용

# 열별 적용
print(df.apply(np.cumsum))
print(df.apply(lambda x: x.max() - x.min()))

# 행별 적용
print(df.apply(np.cumsum, axis=1))
print(df.apply(lambda x: x.max() - x.min(), axis=1))
                   A         B         C         D     F
2013-01-01  0.523658  0.548980 -0.968327  0.487522   NaN
2013-01-02  1.609916  1.453877 -1.415253 -0.016525   5.0
2013-01-03  1.846809  1.358922 -1.695792 -0.178682  10.0
2013-01-04  2.285264  2.277120 -2.034487 -0.743803  15.0
2013-01-05  2.459216  3.659291 -2.395837 -1.113020  20.0
2013-01-06  1.993614  3.447907 -2.486570 -1.798364  25.0
A    1.551860
B    1.593554
C    0.877594
D    1.172866
F    0.000000
dtype: float64
                   A         B         C         D         F
2013-01-01  0.523658  1.072638  0.104311  0.591833       NaN
2013-01-02  1.086258  1.991155  1.544229  1.040182  6.040182
2013-01-03  0.236893  0.141937 -0.138601 -0.300759  4.699241
2013-01-04  0.438455  1.356653  1.017957  0.452837  5.452837
2013-01-05  0.173953  1.556124  1.194774  0.825556  5.825556
2013-01-06 -0.465602 -0.676985 -0.767718 -1.453063  3.546937
2013-01-01    1.517308
2013-01-02    5.504047
2013-01-03    5.280539
2013-01-04    5.565120
2013-01-05    5.369217
2013-01-06    5.685344
Freq: D, dtype: float64
# Histogramming: 데이터 값 세기
s = pd.Series(np.random.randint(0, 7, size=10))
print(s)
print(s.value_counts())
0    6
1    6
2    3
3    5
4    3
5    6
6    3
7    6
8    0
9    2
dtype: int64
6    4
3    3
0    1
2    1
5    1
dtype: int64
# 문자열 함수
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
print(s.str.lower())
0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object

Merge#

상세 참조자료: https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html

# Concat: 단순히 두 DataFrame을 연결만 하기 때문에 의도한 결과와 같은지 확인이 필요합니다.
df = pd.DataFrame(np.random.randn(10, 3))
a = df[:2]
b = df[3:5]
print(a)
print(b)

# 행 기반 연결
print(pd.concat([a, b]))
print(pd.concat([a, b], ignore_index=True))

# 열 기반 연결
b.columns = [3, 4, 5]
b.reset_index(inplace=True, drop=True)
print(pd.concat([a, b], axis=1))
          0         1         2
0  1.312877  0.573515 -0.834520
1 -0.695523 -0.110469  0.302099
          0         1         2
3 -0.264594 -1.256058 -0.873582
4  0.836012  1.234542  0.567087
          0         1         2
0  1.312877  0.573515 -0.834520
1 -0.695523 -0.110469  0.302099
3 -0.264594 -1.256058 -0.873582
4  0.836012  1.234542  0.567087
          0         1         2
0  1.312877  0.573515 -0.834520
1 -0.695523 -0.110469  0.302099
2 -0.264594 -1.256058 -0.873582
3  0.836012  1.234542  0.567087
          0         1         2         3         4         5
0  1.312877  0.573515 -0.834520 -0.264594 -1.256058 -0.873582
1 -0.695523 -0.110469  0.302099  0.836012  1.234542  0.567087
# Append: 행 기반 concat 과 같습니다.
df = pd.DataFrame(np.random.randn(3, 3), columns=['A', 'B', 'C'])
print(df)

s = df.iloc[1]
print(df.append(s, ignore_index=True))
          A         B         C
0  0.645724 -0.023637  0.529198
1 -1.660055 -0.120990 -0.693897
2 -0.368514  0.095203  0.262218
          A         B         C
0  0.645724 -0.023637  0.529198
1 -1.660055 -0.120990 -0.693897
2 -0.368514  0.095203  0.262218
3 -1.660055 -0.120990 -0.693897
# Join: SQL 에서 지원하는 방식과 유사
left = pd.DataFrame({'key': ['foo', 'bar'], 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo', 'bar'], 'rval': [4, 5]})
print(left)
print(right)
print(pd.merge(left, right, on= 'key'))
   key  lval
0  foo     1
1  bar     2
   key  rval
0  foo     4
1  bar     5
   key  lval  rval
0  foo     1     4
1  bar     2     5
# GroupBy: SQL 에서 지원되는 방식과 유사
df = pd.DataFrame({
  'A' : ['foo', 'bar', 'foo', 'bar'],
  'B' : ['one', 'one', 'two', 'three'],
  'C' : np.random.randn(4),
  'D' : np.random.randn(4)
})
print(df)

print(df.groupby('A').sum())
print(df.groupby(['A','B']).sum())
     A      B         C         D
0  foo    one -0.630498 -0.304689
1  bar    one  0.374077 -0.940406
2  foo    two -1.610434  0.304611
3  bar  three -0.281375 -0.808427
            C         D
A                      
bar  0.092702 -1.748833
foo -2.240932 -0.000078
                  C         D
A   B                        
bar one    0.374077 -0.940406
    three -0.281375 -0.808427
foo one   -0.630498 -0.304689
    two   -1.610434  0.304611

Reshaping#

# Stack
index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'),
                                   ('two', 'a'), ('two', 'b')])
s = pd.Series(np.arange(1.0, 5.0), index=index)
print(s)
df = s.unstack()
print(df)
print(df.stack())

# Pivot
df = pd.DataFrame({'A' : ['one', 'one', 'two', 'three'] * 3,
                   'B' : ['A', 'B', 'C'] * 4,
                   'C' : ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
                   'D' : np.random.randn(12),
                   'E' : np.random.randn(12)})
print(df)
print(pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C']))
one  a    1.0
     b    2.0
two  a    3.0
     b    4.0
dtype: float64
       a    b
one  1.0  2.0
two  3.0  4.0
one  a    1.0
     b    2.0
two  a    3.0
     b    4.0
dtype: float64
        A  B    C         D         E
0     one  A  foo -0.680790 -0.787647
1     one  B  foo  0.602496  0.539262
2     two  C  foo  0.370922  2.524820
3   three  A  bar  0.574524  0.245856
4     one  B  bar  0.829092  0.172843
5     one  C  bar  1.191064 -0.697159
6     two  A  foo  1.270573  0.385478
7   three  B  foo -1.950049  1.068935
8     one  C  foo -0.617827  0.609768
9     one  A  bar -0.684763  0.722808
10    two  B  bar  2.060348  2.418646
11  three  C  bar -1.293355  1.708552
C             bar       foo
A     B                    
one   A -0.684763 -0.680790
      B  0.829092  0.602496
      C  1.191064 -0.617827
three A  0.574524       NaN
      B       NaN -1.950049
      C -1.293355       NaN
two   A       NaN  1.270573
      B  2.060348       NaN
      C       NaN  0.370922

Time Series#

Time Series 데이터를 다룰 때 필요한 편리한 기능을 제공합니다.

상세참조자료: https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html

import datetime

# 다양한 방법으로 DatetimeIndex 생성
dti = pd.to_datetime(['1/1/2018', np.datetime64('2018-01-01'), datetime.datetime(2018, 1, 1)])
print(dti)

dti = pd.date_range('2018-01-01', periods=2, freq='H')
print(dti)

# DateTimeIndex 로 변환
print(pd.to_datetime(pd.Series(['Jul 31, 2009', '2010-01-10', None])))
print(pd.to_datetime(['2005/11/23', '2010.12.31']))
print(pd.to_datetime([1349720105, 1349806505], unit='s')) # Epoch Timestamp Sec
print(pd.to_datetime([1349720105100, 1349720105200], unit='ms')) # Epoch Timestamp Millisec

# DateTimeIndex 로 변환시 에러 처리
try:
    pd.to_datetime(['2009/07/31', 'asd'], errors='raise')
except Exception as e:
    print("ERROR: {}".format(e))
pd.to_datetime(['2009/07/31', 'asd'], errors='ignore')
  
# Timezone 설정
dti = dti.tz_localize('UTC')
print(dti)

# Timezone 변경
dti = dti.tz_convert('Asia/Seoul')
print(dti)
DatetimeIndex(['2018-01-01', '2018-01-01', '2018-01-01'], dtype='datetime64[ns]', freq=None)
DatetimeIndex(['2018-01-01 00:00:00', '2018-01-01 01:00:00'], dtype='datetime64[ns]', freq='H')
0   2009-07-31
1   2010-01-10
2          NaT
dtype: datetime64[ns]
DatetimeIndex(['2005-11-23', '2010-12-31'], dtype='datetime64[ns]', freq=None)
DatetimeIndex(['2012-10-08 18:15:05', '2012-10-09 18:15:05'], dtype='datetime64[ns]', freq=None)
DatetimeIndex(['2012-10-08 18:15:05.100000', '2012-10-08 18:15:05.200000'], dtype='datetime64[ns]', freq=None)
ERROR: Unknown string format: asd
DatetimeIndex(['2018-01-01 00:00:00+00:00', '2018-01-01 01:00:00+00:00'], dtype='datetime64[ns, UTC]', freq='H')
DatetimeIndex(['2018-01-01 09:00:00+09:00', '2018-01-01 10:00:00+09:00'], dtype='datetime64[ns, Asia/Seoul]', freq='H')
# 시간단위 샘플링
idx = pd.date_range('2018-01-01', periods=5, freq='D')
ts = pd.Series(range(len(idx)), index=idx)
print(ts)
print(ts.resample('8H').mean())
print(ts.resample('2D').mean())
2018-01-01    0
2018-01-02    1
2018-01-03    2
2018-01-04    3
2018-01-05    4
Freq: D, dtype: int64
2018-01-01 00:00:00    0.0
2018-01-01 08:00:00    NaN
2018-01-01 16:00:00    NaN
2018-01-02 00:00:00    1.0
2018-01-02 08:00:00    NaN
2018-01-02 16:00:00    NaN
2018-01-03 00:00:00    2.0
2018-01-03 08:00:00    NaN
2018-01-03 16:00:00    NaN
2018-01-04 00:00:00    3.0
2018-01-04 08:00:00    NaN
2018-01-04 16:00:00    NaN
2018-01-05 00:00:00    4.0
Freq: 8H, dtype: float64
2018-01-01    0.5
2018-01-03    2.5
2018-01-05    4.0
Freq: 2D, dtype: float64
## Version 0.24.1 이상
print(pd.__version__)
if pd.__version__ >= "0.24.1":
    # 시간 계산
    friday = pd.Timestamp('2018-01-05')
    print(friday.day_name())

    # Add 1 day
    saturday = friday + pd.Timedelta('1 day')
    print(saturday.day_name())

    # Add 1 business day (Friday --> Monday)
    monday = friday + pd.offsets.BDay()
    print(monday.day_name())
0.24.1
Friday
Saturday
Monday