본문 바로가기

Python

파이썬_데이터 전처리 (Encoding) Data Preprocessing, GET DUMMIES, ONE HOT ENCODING, LABEL ENCODING

 

 

 

 

In [1]:
###LABEL ENCODING###

from sklearn.preprocessing import LabelEncoder

# Six appliance names; the last one is repeated so that two rows share a code.
items = ['냉장고', '전자레인지', '컴퓨터', '선풍기', '믹서', '믹서']

# Learn the label -> integer mapping and apply it to the same list in one call.
# `encoder` stays fitted afterwards so the mapping can be inspected later.
encoder = LabelEncoder()
labels = encoder.fit_transform(items)
print(labels)
 
[0 3 4 2 1 1]
In [2]:
# classes_ lists the distinct original labels; a label's position in this
# array is the integer code that transform() produced for it above.
print(encoder.classes_)
 
['냉장고' '믹서' '선풍기' '전자레인지' '컴퓨터']
In [8]:
###ONE-HOT ENCODING###

from sklearn.preprocessing import OneHotEncoder
import numpy as np

items = ['냉장고', '전자레인지', '컴퓨터', '선풍기', '믹서', '믹서']

# Step 1: turn each string into an integer code.
# NOTE: LabelEncoder is not imported in this cell; it is expected to be in
# scope already from the label-encoding cell.
encoder = LabelEncoder()
labels = encoder.fit_transform(items)

# Step 2: reshape the flat code vector into a single-column 2-D array
# (one row per sample) before handing it to OneHotEncoder.
labels = labels.reshape(-1, 1)

# Step 3: learn the set of codes and expand each one into an indicator row.
oh_encoder = OneHotEncoder()
oh_labels = oh_encoder.fit_transform(labels)

# Convert the encoder's result to a dense array so the notebook displays it.
oh_labels.toarray()
Out[8]:
array([[1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.]])
In [10]:
###GET DUMMIES###

import pandas as pd

# One-column frame ('item') holding the same six appliance names used in the
# encoder examples; '믹서' appears twice.
df = pd.DataFrame(
    data=['냉장고', '전자레인지', '컴퓨터', '선풍기', '믹서', '믹서'],
    columns=['item'],
)
# head() previews only the first five rows, so the sixth row is not displayed.
df.head()
Out[10]:
  item
0 냉장고
1 전자레인지
2 컴퓨터
3 선풍기
4 믹서
In [11]:
# One indicator column per distinct value of 'item', named item_<value>.
# dtype=int pins the indicators to 0/1: since pandas 2.0 the default dtype is
# bool, which would display True/False instead of the 0/1 table shown below.
pd.get_dummies(df, dtype=int)
Out[11]:
  item_냉장고 item_믹서 item_선풍기 item_전자레인지 item_컴퓨터
0 1 0 0 0 0
1 0 0 0 1 0
2 0 0 0 0 1
3 0 0 1 0 0
4 0 1 0 0 0
5 0 1 0 0 0