1. The AirQualityUCI Dataset¶
In [155]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
In [156]:
air_df = pd.read_csv('/content/drive/MyDrive/KDT/6. 머신러닝과 딥러닝/Data/AirQualityUCI.csv')
In [157]:
air_df
Out[157]:
 | Date | Time | CO(GT) | PT08.S1(CO) | NMHC(GT) | C6H6(GT) | PT08.S2(NMHC) | NOx(GT) | PT08.S3(NOx) | NO2(GT) | PT08.S4(NO2) | PT08.S5(O3) | T | RH | AH | Unnamed: 15 | Unnamed: 16
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 10-03-2004 | 18:00:00 | 2.6 | 1360.0 | 150.0 | 11.9 | 1046.0 | 166.0 | 1056.0 | 113.0 | 1692.0 | 1268.0 | 13.6 | 48.9 | 0.7578 | NaN | NaN |
1 | 10-03-2004 | 19:00:00 | 2.0 | 1292.0 | 112.0 | 9.4 | 955.0 | 103.0 | 1174.0 | 92.0 | 1559.0 | 972.0 | 13.3 | 47.7 | 0.7255 | NaN | NaN |
2 | 10-03-2004 | 20:00:00 | 2.2 | 1402.0 | 88.0 | 9.0 | 939.0 | 131.0 | 1140.0 | 114.0 | 1555.0 | 1074.0 | 11.9 | 54.0 | 0.7502 | NaN | NaN |
3 | 10-03-2004 | 21:00:00 | 2.2 | 1376.0 | 80.0 | 9.2 | 948.0 | 172.0 | 1092.0 | 122.0 | 1584.0 | 1203.0 | 11.0 | 60.0 | 0.7867 | NaN | NaN |
4 | 10-03-2004 | 22:00:00 | 1.6 | 1272.0 | 51.0 | 6.5 | 836.0 | 131.0 | 1205.0 | 116.0 | 1490.0 | 1110.0 | 11.2 | 59.6 | 0.7888 | NaN | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
9466 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
9467 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
9468 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
9469 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
9470 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
9471 rows × 17 columns
In [158]:
air_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9471 entries, 0 to 9470
Data columns (total 17 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   Date           9357 non-null   object
 1   Time           9357 non-null   object
 2   CO(GT)         9357 non-null   float64
 3   PT08.S1(CO)    9357 non-null   float64
 4   NMHC(GT)       9357 non-null   float64
 5   C6H6(GT)       9357 non-null   float64
 6   PT08.S2(NMHC)  9357 non-null   float64
 7   NOx(GT)        9357 non-null   float64
 8   PT08.S3(NOx)   9357 non-null   float64
 9   NO2(GT)        9357 non-null   float64
 10  PT08.S4(NO2)   9357 non-null   float64
 11  PT08.S5(O3)    9357 non-null   float64
 12  T              9357 non-null   float64
 13  RH             9357 non-null   float64
 14  AH             9357 non-null   float64
 15  Unnamed: 15    0 non-null      float64
 16  Unnamed: 16    0 non-null      float64
dtypes: float64(15), object(2)
memory usage: 1.2+ MB
- Date: measurement date
- Time: measurement time
- CO(GT): carbon monoxide concentration (mg/m^3)
- PT08.S1(CO): sensor response to carbon monoxide
- NMHC(GT): non-methane hydrocarbon concentration (microg/m^3)
- C6H6(GT): benzene concentration (microg/m^3)
- PT08.S2(NMHC): sensor response to hydrocarbons
- NOx(GT): nitrogen oxides concentration (ppb)
- PT08.S3(NOx): sensor response to nitrogen oxides
- NO2(GT): nitrogen dioxide concentration (microg/m^3)
- PT08.S4(NO2): sensor response to nitrogen dioxide
- PT08.S5(O3): sensor response to ozone
- T: temperature (°C)
- RH: relative humidity (%)
- AH: absolute humidity (g/m^3)
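Note: in the original UCI release of this dataset, missing sensor readings are tagged with the sentinel value -200 rather than NaN, so it is worth checking for that marker before modeling. A minimal sanity check (a sketch):

In [ ]:
# UCI documentation: missing values are tagged with -200.
# Count how many such sentinel readings each column contains.
(air_df == -200).sum()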
In [159]:
air_df.drop(['Unnamed: 15', 'Unnamed: 16'], axis=1, inplace=True)
In [160]:
air_df.dropna(inplace=True)
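info() above showed 9,357 non-null entries in 9,471 rows, so dropna() removes exactly the 9471 - 9357 = 114 all-NaN trailing rows:

In [ ]:
# Expect (9357, 15): 114 empty trailing rows dropped, 15 columns kept
air_df.shape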
In [161]:
air_df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 9357 entries, 0 to 9356
Data columns (total 15 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   Date           9357 non-null   object
 1   Time           9357 non-null   object
 2   CO(GT)         9357 non-null   float64
 3   PT08.S1(CO)    9357 non-null   float64
 4   NMHC(GT)       9357 non-null   float64
 5   C6H6(GT)       9357 non-null   float64
 6   PT08.S2(NMHC)  9357 non-null   float64
 7   NOx(GT)        9357 non-null   float64
 8   PT08.S3(NOx)   9357 non-null   float64
 9   NO2(GT)        9357 non-null   float64
 10  PT08.S4(NO2)   9357 non-null   float64
 11  PT08.S5(O3)    9357 non-null   float64
 12  T              9357 non-null   float64
 13  RH             9357 non-null   float64
 14  AH             9357 non-null   float64
dtypes: float64(13), object(2)
memory usage: 1.1+ MB
In [162]:
# Convert the Date column to datetime
air_df['Date'] = pd.to_datetime(air_df.Date, format='%d-%m-%Y')
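Equivalently, dayfirst=True parses the same DD-MM-YYYY strings without an explicit format string (a minor alternative):

In [ ]:
# Equivalent parse: treat the leading field as the day
air_df['Date'] = pd.to_datetime(air_df['Date'], dayfirst=True)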
In [163]:
# Create a Month feature derived from the Date column
air_df['Month'] = air_df['Date'].dt.month
air_df.head()
Out[163]:
 | Date | Time | CO(GT) | PT08.S1(CO) | NMHC(GT) | C6H6(GT) | PT08.S2(NMHC) | NOx(GT) | PT08.S3(NOx) | NO2(GT) | PT08.S4(NO2) | PT08.S5(O3) | T | RH | AH | Month
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 2004-03-10 | 18:00:00 | 2.6 | 1360.0 | 150.0 | 11.9 | 1046.0 | 166.0 | 1056.0 | 113.0 | 1692.0 | 1268.0 | 13.6 | 48.9 | 0.7578 | 3 |
1 | 2004-03-10 | 19:00:00 | 2.0 | 1292.0 | 112.0 | 9.4 | 955.0 | 103.0 | 1174.0 | 92.0 | 1559.0 | 972.0 | 13.3 | 47.7 | 0.7255 | 3 |
2 | 2004-03-10 | 20:00:00 | 2.2 | 1402.0 | 88.0 | 9.0 | 939.0 | 131.0 | 1140.0 | 114.0 | 1555.0 | 1074.0 | 11.9 | 54.0 | 0.7502 | 3 |
3 | 2004-03-10 | 21:00:00 | 2.2 | 1376.0 | 80.0 | 9.2 | 948.0 | 172.0 | 1092.0 | 122.0 | 1584.0 | 1203.0 | 11.0 | 60.0 | 0.7867 | 3 |
4 | 2004-03-10 | 22:00:00 | 1.6 | 1272.0 | 51.0 | 6.5 | 836.0 | 131.0 | 1205.0 | 116.0 | 1490.0 | 1110.0 | 11.2 | 59.6 | 0.7888 | 3 |
In [164]:
# Create an Hour feature derived from the Time column
air_df['Hour'] = air_df['Time'].str.split(':').str[0].fillna(0).astype(int)
air_df.head()
Out[164]:
 | Date | Time | CO(GT) | PT08.S1(CO) | NMHC(GT) | C6H6(GT) | PT08.S2(NMHC) | NOx(GT) | PT08.S3(NOx) | NO2(GT) | PT08.S4(NO2) | PT08.S5(O3) | T | RH | AH | Month | Hour
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 2004-03-10 | 18:00:00 | 2.6 | 1360.0 | 150.0 | 11.9 | 1046.0 | 166.0 | 1056.0 | 113.0 | 1692.0 | 1268.0 | 13.6 | 48.9 | 0.7578 | 3 | 18 |
1 | 2004-03-10 | 19:00:00 | 2.0 | 1292.0 | 112.0 | 9.4 | 955.0 | 103.0 | 1174.0 | 92.0 | 1559.0 | 972.0 | 13.3 | 47.7 | 0.7255 | 3 | 19 |
2 | 2004-03-10 | 20:00:00 | 2.2 | 1402.0 | 88.0 | 9.0 | 939.0 | 131.0 | 1140.0 | 114.0 | 1555.0 | 1074.0 | 11.9 | 54.0 | 0.7502 | 3 | 20 |
3 | 2004-03-10 | 21:00:00 | 2.2 | 1376.0 | 80.0 | 9.2 | 948.0 | 172.0 | 1092.0 | 122.0 | 1584.0 | 1203.0 | 11.0 | 60.0 | 0.7867 | 3 | 21 |
4 | 2004-03-10 | 22:00:00 | 1.6 | 1272.0 | 51.0 | 6.5 | 836.0 | 131.0 | 1205.0 | 116.0 | 1490.0 | 1110.0 | 11.2 | 59.6 | 0.7888 | 3 | 22 |
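An equivalent way to derive Hour is to parse Time as a datetime and take its hour component; a sketch, assuming Time keeps the HH:MM:SS format shown above:

In [ ]:
# Parse Time (e.g. '18:00:00') and extract the hour as an integer
air_df['Hour'] = pd.to_datetime(air_df['Time'], format='%H:%M:%S').dt.hour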
In [165]:
air_df.drop(['Date', 'Time'], axis=1, inplace=True)
In [166]:
plt.figure(figsize=(12, 12))
sns.heatmap(air_df.corr(), cmap='coolwarm', vmin=-1, vmax=1, annot=True)
Out[166]:
<Axes: >
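With 16 features the heatmap is dense; to focus on the target, the correlations with RH alone can be ranked (a sketch):

In [ ]:
# Correlation of each feature with RH, strongest absolute value first
air_df.corr()['RH'].drop('RH').sort_values(key=abs, ascending=False)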
In [167]:
# Standardize every column except the target (RH) with StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
In [168]:
ss = StandardScaler()
In [169]:
X = air_df.drop('RH', axis=1)
y = air_df['RH']
In [170]:
Xss = ss.fit_transform(X)
In [171]:
Xss
Out[171]:
array([[ 0.4739999 ,  0.94298276,  2.21123554, ...,  0.19488093, -0.96287933,  0.93913327],
       [ 0.46627322,  0.73680662,  1.93938293, ...,  0.19405218, -0.96287933,  1.08358325],
       [ 0.46884878,  1.07032685,  1.76768654, ...,  0.19468593, -0.96287933,  1.22803323],
       ...,
       [ 0.47142434,  0.28200632, -0.29267014, ...,  0.19187384, -0.6720105 ,  0.0724334 ],
       [ 0.467561  , -0.13944196, -0.29267014, ...,  0.188623  , -0.6720105 ,  0.21688338],
       [ 0.46884878,  0.06673418, -0.29267014, ...,  0.1883382 , -0.6720105 ,  0.36133336]])
In [172]:
X_train, X_test, y_train, y_test = train_test_split(Xss, y, test_size=0.2, random_state=2024)
In [173]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape
Out[173]:
((7485, 14), (1872, 14), (7485,), (1872,))
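One caveat: fitting the scaler on all of X before splitting leaks test-set statistics into the training data. It makes little practical difference here, but the stricter pattern is to split first and fit the scaler on the training portion only; a sketch, shown for reference (the rest of this notebook keeps the split above):

In [ ]:
# Split first, then fit the scaler on the training rows only
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=2024)
ss = StandardScaler()
X_tr_scaled = ss.fit_transform(X_tr)  # fit + transform on train
X_te_scaled = ss.transform(X_te)      # transform-only on test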
2. Comparing Model Performance¶
In [174]:
# Compare models by MSE:
# Linear Regression
# Decision Tree Regression
# Random Forest Regression
# Support Vector (Machine) Regression
# LightGBM Regression
# Which model fits this data best?
In [175]:
my_predictions = {}
colors = ['r', 'c', 'm', 'y', 'k', 'khaki', 'teal', 'orchid', 'sandybrown',
'greenyellow', 'dodgerblue', 'deepskyblue', 'rosybrown', 'firebrick',
'deeppink', 'crimson', 'salmon', 'darkred', 'olivedrab', 'olive',
'forestgreen', 'royalblue', 'indigo', 'navy', 'mediumpurple', 'chocolate',
'gold', 'darkorange', 'seagreen', 'turquoise', 'steelblue', 'slategray',
'peru', 'midnightblue', 'slateblue', 'dimgray', 'cadetblue', 'tomato']
In [176]:
def plot_predictions(name_, pred, actual):
    # Plot predicted vs. actual values, sorted by the actual value
    df = pd.DataFrame({'prediction': pred, 'actual': actual})
    df = df.sort_values(by='actual').reset_index(drop=True)
    plt.figure(figsize=(12, 9))
    plt.scatter(df.index, df['prediction'], marker='x', color='r')
    plt.scatter(df.index, df['actual'], alpha=0.7, marker='o', color='black')
    plt.title(name_, fontsize=15)
    plt.legend(['prediction', 'actual'], fontsize=12)
    plt.show()
In [177]:
def mse_eval(name_, pred, actual):
    global my_predictions
    global colors
    plot_predictions(name_, pred, actual)

    # Record this model's MSE and rank all models seen so far (worst first)
    mse = mean_squared_error(actual, pred)
    my_predictions[name_] = mse
    y_value = sorted(my_predictions.items(), key=lambda x: x[1], reverse=True)
    df = pd.DataFrame(y_value, columns=['model', 'mse'])
    print(df)

    # Horizontal bar chart of the MSE leaderboard, one randomly colored bar per model
    min_ = df['mse'].min() - 10
    max_ = df['mse'].max() + 10
    length = len(df)
    plt.figure(figsize=(10, length))
    ax = plt.subplot()
    ax.set_yticks(np.arange(len(df)))
    ax.set_yticklabels(df['model'], fontsize=15)
    bars = ax.barh(np.arange(len(df)), df['mse'])
    for i, v in enumerate(df['mse']):
        idx = np.random.choice(len(colors))
        bars[i].set_color(colors[idx])
        ax.text(v + 2, i, str(round(v, 3)), color='k', fontsize=15, fontweight='bold')
    plt.title('MSE Error', fontsize=18)
    plt.xlim(min_, max_)
    plt.show()
2-1. Linear Regression¶
In [178]:
from sklearn.linear_model import LinearRegression
In [179]:
model = LinearRegression()
In [180]:
model.fit(X_train, y_train)
Out[180]:
LinearRegression()
In [181]:
pred1 = model.predict(X_test)
pred1
Out[181]:
array([23.8180522 , 57.47717849, 26.65575731, ..., 36.58528636, 57.15094573, 45.02385751])
In [182]:
rs1 = np.sqrt(mean_squared_error(y_test, pred1))
rs1
Out[182]:
7.140495660974789
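RMSE is expressed in the units of RH (percentage points). For a scale-free complement, the coefficient of determination can be computed as well (a sketch):

In [ ]:
from sklearn.metrics import r2_score

# R^2 of the linear model on the test set (1.0 = perfect fit)
r2_score(y_test, pred1)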
In [183]:
mse_eval('LinearRegression', pred1, y_test)
              model        mse
0  LinearRegression  50.986678
2-2. Decision Tree Regression¶
In [184]:
from sklearn.tree import DecisionTreeRegressor
In [185]:
model2 = DecisionTreeRegressor()
In [186]:
model2.fit(X_train, y_train)
Out[186]:
DecisionTreeRegressor()
In [187]:
pred2 = model2.predict(X_test)
pred2
Out[187]:
array([26.3, 52.4, 29.3, ..., 21.3, 60.6, 41.6])
In [188]:
rs2 = np.sqrt(mean_squared_error(y_test, pred2))
rs2
Out[188]:
1.2203141671990319
In [189]:
mse_eval('DecisionTreeRegressor', pred2, y_test)
                   model        mse
0       LinearRegression  50.986678
1  DecisionTreeRegressor   1.489167
2-3. Random Forest Regression¶
In [190]:
from sklearn.ensemble import RandomForestRegressor
In [191]:
model3 = RandomForestRegressor()
In [192]:
model3.fit(X_train, y_train)
Out[192]:
RandomForestRegressor()
In [193]:
pred3 = model3.predict(X_test)
pred3
Out[193]:
array([24.852, 54.165, 28.643, ..., 23.135, 60.506, 41.155])
In [194]:
rs3 = np.sqrt(mean_squared_error(y_test, pred3))
rs3
Out[194]:
0.6169252208317394
In [195]:
mse_eval('RandomForestRegressor', pred3, y_test)
                   model        mse
0       LinearRegression  50.986678
1  DecisionTreeRegressor   1.489167
2  RandomForestRegressor   0.380597
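To see which inputs the forest relies on, its impurity-based feature importances can be aligned with the original column names (a sketch, using the fitted model3):

In [ ]:
# Impurity-based importances, labeled with the columns of X
pd.Series(model3.feature_importances_, index=X.columns).sort_values(ascending=False)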
2-4. Support Vector Machine¶
In [196]:
from sklearn.svm import SVR
In [197]:
model4 = SVR()
In [198]:
model4.fit(X_train, y_train)
Out[198]:
SVR()
In [199]:
pred4 = model4.predict(X_test)
pred4
Out[199]:
array([23.7494879 , 54.74866106, 44.12666556, ..., 35.0783411 , 51.80026591, 52.39127057])
In [200]:
rs4 = np.sqrt(mean_squared_error(y_test, pred4))
rs4
Out[200]:
19.947041499162168
In [201]:
mse_eval('SVR', pred4, y_test)
                   model         mse
0                    SVR  397.884465
1       LinearRegression   50.986678
2  DecisionTreeRegressor    1.489167
3  RandomForestRegressor    0.380597
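SVR with its default RBF kernel and C=1 underfits badly here. Loosening the regularization often helps; the values below are illustrative, not tuned:

In [ ]:
# Illustrative only: a larger C lets the RBF-kernel SVR fit a more flexible function
model4b = SVR(C=100, epsilon=0.5)
model4b.fit(X_train, y_train)
np.sqrt(mean_squared_error(y_test, model4b.predict(X_test)))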
2-5. LightGBM¶
In [202]:
from lightgbm import LGBMRegressor
In [203]:
model5 = LGBMRegressor(random_state=2024)
In [204]:
model5.fit(X_train, y_train)
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001957 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2915
[LightGBM] [Info] Number of data points in the train set: 7485, number of used features: 14
[LightGBM] [Info] Start training from score 39.869138
Out[204]:
LGBMRegressor(random_state=2024)
In [205]:
pred5 = model5.predict(X_test)
pred5
Out[205]:
array([24.6114059 , 53.97319694, 27.61933669, ..., 21.85626631, 59.32846223, 42.34650828])
In [206]:
rs5 = np.sqrt(mean_squared_error(y_test, pred5))
rs5
Out[206]:
0.740171951644305
In [207]:
mse_eval('LGBMRegressor', pred5, y_test)
                   model         mse
0                    SVR  397.884465
1       LinearRegression   50.986678
2  DecisionTreeRegressor    1.489167
3          LGBMRegressor    0.547855
4  RandomForestRegressor    0.380597
In [208]:
# Collect the test-set RMSE of each model and report the best one
dic = {'Linear Regression': rs1,
       'Decision Tree Regressor': rs2,
       'Random Forest Regressor': rs3,
       'Support Vector Machine': rs4,
       'LightGBM': rs5}
res = [key for key in dic if all(dic[temp] >= dic[key] for temp in dic)]
print(res)
best = {k: dic[k] for k in dic.keys() & set(res)}  # 'best' avoids shadowing the built-in min
print(best)
['Random Forest Regressor'] {'Random Forest Regressor': 0.6169252208317394}
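The same best-model lookup can be written more compactly with Python's built-in min and a key function (a sketch):

In [ ]:
# Single best model by RMSE (ties collapse to one winner)
best_name = min(dic, key=dic.get)
print(best_name, dic[best_name])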
In [ ]:
# Re-run the comparison on the unscaled features; tree ensembles are scale-invariant,
# and GradientBoostingRegressor needs an explicit import.
from sklearn.ensemble import GradientBoostingRegressor

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2024)
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest": RandomForestRegressor(),
    "Gradient Boosting": GradientBoostingRegressor()
}

# Train and evaluate the models
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    mse = mean_squared_error(y_test, predictions)
    results[name] = mse

results
In [ ]: