!wget "http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data"

--2022-10-22 23:33:09--  http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data
Resolving archive.ics.uci.edu... 128.195.10.252
Connecting to archive.ics.uci.edu|128.195.10.252|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 30286 (30K) [application/x-httpd-php]
Saving to: 'auto-mpg.data.1'

100%[======================================>] 30,286      --.-K/s   in 0.1s    

2022-10-22 23:33:10 (205 KB/s) - 'auto-mpg.data.1' saved [30286/30286]


# Peek at the first five raw lines of the downloaded file.
!head -n 5 "auto-mpg.data"

18.0   8   307.0      130.0      3504.      12.0   70  1	"chevrolet chevelle malibu"
15.0   8   350.0      165.0      3693.      11.5   70  1	"buick skylark 320"
18.0   8   318.0      150.0      3436.      11.0   70  1	"plymouth satellite"
16.0   8   304.0      150.0      3433.      12.0   70  1	"amc rebel sst"
17.0   8   302.0      140.0      3449.      10.5   70  1	"ford torino"


from itertools import islice

# Read at most the first five lines. islice simply stops at end-of-file,
# whereas calling next() five times raises StopIteration on a shorter file.
with open("auto-mpg.data") as file_data:
    head = list(islice(file_data, 5))
file_contents = "".join(head)
print(file_contents)

18.0   8   307.0      130.0      3504.      12.0   70  1	"chevrolet chevelle malibu"
15.0   8   350.0      165.0      3693.      11.5   70  1	"buick skylark 320"
18.0   8   318.0      150.0      3436.      11.0   70  1	"plymouth satellite"
16.0   8   304.0      150.0      3433.      12.0   70  1	"amc rebel sst"
17.0   8   302.0      140.0      3449.      10.5   70  1	"ford torino"


import pandas as pd
# Names for the eight numeric fields of auto-mpg.data, in file order.
column_names = [
    'MPG', 'Cylinders', 'Displacement', 'Horsepower',
    'Weight', 'Acceleration', 'Model Year', 'Origin',
]
# "?" marks a missing value; everything after a tab (the quoted car name) is
# treated as a comment and dropped; leading spaces after a separator are skipped.
df_raw = pd.read_csv(
    "auto-mpg.data",
    names=column_names,
    na_values="?",
    comment='\t',
    sep=" ",
    skipinitialspace=True,
)
df_raw.head()


df_raw.shape  # check the size (rows, columns)

(398, 8)


df_raw.info() # take one overall look; df_raw.describe() is also an option, but skip it until it is needed


RangeIndex: 398 entries, 0 to 397
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   MPG           398 non-null    float64
 1   Cylinders     398 non-null    int64  
 2   Displacement  398 non-null    float64
 3   Horsepower    392 non-null    float64
 4   Weight        398 non-null    float64
 5   Acceleration  398 non-null    float64
 6   Model Year    398 non-null    int64  
 7   Origin        398 non-null    int64  
dtypes: float64(5), int64(3)
memory usage: 25.0 KB


# Count the missing values in each column.
df_raw.isna().sum()

MPG             0
Cylinders       0
Displacement    0
Horsepower      6
Weight          0
Acceleration    0
Model Year      0
Origin          0
dtype: int64


# Work on an independent frame with the incomplete rows removed; df_raw is
# left untouched. The trailing expression re-counts missing values per column.
df_data = df_raw.dropna().copy()
df_data.isna().sum()

MPG             0
Cylinders       0
Displacement    0
Horsepower      0
Weight          0
Acceleration    0
Model Year      0
Origin          0
dtype: int64


# One-hot encode the integer Origin codes (1, 2, 3 -> USA, Europe, JAPAN).
# dtype is pinned to uint8 because pandas >= 2.0 produces bool columns by
# default; joined with the float features, bool columns make the frame convert
# to an object array, which Keras cannot turn into a tensor.
df_origin_ohe = pd.get_dummies(df_data['Origin'], dtype='uint8')
df_origin_ohe = df_origin_ohe.rename(columns={1: "USA", 2: "Europe", 3: "JAPAN"})
df_origin_ohe.head()


# Attach the one-hot columns (joined on the row index) and retire the
# integer-coded Origin column they replace.
df_data = df_data.join(df_origin_ohe).drop(columns='Origin')
df_data.head()


y_data = df_data.pop('MPG')  # first, pull the label column out of the feature frame.


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

cols_except = ['USA', 'Europe', 'JAPAN']  # one-hot columns: leave these unscaled
cols = list(df_data.columns)
cols_to_norm = [col for col in cols if col not in cols_except]

# Split BEFORE scaling. Fitting the scaler on the full dataset would let the
# test rows influence the mean/std used for training (data leakage).
x_train, x_test, y_train, y_test = train_test_split(df_data, y_data, test_size=0.2, random_state=777)

# Independent copies so the column assignments below never write through to
# df_data (which stays unscaled).
x_train = x_train.copy()
x_test = x_test.copy()

# Learn mean/std from the training rows only, then apply the same transform
# to both splits.
scaler = StandardScaler().fit(x_train[cols_to_norm])
x_train[cols_to_norm] = scaler.transform(x_train[cols_to_norm])
x_test[cols_to_norm] = scaler.transform(x_test[cols_to_norm])
x_train


y_train # did the labels come through correctly too?

198    33.0
167    29.0
258    20.6
169    20.0
149    24.0
       ... 
72     15.0
88     14.0
60     20.0
305    28.4
104    12.0
Name: MPG, Length: 313, dtype: float64


import keras
from keras import layers

def build_model(n_features=None, learning_rate=0.001):
    """Build and compile a small fully connected regression network.

    Two 64-unit ReLU hidden layers feed a single linear output unit. The model
    is compiled with MSE loss, an RMSprop optimizer, and MAE/MSE metrics.

    Args:
        n_features: Number of input columns. When omitted, the column count of
            the module-level ``x_train`` is used, so existing ``build_model()``
            calls behave as before.
        learning_rate: Value passed to the RMSprop optimizer.

    Returns:
        The compiled ``keras.Sequential`` model.
    """
    if n_features is None:
        # Backward-compatible default: size the input from the training frame.
        n_features = len(x_train.keys())

    model = keras.Sequential([
        layers.Dense(64, activation='relu', input_shape=[n_features]),
        layers.Dense(64, activation='relu'),
        layers.Dense(1),
    ])

    optimizer = keras.optimizers.RMSprop(learning_rate)
    model.compile(loss='mse', optimizer=optimizer, metrics=['mae', 'mse'])

    return model
    
model = build_model()
# The leading dimension handed to build() is the batch size, so it stays None
# (any batch size) instead of the number of training rows.
input_shape = (None, x_train.shape[1])
model.build(input_shape)


# Print the layer-by-layer summary (output shapes and parameter counts).
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
dense_3 (Dense)              (None, 64)                640       
_________________________________________________________________
dense_4 (Dense)              (None, 64)                4160      
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 65        
=================================================================
Total params: 4,865
Trainable params: 4,865
Non-trainable params: 0
_________________________________________________________________


# Smoke test: run the first ten training rows through the model and show the
# raw predictions.
example_batch = x_train.head(10)
example_result = model.predict(example_batch)
example_result

array([[-0.44900858],
       [-0.36076534],
       [ 0.07221393],
       [-0.02241866],
       [-0.22445838],
       [-0.5100449 ],
       [-0.42133415],
       [-0.44038382],
       [-0.14608991],
       [-0.01318132]], dtype=float32)


# Print one dot (.) at the end of every epoch to show training progress.
class PrintDot(keras.callbacks.Callback):
    """Minimal progress indicator: one dot per epoch, a new row every 100 epochs."""

    def on_epoch_end(self, epoch, logs=None):
        """Emit a dot for this epoch; start a new line when epoch is a multiple of 100.

        Args:
            epoch: Zero-based epoch index supplied by Keras.
            logs: Metric dict supplied by Keras; unused here. Defaults to None
                so the override matches the base-class hook signature and can
                also be called without it.
        """
        if epoch % 100 == 0:
            print('')
        print('.', end='')

EPOCHS = 1000

# Train silently (verbose=0); PrintDot supplies the progress output, and 20%
# of the training data is held out for validation.
history = model.fit(
    x_train,
    y_train,
    epochs=EPOCHS,
    validation_split=0.2,
    verbose=0,
    callbacks=[PrintDot()],
)

....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................


# Per-epoch metrics as a table, with the epoch number appended as a column.
hist = pd.DataFrame(history.history).assign(epoch=history.epoch)
hist.tail()


import matplotlib.pyplot as plt

def plot_history(history):
    """Plot training vs. validation MAE and MSE against the epoch number.

    Args:
        history: Object returned by ``model.fit``; its ``history`` dict must
            contain 'mae', 'val_mae', 'mse' and 'val_mse', and ``epoch`` holds
            the epoch indices.
    """
    hist = pd.DataFrame(history.history)
    hist['epoch'] = history.epoch

    # (metric key, y-axis label, upper y-limit) for each stacked panel.
    panels = (
        ('mae', 'Mean Abs Error [MPG]', 5),
        ('mse', 'Mean Square Error [$MPG^2$]', 20),
    )

    plt.figure(figsize=(8, 12))
    for position, (metric, y_label, y_max) in enumerate(panels, start=1):
        plt.subplot(2, 1, position)
        plt.xlabel('Epoch')
        plt.ylabel(y_label)
        plt.plot(hist['epoch'], hist[metric], label='Train Error')
        plt.plot(hist['epoch'], hist['val_' + metric], label='Val Error')
        plt.ylim([0, y_max])
        plt.legend()
    plt.show()


plot_history(history)


# The patience parameter is the number of epochs to wait for an improvement
# in the monitored value before stopping.
model = build_model()

early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)
training_callbacks = [early_stop, PrintDot()]

history = model.fit(
    x_train,
    y_train,
    epochs=EPOCHS,
    validation_split=0.2,
    verbose=0,
    callbacks=training_callbacks,
)

plot_history(history)

..................................................................


# Score the model on the held-out test split; evaluate() returns the loss
# followed by the compiled metrics (mae, mse).
loss, mae, mse = model.evaluate(x_test, y_test, verbose=2)

# Report the test-set mean absolute error (message text is Korean).
report_template = "테스트 세트의 평균 절대 오차: {:5.2f} MPG"
print(report_template.format(mae))

3/3 - 0s - loss: 6.1177 - mae: 1.8922 - mse: 6.1177
테스트 세트의 평균 절대 오차:  1.89 MPG


# Predict on the test split and flatten the result to 1-D so it can be
# plotted against y_test.
test_predictions = model.predict(x_test).flatten()

# Scatter of true vs. predicted MPG.
plt.scatter(y_test, test_predictions)
plt.xlabel('True Values [MPG]')
plt.ylabel('Predictions [MPG]')
# Request equal scaling / a square plot area so the diagonal below reads as y = x.
plt.axis('equal')
plt.axis('square')
# Start both axes at 0 while keeping whatever upper limit matplotlib chose.
plt.xlim([0,plt.xlim()[1]])
plt.ylim([0,plt.ylim()[1]])
# Diagonal reference line (prediction == true value); the result is bound to _,
# presumably to keep the notebook from echoing the returned line objects.
_ = plt.plot([-100, 100], [-100, 100])

	Cylinders	Displacement	Horsepower	Weight	Acceleration	Model Year	USA	Europe	JAPAN
0	1.483947	1.077290	0.664133	0.620540	-1.285258	-1.625315	1	0	0
1	1.483947	1.488732	1.574594	0.843334	-1.466724	-1.625315	1	0	0
2	1.483947	1.182542	1.184397	0.540382	-1.648189	-1.625315	1	0	0
3	1.483947	1.048584	1.184397	0.536845	-1.285258	-1.625315	1	0	0
4	1.483947	1.029447	0.924265	0.555706	-1.829655	-1.625315	1	0	0
...	...	...	...	...	...	...	...	...	...
393	-0.864014	-0.520637	-0.480448	-0.221125	0.021294	1.636410	1	0	0
394	-0.864014	-0.932079	-1.364896	-0.999134	3.287676	1.636410	0	1	0
395	-0.864014	-0.568479	-0.532474	-0.804632	-1.430430	1.636410	1	0	0
396	-0.864014	-0.712005	-0.662540	-0.415627	1.110088	1.636410	1	0	0
397	-0.864014	-0.721574	-0.584501	-0.303641	1.400433	1.636410	1	0	0

	Cylinders	Displacement	Horsepower	Weight	Acceleration	Model Year	USA	Europe	JAPAN
198	-0.864014	-0.989490	-1.338883	-1.394033	0.674571	0.005547	0	0	1
167	-0.864014	-0.932079	-0.766593	-0.950804	0.166467	-0.266263	0	0	1
258	0.309967	0.350090	0.013803	0.474369	0.093881	0.549168	1	0	0
169	0.309967	0.359658	-0.116263	-0.074953	0.166467	-0.266263	1	0	0
149	-0.864014	-0.712005	-0.194303	-0.575944	-0.196464	-0.538074	0	0	1
...	...	...	...	...	...	...	...	...	...
72	1.483947	1.048584	1.184397	1.077916	-1.103792	-1.081695	1	0	0
88	1.483947	1.029447	0.846225	1.254736	-0.377930	-0.809884	1	0	0
60	-0.864014	-0.520637	-0.376395	-0.671427	1.436726	-1.081695	1	0	0
305	-0.864014	-0.415384	-0.376395	-0.362581	0.166467	0.820978	1	0	0
104	1.483947	1.967153	1.626621	2.273222	-1.103792	-0.809884	1	0	0

	loss	mae	mse	val_loss	val_mae	val_mse	epoch
995	1.783556	0.851859	1.783556	9.792827	2.338215	9.792827	995
996	1.931622	0.917257	1.931622	10.240353	2.394897	10.240353	996
997	1.761907	0.852561	1.761907	9.956141	2.273148	9.956141	997
998	1.816791	0.874138	1.816791	10.000995	2.293658	10.000995	998
999	1.720585	0.842540	1.720585	10.126182	2.285089	10.126182	999

내 블로그 - 관리자 홈 전환	`Q` `Q`
새 글 쓰기	`W` `W`

글 수정 (권한 있는 경우)	`E` `E`
댓글 영역으로 이동	`C` `C`

티스토리툴바

단축키

내 블로그

블로그 게시글

모든 영역

	MPG	Cylinders	Displacement	Horsepower	Weight	Acceleration	Model Year	Origin
0	18.0	8	307.0	130.0	3504.0	12.0	70	1
1	15.0	8	350.0	165.0	3693.0	11.5	70	1
2	18.0	8	318.0	150.0	3436.0	11.0	70	1
3	16.0	8	304.0	150.0	3433.0	12.0	70	1
4	17.0	8	302.0	140.0	3449.0	10.5	70	1

이 페이지의 URL 복사	`S` `S`
맨 위로 이동	`T` `T`
티스토리 홈 이동	`H` `H`
단축키 안내	`Shift` + `/` `⇧` + `/`