!wget https://raw.githubusercontent.com/Datamanim/datarepo/main/bank/train.csv

--2023-07-18 21:15:36--  https://raw.githubusercontent.com/Datamanim/datarepo/main/bank/train.csv
raw.githubusercontent.com (raw.githubusercontent.com) 해석 중... 185.199.111.133, 185.199.110.133, 185.199.108.133, ...
다음으로 연결 중: raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... 연결했습니다.
HTTP 요청을 보냈습니다. 응답 기다리는 중... 200 OK
길이: 1085379 (1.0M) [text/plain]
저장 위치: `train.csv.6'

train.csv.6         100%[===================>]   1.03M  --.-KB/s    / 0.1s     

2023-07-18 21:15:37 (9.04 MB/s) - `train.csv.6' 저장함 [1085379/1085379]


import pandas as pd 

# Path of the training CSV fetched by the wget cell.
ixlsx = "train.csv"
df_raw = pd.read_csv(ixlsx)

# Preview the first rows.
df_raw.head()


# Column names, non-null counts and dtypes.
df_raw.info()


RangeIndex: 12870 entries, 0 to 12869
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ID         12870 non-null  int64 
 1   age        12870 non-null  int64 
 2   job        12870 non-null  object
 3   marital    12870 non-null  object
 4   education  12870 non-null  object
 5   default    12870 non-null  object
 6   balance    12870 non-null  int64 
 7   housing    12870 non-null  object
 8   loan       12870 non-null  object
 9   contact    12870 non-null  object
 10  day        12870 non-null  int64 
 11  month      12870 non-null  object
 12  campaign   12870 non-null  int64 
 13  pdays      12870 non-null  int64 
 14  previous   12870 non-null  int64 
 15  poutcome   12870 non-null  object
 16  y          12870 non-null  object
dtypes: int64(7), object(10)
memory usage: 1.7+ MB


import math

# Round every age down to its decade (e.g. 37 -> 30) and count rows per decade.
(df_raw['age'] // 10 * 10).value_counts()

30    5056
40    3198
50    2244
20    1638
60     460
70     193
80      57
10      18
90       6
Name: age, dtype: int64


# Rows with 25 <= age < 29 whose `housing` column is 'yes' (True counts as 1).
int(((df_raw['age'] >= 25) & (df_raw['age'] < 29) & (df_raw['housing'] == 'yes')).sum())

504


# Rows with 25 <= age < 29 (True counts as 1).
int(((df_raw['age'] >= 25) & (df_raw['age'] < 29)).sum())

1007


# What share of the 25 <= age < 29 band has housing == 'yes'?
# Computed from df_raw instead of dividing two counts copied by hand from earlier
# output, so the ratio stays correct if the data file changes.
(df_raw.loc[(df_raw['age'] >= 25) & (df_raw['age'] < 29), 'housing'] == 'yes').mean()

0.5004965243296922


df_raw.info() # The object-dtype columns are the non-numeric features.


RangeIndex: 12870 entries, 0 to 12869
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ID         12870 non-null  int64 
 1   age        12870 non-null  int64 
 2   job        12870 non-null  object
 3   marital    12870 non-null  object
 4   education  12870 non-null  object
 5   default    12870 non-null  object
 6   balance    12870 non-null  int64 
 7   housing    12870 non-null  object
 8   loan       12870 non-null  object
 9   contact    12870 non-null  object
 10  day        12870 non-null  int64 
 11  month      12870 non-null  object
 12  campaign   12870 non-null  int64 
 13  pdays      12870 non-null  int64 
 14  previous   12870 non-null  int64 
 15  poutcome   12870 non-null  object
 16  y          12870 non-null  object
dtypes: int64(7), object(10)
memory usage: 1.7+ MB


# Number of distinct values for every non-integer column.
# is_integer_dtype is used instead of comparing against the string 'int', whose
# meaning (int32 vs int64) depends on the platform.
dic_cols = {
    column: df_raw[column].nunique()
    for column in df_raw.columns
    if not pd.api.types.is_integer_dtype(df_raw[column])
}

# One row per column, sorted by its unique-value count (stored in column 0).
dft_cols = pd.DataFrame.from_dict(dic_cols, orient='index').sort_values(by=0, ascending=False)
display(dft_cols.head())  # head must be called; passing the bound method shows its repr instead

# max_index holds the largest unique-value count (not an index label).
max_index = dft_cols[0].max()
dft_cols[dft_cols[0] == max_index]  # show every column tied for the maximum


df_raw['job'].value_counts()

management       2858
blue-collar      2571
technician       2141
admin.           1464
services         1043
retired           770
self-employed     454
unemployed        414
entrepreneur      383
student           358
housemaid         334
unknown            80
Name: job, dtype: int64


# Among rows whose balance is at least the overall mean, take the 100 rows with
# the largest ID and average their balance.
(
    df_raw.loc[df_raw['balance'] >= df_raw['balance'].mean()]
    .sort_values(by='ID', ascending=False)
    .head(100)['balance']
    .mean()
)

3473.73


# Row count per (month, day) pair, most frequent pair first.
dft_max = (
    df_raw
    .groupby(['month', 'day'])['ID']
    .count()
    .sort_values(ascending=False)
)
display(dft_max)

# The first index entry is the (month, day) pair with the most rows.
dft_max.index[0]

month  day
may    15     301
       14     283
       13     257
       7      239
nov    21     221
             ... 
mar    19       1
oct    4        1
dec    16       1
sep    5        1
dec    30       1
Name: ID, Length: 303, dtype: int64

('may', 15)


from scipy import stats

# Ages of the rows whose job is 'unknown'.
dft_job_unknown_age = df_raw.loc[df_raw['job'].eq('unknown'), 'age'].copy()

# Shapiro-Wilk normality test on those ages.
shapiro_test = stats.shapiro(dft_job_unknown_age)

shapiro_test

ShapiroResult(statistic=0.9784717559814453, pvalue=0.1961131989955902)


# Histogram of the ages of the job == 'unknown' rows.
dft_job_unknown_age.hist()


# Correlation between age and balance (Series.corr default method).
df_raw['age'].corr(df_raw['balance'])

0.10198734763981504


# Contingency table: education level (rows) by target y (columns).
cross_table = pd.crosstab(df_raw['education'], df_raw['y'])
cross_table


from scipy.stats import chi2_contingency
# Chi-square test of independence on the table; p is the p-value of the test.
chi2 , p ,dof, expected = chi2_contingency(cross_table)
p

7.901201277473551e-29


# Row count per marital status.
df_raw['marital'].value_counts()

married     7490
single      3905
divorced    1475
Name: marital, dtype: int64


pd.crosstab(df_raw['job'], df_raw['marital']) # preview of the result we are after


# Long-format counts per (job, marital) pair; after reset_index the count column is named 0.
dft_job_marital = df_raw.groupby(['job','marital']).size().reset_index()
dft_job_marital.head()


# Wide table: one row per job, one column per marital status.
# groupby only emits pairs that actually occur, so a pair with no rows would pivot
# to NaN and make its ratio NaN; fill_value=0 records it as a real zero count.
# Each pair appears once in dft_job_marital, so aggfunc='sum' just carries the count over.
dft_jm = dft_job_marital.pivot_table(
    index='job', columns='marital', values=0, aggfunc='sum', fill_value=0
).reset_index()
dft_jm.head()


# Divorced-to-married ratio per job.
dft_jm['ratio'] = dft_jm['divorced'] / dft_jm['married']
dft_jm.head()


# The two jobs with the highest ratio.
dft_jm.sort_values(by='ratio', ascending=False).head(2)


df_raw.info()


RangeIndex: 12870 entries, 0 to 12869
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ID         12870 non-null  int64 
 1   age        12870 non-null  int64 
 2   job        12870 non-null  object
 3   marital    12870 non-null  object
 4   education  12870 non-null  object
 5   default    12870 non-null  object
 6   balance    12870 non-null  int64 
 7   housing    12870 non-null  object
 8   loan       12870 non-null  object
 9   contact    12870 non-null  object
 10  day        12870 non-null  int64 
 11  month      12870 non-null  object
 12  campaign   12870 non-null  int64 
 13  pdays      12870 non-null  int64 
 14  previous   12870 non-null  int64 
 15  poutcome   12870 non-null  object
 16  y          12870 non-null  object
dtypes: int64(7), object(10)
memory usage: 1.7+ MB


df_preprocess = df_raw.copy()

# String copy of campaign so it can be tested like the other categorical columns.
df_preprocess['str_campaign'] = df_preprocess['campaign'].astype('str')

coi = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'str_campaign', 'poutcome']
cot = ['y']


# Chi-square test of independence between each listed feature and y (5% level).
for feature in coi:
    label = feature.upper()
    contingency_table = pd.crosstab(df_preprocess[feature], df_preprocess['y'])
    chi2, p, dof, expected = chi2_contingency(contingency_table)
    print(f"Feature {label}: Chi-square value = {chi2}, p-value = {p}")
    verdict = "SIGNIFICANT" if p < 0.05 else "insignificant"
    print(f"{label} is {verdict}")


# Turn the numeric columns into coarse grades: age in decades, balance in hundreds
# (floor division, so negative balances round down as well).
df_preprocess = df_preprocess.drop(columns='str_campaign')
df_preprocess['age'] = df_preprocess['age'] // 10
df_preprocess['balance'] = df_preprocess['balance'] // 100

Feature JOB: Chi-square value = 445.6149545516124, p-value = 1.2347805365874967e-88
JOB is SIGNIFICANT
Feature MARITAL: Chi-square value = 112.78861212877688, p-value = 3.2230279036603607e-25
MARITAL is SIGNIFICANT
Feature EDUCATION: Chi-square value = 133.87601246919445, p-value = 7.901201277473551e-29
EDUCATION is SIGNIFICANT
Feature DEFAULT: Chi-square value = 15.034769518623209, p-value = 0.0001055485658896835
DEFAULT is SIGNIFICANT
Feature HOUSING: Chi-square value = 481.2516893583131, p-value = 1.141246856051449e-106
HOUSING is SIGNIFICANT
Feature LOAN: Chi-square value = 115.12933233581707, p-value = 7.372869938892642e-27
LOAN is SIGNIFICANT
Feature CONTACT: Chi-square value = 660.7373487393592, p-value = 3.3320209821978656e-144
CONTACT is SIGNIFICANT
Feature STR_CAMPAIGN: Chi-square value = 239.55703590479362, p-value = 2.541932665635015e-32
STR_CAMPAIGN is SIGNIFICANT
Feature POUTCOME: Chi-square value = 1352.8799481901515, p-value = 4.940174194847549e-293
POUTCOME is SIGNIFICANT


# Frequency of each value, drawn as a line over the values in ascending order.
# value_counts() orders by frequency, so every plot (not only pdays) needs
# sort_index() to keep the line from zig-zagging back and forth along the x axis.
df_preprocess['previous'].value_counts().sort_index().plot()


df_preprocess['pdays'].value_counts().sort_index(ascending=True).plot()


df_preprocess['campaign'].value_counts().sort_index().plot()


# The ten most frequent campaign values.
df_preprocess['campaign'].value_counts().head(10)

1     5222
2     3493
3     1556
4      990
5      458
6      337
7      190
8      152
9       87
10      72
Name: campaign, dtype: int64


# Outcome counts per job (rows: job, columns: y) plus the share of 'yes' in percent.
dft_job = df_preprocess.groupby(['job', 'y']).size().unstack()
dft_job['%yes'] = dft_job['yes'] / (dft_job['no'] + dft_job['yes']) * 100
dft_job.sort_values(by="%yes", ascending=False)


# Outcome counts per balance value, drawn as one line per y class.
# NOTE(review): .plot() returns the matplotlib Axes, so dft_balance holds the Axes
# (not a DataFrame) and the print below only shows its repr.
dft_balance = df_preprocess.groupby(['balance', 'y']).size().unstack().plot()
print(dft_balance)

AxesSubplot(0.125,0.125;0.775x0.755)


# Bar chart of the row count per target class.
df_preprocess['y'].value_counts().plot(kind='bar')


from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import *
from sklearn.feature_selection import SelectKBest, f_classif
import numpy as np
import matplotlib.pyplot as plt

# Drop the ID column.
df_refined = df_preprocess.drop(columns='ID').copy()

# Feature selection: keep only the columns of interest (the target y included).
coi = ['age', 'job', 'marital', 'education', 'default', 'contact', 'day', 'month', 'pdays', 'previous', 'y']
cor = [x for x in df_refined.columns if x not in coi]
df_refined = df_refined.drop(columns=cor)

# Feature transforms
if 'previous' in coi:
    # Binarise: 0 stays 0, any other value becomes 1.
    df_refined['previous'] = (df_refined['previous'] != 0).astype('int64')
if 'pdays' in coi:
    # The data preview shows pdays == -1 (not 0) on rows without a value, presumably
    # "never contacted before". Comparing against 0 therefore mapped those rows to 1 as
    # well and left the feature constant; test the sign instead.
    df_refined['pdays'] = (df_refined['pdays'] >= 0).astype('int64')
if 'campaign' in coi:
    # NOTE(review): inactive while 'campaign' is not in coi. As written this maps 1-4 to 5
    # and keeps larger values unchanged; confirm whether a cap at 5 was intended.
    df_refined['campaign'] = df_refined['campaign'].apply(lambda row: row if row>4 else 5)
    df_refined['campaign'] = df_refined['campaign'].astype('str')

# Separate the target label from the features.
y_train = df_refined.pop('y')

# Random forest needs no reference class, so one-hot encode every category without dropping one.
x_train_raw = df_refined.copy()
x_train = pd.get_dummies(x_train_raw)


# Split into train and test sets.
# stratify keeps the class ratio of y the same in both splits; it does not rebalance the classes.
x_train, x_test, y_train, y_test = train_test_split(
    x_train, y_train, stratify=y_train, test_size=0.3, random_state=4
)

print(len(x_train))
print(len(x_test))

# Fixed seed so the reported accuracies are reproducible between runs.
model = RandomForestClassifier(random_state=4)

# Train
model.fit(x_train, y_train)

# Predict
y_pred = model.predict(x_test)

# Report accuracy on both splits.
print(f'훈련세트 정확도: {model.score(x_train, y_train):.3f}')
print(f'테스트세트 정확도: {model.score(x_test, y_test):.3f}')

9009
3861
훈련세트 정확도: 0.950
테스트세트 정확도: 0.727


from xgboost import XGBClassifier

# XGBoost does not encode string labels on its own, so dummy-encode the label y.
# drop_first=True drops the first category's indicator column.
y_train = pd.get_dummies(y_train, drop_first=True)
y_test = pd.get_dummies(y_test, drop_first=True)

model = XGBClassifier(random_state=11)

# Train, then predict on the held-out split.
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

# Report accuracy on both splits.
print(f'훈련세트 정확도: {model.score(x_train, y_train):.3f}')
print(f'테스트세트 정확도: {model.score(x_test, y_test):.3f}')

훈련세트 정확도: 0.856
테스트세트 정확도: 0.775


import numpy as np
from tensorflow import keras  # tensorflow에 비해서 keras버전이 너무 높을 때에는 tensorflow에서 직접 import!
from tensorflow.keras import layers
from tensorflow.python.client import device_lib
# print(device_lib.list_local_devices())

hidden_nodes = 6  # alternative that was tried: len(x_train.columns)


def build_model(n_features=None, first_hidden=None, second_hidden=38):
    """Build and compile the binary classifier network.

    The network is Dense(first_hidden, relu) -> Dense(second_hidden, relu) ->
    Dense(1, sigmoid), compiled with binary cross-entropy, Adam and accuracy.

    Args:
        n_features: Number of input columns. Defaults to the width of the
            module-level ``x_train`` at call time.
        first_hidden: Units in the first hidden layer. Defaults to the
            module-level ``hidden_nodes`` at call time.
        second_hidden: Units in the second hidden layer.

    Returns:
        The compiled ``keras.Sequential`` model.
    """
    # Resolve the defaults lazily so build_model() with no arguments behaves as before.
    if n_features is None:
        n_features = len(x_train.columns)
    if first_hidden is None:
        first_hidden = hidden_nodes

    model = keras.Sequential([
        layers.Dense(first_hidden, activation='relu', input_shape=[n_features]),
        layers.Dense(second_hidden, activation='relu'),
        layers.Dense(1, activation='sigmoid'),
    ])

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model
    
model = build_model()

# Early stopping to limit overfitting: monitors val_loss with a patience of 10 epochs.
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)

class PrintDot(keras.callbacks.Callback):
    """Progress callback that prints '## <epoch> ## ' on every 100th epoch."""

    def on_epoch_end(self, epoch, logs=None):
        # logs defaults to None so the override matches the base Callback hook
        # signature; it is not used here.
        if epoch % 100 == 0:
            # end='' keeps all markers on one line.
            print('## %d ## ' % epoch, end='')

# Training options: up to 300 epochs in batches of 100, with 20% of the training
# data held out for validation, no per-epoch logging, and the two callbacks.
fit_options = dict(
    epochs=300,
    batch_size=100,
    validation_split=0.2,
    verbose=0,
    callbacks=[early_stop, PrintDot()],
)
history = model.fit(x_train, y_train, **fit_options)

## 0 ## ## 100 ##


loss, accuracy = model.evaluate(x_test, y_test, verbose=0)  # evaluate binarises the predictions itself before comparing
loss, accuracy

(0.5346112819231721, 0.76016575)


from sklearn.metrics import accuracy_score

# Raw outputs predicted by the model for the test set.
y_pred = model.predict(x_test)

display(y_pred)

# Threshold the outputs at 0.5 to get hard 0/1 labels.
y_pred_binary = np.where(y_pred > 0.5, 1, 0)

# Accuracy of the thresholded predictions against the true labels.
accuracy = accuracy_score(y_test, y_pred_binary)

accuracy

array([[0.1002959 ],
       [0.7983074 ],
       [0.10381174],
       ...,
       [0.15230204],
       [0.22632809],
       [0.19036685]], dtype=float32)

0.7601657601657602


# Per-epoch training metrics as a table, with the epoch number as an extra column.
hist = pd.DataFrame(history.history).assign(epoch=history.epoch)
hist.tail()


import matplotlib.pyplot as plt

def _plot_history_curves(metric, title, ylabel):
    """Plot the training and validation curves of one metric recorded in `history`."""
    plt.plot(history.history[metric])
    plt.plot(history.history['val_' + metric])
    plt.title(title)
    plt.xlabel('Epoch')
    plt.ylabel(ylabel)
    # The val_* series comes from the validation_split hold-out used during fit,
    # not from the test set, so label it as validation.
    plt.legend(['Train', 'Validation'], loc='upper left')
    plt.show()


# 6 Training history (accuracy)
_plot_history_curves('accuracy', 'Model accuracy', 'Accuracy')

# 7 Training history (loss)
_plot_history_curves('loss', 'Model loss', 'Loss')

y	no	yes
education
primary	1424	456
secondary	4555	1813
tertiary	2559	1516
unknown	365	182

marital	job	divorced	married	single
0	admin.	207.0	762.0	495.0
1	blue-collar	205.0	1775.0	591.0
2	entrepreneur	44.0	272.0	67.0
3	housemaid	49.0	245.0	40.0
4	management	323.0	1580.0	955.0

marital	job	divorced	married	single	ratio
0	admin.	207.0	762.0	495.0	0.271654
1	blue-collar	205.0	1775.0	591.0	0.115493
2	entrepreneur	44.0	272.0	67.0	0.161765
3	housemaid	49.0	245.0	40.0	0.200000
4	management	323.0	1580.0	955.0	0.204430

y	no	yes	%yes
job
student	152	206	57.541899
retired	378	392	50.909091
unemployed	260	154	37.198068
management	1876	982	34.359692
self-employed	310	144	31.718062
unknown	55	25	31.250000
admin.	1007	457	31.215847
technician	1511	630	29.425502
services	769	274	26.270374
housemaid	247	87	26.047904
entrepreneur	292	91	23.759791
blue-collar	2046	525	20.420070

	loss	accuracy	val_loss	val_accuracy	epoch
106	0.500487	0.776329	0.504779	0.783019	106
107	0.498914	0.777855	0.504862	0.784684	107
108	0.498912	0.778410	0.504304	0.782464	108
109	0.498502	0.777855	0.506623	0.780244	109
110	0.500490	0.777300	0.505452	0.784684	110

	ID	age	job	marital	education	default	balance	housing	loan	contact	day	month	campaign	pdays	poutcome	y
0	13829	29	technician	single	tertiary	no	18254	no	no	cellular	11	may	2	-1	unknown	no
1	22677	26	services	single	secondary	no	512	yes	yes	unknown	5	jun	3	-1	unknown	no
2	10541	30	management	single	secondary	no	135	no	no	cellular	14	aug	2	-1	unknown	no
3	13689	41	technician	married	unknown	no	30	yes	no	cellular	10	jul	1	-1	unknown	no
4	11304	27	admin.	single	secondary	no	321	no	yes	unknown	2	sep	1	-1	unknown	no

marital	divorced	married	single
job
admin.	207	762	495
blue-collar	205	1775	591
entrepreneur	44	272	67
housemaid	49	245	40
management	323	1580	955
retired	157	572	41
self-employed	39	284	131
services	138	584	321
student	0	24	334
technician	246	1115	780
unemployed	62	219	133
unknown	5	58	17

marital	job	divorced	married	single	ratio
10	unemployed	62.0	219.0	133.0	0.283105
5	retired	157.0	572.0	41.0	0.274476

티스토리툴바