import pandas as pd 

# Load the training CSV into a DataFrame and preview the first rows.
df_raw = pd.read_csv('train_ctrUa4K.csv')
df_raw.head()


df_raw.info() # inspect the column dtypes and non-null counts


RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


df_raw.dropna(inplace=True) # drop, in place, every row that contains a null


df_raw.info() # check how many rows are left afterwards


Int64Index: 480 entries, 1 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            480 non-null    object 
 1   Gender             480 non-null    object 
 2   Married            480 non-null    object 
 3   Dependents         480 non-null    object 
 4   Education          480 non-null    object 
 5   Self_Employed      480 non-null    object 
 6   ApplicantIncome    480 non-null    int64  
 7   CoapplicantIncome  480 non-null    float64
 8   LoanAmount         480 non-null    float64
 9   Loan_Amount_Term   480 non-null    float64
 10  Credit_History     480 non-null    float64
 11  Property_Area      480 non-null    object 
 12  Loan_Status        480 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 52.5+ KB


import matplotlib.pyplot as plt
coi = df_raw.select_dtypes('object').columns.to_list()  # keep only the object-dtype columns
coi.remove('Loan_ID')  # identifier column; not worth plotting

# Two plots per row; the row count is rounded up so an odd number of columns still fits.
fig, axes = plt.subplots(nrows=len(coi)//2 + len(coi)%2, ncols=2, figsize=(6, 15))

# One bar chart of value counts per column of interest.
for ax, col in zip(axes.flatten(), coi):
    df_raw[col].value_counts().plot(kind='bar', ax=ax)
    ax.set_title(col)

# When len(coi) is odd the last grid cell receives no plot; hide its empty frame.
for ax in axes.flatten()[len(coi):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()


import matplotlib.pyplot as plt

# Draw histograms for df_raw via DataFrame.hist, then show the figure.
df_raw.hist(figsize=(7,5))
plt.tight_layout()
plt.show()


from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

# Baseline: train XGBoost on every column except the identifier.
df_data = df_raw.copy()
df_data.drop('Loan_ID', axis=1, inplace=True)

# Separate categorical (object) features from continuous (float) features.
# These are read from df_data: the earlier code referred to a name `data`
# that is never defined in this file, so the split did not reflect df_data.
# NOTE: select_dtypes('float') does not pick up int64 columns such as
# ApplicantIncome, so that column is left unscaled.
col_cate = df_data.select_dtypes('object').columns.to_list()
col_cont = df_data.select_dtypes('float').columns.to_list()

print(col_cate, col_cont)

# Encode each categorical feature as integers (one encoder per column).
for col in col_cate :
    label_encoder = LabelEncoder()
    df_data[col] = label_encoder.fit_transform(df_data[col])

# Scale the continuous features to [0, 1].
# NOTE: the scaler is fit on all rows, i.e. before the train/test split below.
scaler = MinMaxScaler()
df_data[col_cont] = scaler.fit_transform(df_data[col_cont])

# Split the data into features and label.
y_train = df_data.pop('Loan_Status')
x_train = df_data.copy()
x_train = pd.get_dummies(x_train)
y_train = pd.get_dummies(y_train, drop_first=True)

# Stratified 80/20 split on the label.
x_train,x_test,y_train,y_test = train_test_split(x_train, y_train, stratify=y_train, test_size=0.2, random_state=4)

# Configure the model.
model = XGBClassifier(random_state=11)

# Train it.
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

# Report accuracy on the training and test sets.
print('훈련세트 정확도: {:.3f}' .format(model.score(x_train, y_train)))
print('테스트세트 정확도: {:.3f}' .format(model.score(x_test, y_test)))

[] ['CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History']
384 96 384 96
훈련세트 정확도: 1.000
테스트세트 정확도: 0.771


# Cast Credit_History to string and add it to the columns of interest,
# so it is treated like the other categorical columns in `coi`.
df_raw['Credit_History'] =  df_raw['Credit_History'].astype('str')
coi.append('Credit_History')


from scipy.stats import chi2_contingency

print(coi)
lst_coi = []

# Chi-square test of independence between each column of interest and Loan_Status;
# columns with p < 0.05 are collected in lst_coi.
for feature in coi:
    if feature == 'Loan_Status':
        # This is the label itself, so skip it.
        continue

    table = pd.crosstab(df_raw[feature], df_raw['Loan_Status'])
    stat, p_value, _dof, _expected = chi2_contingency(table)
    is_significant = p_value < 0.05

    if not is_significant:
        print(f"{feature} is insignificant")
        continue

    print(f"{feature} is SIGNIFICANT")
    print(f"Chi-square value = {stat}, p-value = {p_value}")
    lst_coi.append(feature)

print("SIGNIFICANT FEATURES")  # show only the significant features
lst_coi

['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area', 'Loan_Status', 'Credit_History']
Gender is insignificant
Married is SIGNIFICANT
Chi-square value = 5.557140235492528, p-value = 0.018405456386355375
Dependents is insignificant
Education is insignificant
Self_Employed is insignificant
Property_Area is SIGNIFICANT
Chi-square value = 12.2259455519901, p-value = 0.0022139594148752133
Credit_History is SIGNIFICANT
Chi-square value = 131.29328312402075, p-value = 2.135981766869101e-30
SIGNIFICANT FEATURES

['Married', 'Property_Area', 'Credit_History']


from sklearn.preprocessing import MinMaxScaler

# Work on a copy so df_raw keeps the unscaled values.
df_preprocess = df_raw.copy()
cols = ['LoanAmount']

# Rescale LoanAmount with MinMaxScaler, overwriting the column in the copy.
scaler = MinMaxScaler()
df_preprocess[cols] = scaler.fit_transform(df_preprocess[cols])  # MinMax Scaling
df_preprocess.head()


from scipy.stats import bartlett

# LoanAmount values split by label: 'Y' rows versus 'N' rows.
df_loan_y = df_preprocess.loc[df_preprocess['Loan_Status'] == 'Y', 'LoanAmount']
df_loan_n = df_preprocess.loc[df_preprocess['Loan_Status'] == 'N', 'LoanAmount']

# Bartlett's test for equal variances between the two groups.
bartlett(df_loan_y, df_loan_n)

BartlettResult(statistic=6.771581019957911, pvalue=0.009262074884700013)


from scipy.stats import ttest_ind
# Two-sample t-test on LoanAmount between the 'Y' and 'N' groups;
# equal_var=False means equal variances are not assumed (Welch's t-test).
ttest_ind(df_loan_y, df_loan_n, equal_var=False)

Ttest_indResult(statistic=-1.4692607777044502, pvalue=0.14305832311596678)


# Keep three selected feature columns plus the Loan_Status label.
coi = ['Married', 'Property_Area', 'Credit_History', 'Loan_Status']

# Copy so later in-place changes do not touch df_preprocess.
df_data = df_preprocess[coi].copy()
df_data.head()


from sklearn.model_selection import train_test_split

print(df_data.head())

# Pop the label out of df_data and one-hot encode it, dropping the first level.
y_train = pd.get_dummies(df_data.pop('Loan_Status'), drop_first=True)
# One-hot encode the remaining feature columns.
x_train = pd.get_dummies(df_data.copy())

# Stratified 80/20 train/test split on the label.
x_train, x_test, y_train, y_test = train_test_split(
    x_train,
    y_train,
    test_size=0.2,
    random_state=4,
    stratify=y_train,
)

len(x_train), len(x_test), len(y_train), len(y_test)

  Married Property_Area Credit_History Loan_Status
1     Yes         Rural            1.0           N
2     Yes         Urban            1.0           Y
3     Yes         Urban            1.0           Y
4      No         Urban            1.0           Y
5     Yes         Urban            1.0           Y

(384, 96, 384, 96)


from xgboost import XGBClassifier

# Fit an XGBoost classifier on the training split.
model = XGBClassifier(random_state=11)
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

# Score on both splits, then report the two accuracies.
train_accuracy = model.score(x_train, y_train)
test_accuracy = model.score(x_test, y_test)
print('훈련세트 정확도: {:.3f}'.format(train_accuracy))
print('테스트세트 정확도: {:.3f}'.format(test_accuracy))

훈련세트 정확도: 0.810
테스트세트 정확도: 0.802

	Loan_ID	Gender	Married	Dependents	Education	Self_Employed	ApplicantIncome	CoapplicantIncome	LoanAmount	Loan_Amount_Term	Credit_History	Property_Area	Loan_Status
0	LP001002	Male	No	0	Graduate	No	5849	0.0	NaN	360.0	1.0	Urban	Y
1	LP001003	Male	Yes	1	Graduate	No	4583	1508.0	128.0	360.0	1.0	Rural	N
2	LP001005	Male	Yes	0	Graduate	Yes	3000	0.0	66.0	360.0	1.0	Urban	Y
3	LP001006	Male	Yes	0	Not Graduate	No	2583	2358.0	120.0	360.0	1.0	Urban	Y
4	LP001008	Male	No	0	Graduate	No	6000	0.0	141.0	360.0	1.0	Urban	Y

티스토리툴바