# IPython shell escape: fetch the bumba5341/advertisingcsv dataset with the
# kaggle command-line tool (which must be available on PATH).
!kaggle datasets download -d bumba5341/advertisingcsv


# IPython shell escape: unpack the downloaded archive into the working
# directory; it yields Advertising.csv.
!unzip advertisingcsv.zip

Archive:  advertisingcsv.zip
  inflating: Advertising.csv


import pandas as pd
# Load the advertising data from the working directory. The file has an
# unnamed first column (1, 2, 3, ...) that pandas labels "Unnamed: 0".
df_raw = pd.read_csv("./Advertising.csv")
# Preview the first five rows.
df_raw.head()


# Remove the unnamed first column that read_csv labelled "Unnamed: 0".
# errors="ignore" plus reassignment (instead of inplace=True) keeps this cell
# re-runnable: a second run no longer raises KeyError once the column is gone.
df_raw = df_raw.drop(columns="Unnamed: 0", errors="ignore")
df_raw.head()


# Dimensions of the frame as (rows, columns).
df_raw.shape

(200, 4)


# Per-column dtype and non-null count, to check for missing values.
df_raw.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   TV         200 non-null    float64
 1   Radio      200 non-null    float64
 2   Newspaper  200 non-null    float64
 3   Sales      200 non-null    float64
dtypes: float64(4)
memory usage: 6.4 KB


# Summary statistics (count, mean, std, min, quartiles, max) for each column.
df_raw.describe()


# Draw a histogram for each column on a single 10x7 figure to inspect the
# distributions.
df_raw.hist(figsize=(10,7))

array([[,
        ],
       [,
        ]],
      dtype=object)


# Pairwise correlation matrix of all columns (pandas defaults to Pearson).
df_raw.corr()


import matplotlib.pyplot as plt

# One scatter plot per advertising channel, each against Sales.
for channel in ('TV', 'Radio', 'Newspaper'):
    df_raw.plot.scatter(x=channel, y='Sales')


from statsmodels.formula.api import ols

# Fit ordinary least squares with Sales as the response and all three
# advertising columns as predictors, then display the regression report.
model_ols_full = ols(
    formula="Sales ~ TV + Radio + Newspaper",
    data=df_raw,
).fit()
model_ols_full.summary()


# Refit without Newspaper, whose coefficient in the full model had p = 0.860,
# and display the regression report for the two-predictor model.
model_ols_featured = ols(
    formula="Sales ~ TV + Radio",
    data=df_raw,
).fit()
model_ols_featured.summary()


import math

# Add a natural-log version of the Newspaper column and plot it against Sales.
# math.log is applied element-wise, so it raises on any non-positive value.
df_raw['Newspaper_log'] = df_raw['Newspaper'].apply(math.log)
df_raw.plot.scatter(x='Newspaper_log', y='Sales')


# Predict Sales for TV=1000 and Radio=1000 with the two-predictor model.
# NOTE(review): both inputs lie far beyond the observed data (TV max 296.4,
# Radio max 49.6), so this is an extrapolation of the linear fit rather than
# an estimate supported by the sample.
model_ols_featured.predict({'TV': 1000, 'Radio':1000})

0    236.670142
dtype: float64

	Unnamed: 0	TV	Radio	Newspaper	Sales
0	1	230.1	37.8	69.2	22.1
1	2	44.5	39.3	45.1	10.4
2	3	17.2	45.9	69.3	9.3
3	4	151.5	41.3	58.5	18.5
4	5	180.8	10.8	58.4	12.9

	TV	Radio	Newspaper	Sales
0	230.1	37.8	69.2	22.1
1	44.5	39.3	45.1	10.4
2	17.2	45.9	69.3	9.3
3	151.5	41.3	58.5	18.5
4	180.8	10.8	58.4	12.9

	TV	Radio	Newspaper	Sales
count	200.000000	200.000000	200.000000	200.000000
mean	147.042500	23.264000	30.554000	14.022500
std	85.854236	14.846809	21.778621	5.217457
min	0.700000	0.000000	0.300000	1.600000
25%	74.375000	9.975000	12.750000	10.375000
50%	149.750000	22.900000	25.750000	12.900000
75%	218.825000	36.525000	45.100000	17.400000
max	296.400000	49.600000	114.000000	27.000000

	TV	Radio	Newspaper	Sales
TV	1.000000	0.054809	0.056648	0.782224
Radio	0.054809	1.000000	0.354104	0.576223
Newspaper	0.056648	0.354104	1.000000	0.228299
Sales	0.782224	0.576223	0.228299	1.000000

Dep. Variable:	Sales	R-squared:	0.897
Model:	OLS	Adj. R-squared:	0.896
Method:	Least Squares	F-statistic:	570.3
Date:		Prob (F-statistic):	1.58e-96
Time:		Log-Likelihood:	-386.18
No. Observations:	200	AIC:	780.4
Df Residuals:	196	BIC:	793.6
Df Model:	3
Covariance Type:	nonrobust

	coef	std err	t	P>|t|	[0.025	0.975]
Intercept	2.9389	0.312	9.422	0.000	2.324	3.554
TV	0.0458	0.001	32.809	0.000	0.043	0.049
Radio	0.1885	0.009	21.893	0.000	0.172	0.206
Newspaper	-0.0010	0.006	-0.177	0.860	-0.013	0.011

Omnibus:	60.414	Durbin-Watson:	2.084
Prob(Omnibus):	0.000	Jarque-Bera (JB):	151.241
Skew:	-1.327	Prob(JB):	1.44e-33
Kurtosis:	6.332	Cond. No.	454.

Omnibus:	60.022	Durbin-Watson:	2.081
Prob(Omnibus):	0.000	Jarque-Bera (JB):	148.679
Skew:	-1.323	Prob(JB):	5.19e-33
Kurtosis:	6.292	Cond. No.	425.

티스토리툴바