#importing required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
#to ignore warnings
import warnings
warnings.filterwarnings('ignore')

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive

train = pd.read_csv("/content/wage2015_train.csv") # importing train dataset
test = pd.read_csv("/content/wage2015_test.csv")# importing test dataset

train.head() # first 5 records of training dataset

test.head() # first 5 records of test dataset

train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6348 entries, 0 to 6347
Data columns (total 37 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   wage      6348 non-null   float64
 1   lwage     6348 non-null   float64
 2   sex       6348 non-null   int64  
 3   white     6348 non-null   int64  
 4   black     6348 non-null   int64  
 5   hisp      6348 non-null   int64  
 6   shs       6348 non-null   int64  
 7   hsg       6348 non-null   int64  
 8   scl       6348 non-null   int64  
 9   clg       6348 non-null   int64  
 10  mw        6348 non-null   int64  
 11  so        6348 non-null   int64  
 12  we        6348 non-null   int64  
 13  union     6348 non-null   int64  
 14  vet       6348 non-null   int64  
 15  cent      6348 non-null   int64  
 16  ncent     6348 non-null   int64  
 17  fam1      6348 non-null   int64  
 18  fam2      6348 non-null   int64  
 19  fam3      6348 non-null   int64  
 20  child     6348 non-null   int64  
 21  fborn     6348 non-null   int64  
 22  cit       6348 non-null   int64  
 23  school    6348 non-null   int64  
 24  pens      6348 non-null   int64  
 25  fsize10   6348 non-null   int64  
 26  fsize100  6348 non-null   int64  
 27  health    6348 non-null   int64  
 28  age       6348 non-null   int64  
 29  exp1      6348 non-null   float64
 30  exp2      6348 non-null   float64
 31  exp3      6348 non-null   float64
 32  exp4      6348 non-null   float64
 33  occ       6348 non-null   float64
 34  occ2      6348 non-null   int64  
 35  ind       6348 non-null   float64
 36  ind2      6348 non-null   int64  
dtypes: float64(8), int64(29)
memory usage: 1.8 MB

# Printing the summary statistics for the dataset
train.describe().T

fig = plt.figure(figsize=(10,4))

#  subplot #1
plt.subplot(121)
sns.histplot(data = train, x = 'wage')

#  subplot #2
plt.subplot(122)
sns.boxplot(data = train, x = 'wage')

plt.show()

fig = plt.figure(figsize=(10,4))

#  subplot #1
plt.subplot(121)
sns.histplot(data = train, x = 'lwage')

#  subplot #2
plt.subplot(122)
sns.boxplot(data = train, x = 'lwage')

plt.show()

sns.displot(x='age',data=train)
plt.show()

sns.countplot(x="sex", data=train)
plt.show()

# Linear Control Variables. Use This Control Variables for Tree Based Machine Learning Methods.
x_col = ["sex","white","black","hisp","shs","hsg","scl","clg","mw","so","we","union","vet","cent","ncent","fam1","fam2","fam3","child","fborn","cit","school","pens","fsize10","fsize100","health","age","exp1","occ2","ind2"]
# Quadratic Control Variables (Flexible Specification). Use This Control Variables for Linear methods
xL_col = ["sex","white","black","hisp","shs","hsg","scl","clg","mw","so","we","union","vet","cent","ncent","fam1","fam2","fam3","child","fborn","cit","school","pens","fsize10","fsize100","health","age","exp1","exp2","exp3","exp4","occ2","ind2"]

X_train = train[x_col]
X_test = test[x_col]

# outcome variable log wage
y_train = train['lwage']
y_test = test['lwage']

# preparing the quadratic large specification data for train and test

from Scikit-learn.preprocessing import PolynomialFeatures
# Initialize the PolynomialFeatures object

poly = PolynomialFeatures(interaction_only=True, include_bias=False)

# Preparing the data
XL_train = train[xL_col]
XL_test = test[xL_col]

# Transform the training data
XL_train_poly = poly.fit_transform(XL_train)
XL_train_poly = pd.DataFrame(XL_train_poly, columns=poly.get_feature_names_out(XL_train.columns))

# Transform the test data
XL_test_poly = poly.transform(XL_test)
XL_test_poly = pd.DataFrame(XL_test_poly, columns=poly.get_feature_names_out(XL_test.columns))

from Scikit-learn.linear_model import LinearRegression
# Linear Model: Quadratic (Large) Specification without intercept
model_train_L = LinearRegression(fit_intercept=False)
model_train_L.fit(XL_train,y_train)
model_test_L = LinearRegression(fit_intercept=False)
model_test_L.fit(XL_test,y_test)


# Linear Model: Linear Specification without intercept
model_train = LinearRegression(fit_intercept=False)
model_train.fit(X_train,y_train)
model_test = LinearRegression(fit_intercept=False)
model_test.fit(X_test,y_test)

LinearRegression(fit_intercept=False)

LinearRegression(fit_intercept=False)

# Linear Model: Linear Specification with intercept
model_lm = LinearRegression()
model_lm.fit(X_train,y_train)

# Linear Model: Quadratic (Large) Specification with intercept
model_lm2 = LinearRegression()
model_lm2.fit(XL_train,y_train)

LinearRegression()

LinearRegression()

from Scikit-learn.linear_model import Lasso, LassoCV, RidgeCV, ElasticNetCV

# Basic Lasso model on original data
model_lasso = Lasso()
model_lasso.fit(X_train, y_train)

# LassoCV model on original data
# cv - number of cross validations to be checked, n_jobs = -1 - use all the parallel processing power of the system
model_lasso_cv = LassoCV(alphas=[0.1, 1, 10], cv=11, n_jobs=-1)
model_lasso_cv.fit(X_train, y_train)

# RidgeCV model on original data
model_ridge_cv = RidgeCV(alphas=[0.1, 1, 10], cv=11)
model_ridge_cv.fit(X_train, y_train)

# ElasticNetCV model on original data
model_elasticnet_cv = ElasticNetCV(alphas=[0.5], cv=11, n_jobs=-1)
model_elasticnet_cv.fit(X_train, y_train)

ElasticNetCV(alphas=[0.5], cv=11, n_jobs=-1)

ElasticNetCV(alphas=[0.5], cv=11, n_jobs=-1)

# basic lasso model on quadratic transform data
model_lasso = Lasso()
model_lasso.fit(X_train, y_train)

# LassoCV model on original data
model_lasso_cv = LassoCV(alphas=[1], cv=11, n_jobs=-1)
model_lasso_cv.fit(X_train, y_train)

# RidgeCV model on quadratic transform data
# Provide a list of positive alpha values for RidgeCV
model_ridge_cv_L = RidgeCV(alphas=[0.1, 1.0, 10.0], cv=11)
model_ridge_cv_L.fit(XL_train, y_train)

# ElasticNetCV model on original data
model_elasticnet_cv = ElasticNetCV(alphas=[0.5], cv=11, n_jobs=-1)
model_elasticnet_cv.fit(X_train, y_train)

ElasticNetCV(alphas=[0.5], cv=11, n_jobs=-1)

ElasticNetCV(alphas=[0.5], cv=11, n_jobs=-1)

from Scikit-learn.ensemble import RandomForestRegressor, GradientBoostingRegressor
# random forest regresison on original data

rf = RandomForestRegressor(n_estimators=2000,min_samples_leaf=5) # n_estimators - The number of trees in the forest, min_samples_leaf - The minimum number of samples required to be at a leaf node.
rf.fit(X_train,y_train)

# gradient boost regresison on original data
gbr = GradientBoostingRegressor(n_estimators=1000,min_weight_fraction_leaf=0.5,max_depth=2,learning_rate=0.01)
# max_depth - The maximum depth of the tree
# min_weight_fraction_leaf - The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node

gbr.fit(X_train,y_train)

GradientBoostingRegressor(learning_rate=0.01, max_depth=2,
                          min_weight_fraction_leaf=0.5, n_estimators=1000)

GradientBoostingRegressor(learning_rate=0.01, max_depth=2,
                          min_weight_fraction_leaf=0.5, n_estimators=1000)

from Scikit-learn.tree import DecisionTreeRegressor
# decision tree regresison on original data
dt = DecisionTreeRegressor()
dt.fit(X_train,y_train)
dt_pruned = DecisionTreeRegressor(max_depth=5)
dt_pruned.fit(X_train,y_train)

DecisionTreeRegressor(max_depth=5)

DecisionTreeRegressor(max_depth=5)

# predictions on test data for all the models build so far
y_pred_lm = model_lm.predict(X_test)
y_pred_lm2 = model_lm2.predict(XL_test)
y_pred_lasso = model_lasso.predict(X_test)
y_pred_lasso_cv = model_lasso_cv.predict(X_test)
y_pred_ridge_cv = model_ridge_cv.predict(X_test)
y_pred_elasticnet_cv = model_elasticnet_cv.predict(X_test)
y_pred_lasso_L = model_lasso_L.predict(XL_test)
y_pred_lasso_cv_L = model_lasso_cv_L.predict(XL_test)
y_pred_ridge_cv_L = model_ridge_cv_L.predict(XL_test)
y_pred_elasticnet_cv_L = model_elasticnet_cv_L.predict(XL_test)
y_pred_rf = rf.predict(X_test)
y_pred_gbr = gbr.predict(X_test)
y_pred_dt = dt.predict(X_test)
y_pred_dt_pruned = dt_pruned.predict(X_test)

from Scikit-learn.metrics import r2_score,mean_squared_error
# linear model metrics
lm_r2 = r2_score(y_test,y_pred_lm)
lm_mse = mean_squared_error(y_test,y_pred_lm)
print("lm_r2",lm_r2)
print("lm_mse",lm_mse)
print()

# quadratic model metrics
lm2_r2 = r2_score(y_test,y_pred_lm2)
lm2_mse = mean_squared_error(y_test,y_pred_lm2)
print("lm2_r2",lm2_r2)
print("lm2_mse",lm2_mse)
print()

# lasso model metrics
lasso_r2 = r2_score(y_test,y_pred_lasso)
lasso_mse = mean_squared_error(y_test,y_pred_lasso)
print("lasso_r2",lasso_r2)
print("lasso_mse",lasso_mse)
print()

# lassocv model metrics
lasso_cv_r2 = r2_score(y_test,y_pred_lasso_cv)
lasso_cv_mse = mean_squared_error(y_test,y_pred_lasso_cv)
print("lasso_cv_r2",lasso_cv_r2)
print("lasso_cv_mse",lasso_cv_mse)
print()

# ridgecv model metrics
ridge_cv_r2 = r2_score(y_test,y_pred_ridge_cv)
ridge_cv_mse = mean_squared_error(y_test,y_pred_ridge_cv)
print("ridge_cv_r2",ridge_cv_r2)
print("ridge_cv_mse",ridge_cv_mse)
print()

# elasticnetcv model metrics
elasticnet_cv_r2 = r2_score(y_test,y_pred_elasticnet_cv)
elasticnet_cv_mse = mean_squared_error(y_test,y_pred_elasticnet_cv)
print("elasticnet_cv_r2",elasticnet_cv_r2)
print("elasticnet_cv_mse",elasticnet_cv_mse)
print()

# lasso on quadratic data metrics
lasso_L_r2 = r2_score(y_test,y_pred_lasso_L)
lasso_L_mse = mean_squared_error(y_test,y_pred_lasso_L)
print("lasso_L_r2",lasso_L_r2)
print("lasso_L_mse",lasso_L_mse)
print()

# lassocv on quadratic data metrics
lasso_cv_L_r2 = r2_score(y_test,y_pred_lasso_cv_L)
lasso_cv_L_mse = mean_squared_error(y_test,y_pred_lasso_cv_L)
print("lasso_cv_L_r2",lasso_cv_L_r2)
print("lasso_cv_L_mse",lasso_cv_L_mse)
print()

# ridgecv on quadratic data metrics
ridge_cv_L_r2 = r2_score(y_test,y_pred_ridge_cv_L)
ridge_cv_L_mse = mean_squared_error(y_test,y_pred_ridge_cv_L)
print("ridge_cv_L_r2",ridge_cv_L_r2)
print("ridge_cv_L_mse",ridge_cv_L_mse)
print()

# elasticnetcv on quadratic data metrics
elasticnet_cv_L_r2 = r2_score(y_test,y_pred_elasticnet_cv_L)
elasticnet_cv_L_mse = mean_squared_error(y_test,y_pred_elasticnet_cv_L)
print("elasticnet_cv_L_r2",elasticnet_cv_L_r2)
print("elasticnet_cv_L_mse",elasticnet_cv_L_mse)
print()

# randomforest model metrics
rf_r2 = r2_score(y_test,y_pred_rf)
rf_mse = mean_squared_error(y_test,y_pred_rf)
print("rf_r2",rf_r2)
print("rf_mse",rf_mse)
print()

# gradientboost model metrics
gbr_r2 = r2_score(y_test,y_pred_gbr)
gbr_mse = mean_squared_error(y_test,y_pred_gbr)
print("gbr_r2",gbr_r2)
print("gbr_mse",gbr_mse)
print()

# decision tree model metrics
dt_r2 = r2_score(y_test,y_pred_dt)
dt_mse = mean_squared_error(y_test,y_pred_dt)
print("dt_r2",dt_r2)
print("dt_mse",dt_mse)
print()

# pruned decision tree model metrics
dt_pruned_r2 = r2_score(y_test,y_pred_dt_pruned)
dt_pruned_mse = mean_squared_error(y_test,y_pred_dt_pruned)
print("dt_pruned_r2",dt_pruned_r2)
print("dt_pruned_mse",dt_pruned_mse)

lm_r2 0.2564591769934085
lm_mse 0.3127664499897118

lm2_r2 0.2602708126287935
lm2_mse 0.31116310595069446

lasso_r2 0.04811131393532875
lasso_mse 0.4004068585258824

lasso_cv_r2 0.04811131393532875
lasso_cv_mse 0.4004068585258824

ridge_cv_r2 0.25655045286270683
ridge_cv_mse 0.31272805528598224

elasticnet_cv_r2 0.11137542178631354
elasticnet_cv_mse 0.3737951516604702

lasso_L_r2 0.04811131393532875
lasso_L_mse 0.4004068585258824

lasso_cv_L_r2 0.04811131393532875
lasso_cv_L_mse 0.4004068585258824

ridge_cv_L_r2 0.26035753454659527
ridge_cv_L_mse 0.31112662684218034

elasticnet_cv_L_r2 0.12063635718219501
elasticnet_cv_L_mse 0.36989958897214126

rf_r2 0.28773110798677815
rf_mse 0.29961208033241493

gbr_r2 -0.0002358120438648026
gbr_mse 0.420743817159267

dt_r2 -0.29610208022972073
dt_mse 0.5451983723214303

dt_pruned_r2 0.21722036231287212
dt_pruned_mse 0.32927204644077146

# Convert notebook to html
!jupyter nbconvert --to html "/content/drive/My Drive/Colab Notebooks/Copy of FDS_Project_LearnerNotebook_FullCode.ipynb"

	wage	lwage	sex	white	scl	clg	...	health	age	exp1	exp2	exp3	exp4	occ	occ2	ind	ind2
0	11.639676	2.454420	0	1	1	0	...	1	62	40.0	16.00	64.000	256.0000	4620.0	15	8290.0	18
1	26.442308	3.274965	0	1	0	1	...	1	23	0.0	0.00	0.000	0.0000	2300.0	8	8470.0	18
2	20.000000	2.995732	1	1	0	1	...	1	31	7.0	0.49	0.343	0.2401	2720.0	9	8590.0	19
3	26.442308	3.274965	1	1	0	1	...	0	31	7.0	0.49	0.343	0.2401	5300.0	17	8660.0	20
4	25.480769	3.237924	0	1	0	1	...	1	28	4.0	0.16	0.064	0.0256	40.0	1	7470.0	14

	wage	lwage	sex	white	hsg	scl	clg	...	health	age	exp1	exp2	exp3	exp4	occ	occ2	ind	ind2
0	9.615385	2.263364	0	1	0	0	1	...	1	31	7.0	0.49	0.343	0.2401	3600.0	11	8370.0	18
1	11.538462	2.445686	1	1	0	1	0	...	1	35	13.0	1.69	2.197	2.8561	6050.0	18	170.0	1
2	28.846154	3.361977	0	1	0	0	1	...	0	46	22.0	4.84	10.648	23.4256	2015.0	6	9470.0	22
3	19.230769	2.956512	1	1	1	0	0	...	1	57	37.0	13.69	50.653	187.4161	5240.0	17	5680.0	9
4	11.500000	2.442347	1	1	1	0	0	...	0	23	3.0	0.09	0.027	0.0081	8740.0	21	7790.0	16

	count	mean	std	min	25%	50%	75%	max
wage	6348.0	18.151370	10.390258	0.015385	10.576923	15.384615	23.367102	57.692308
lwage	6348.0	2.735925	0.602504	-4.174387	2.358675	2.733368	3.151328	4.055124
sex	6348.0	0.541115	0.498346	0.000000	0.000000	1.000000	1.000000	1.000000
white	6348.0	0.697385	0.459426	0.000000	0.000000	1.000000	1.000000	1.000000
black	6348.0	0.184625	0.388024	0.000000	0.000000	0.000000	0.000000	1.000000
hisp	6348.0	0.225425	0.417895	0.000000	0.000000	0.000000	0.000000	1.000000
shs	6348.0	0.062697	0.242436	0.000000	0.000000	0.000000	0.000000	1.000000
hsg	6348.0	0.307656	0.461560	0.000000	0.000000	0.000000	1.000000	1.000000
scl	6348.0	0.299779	0.458197	0.000000	0.000000	0.000000	1.000000	1.000000
clg	6348.0	0.239918	0.427067	0.000000	0.000000	0.000000	0.000000	1.000000
mw	6348.0	0.187146	0.390059	0.000000	0.000000	0.000000	0.000000	1.000000
so	6348.0	0.349716	0.476918	0.000000	0.000000	0.000000	1.000000	1.000000
we	6348.0	0.278198	0.448147	0.000000	0.000000	0.000000	1.000000	1.000000
union	6348.0	0.020006	0.140033	0.000000	0.000000	0.000000	0.000000	1.000000
vet	6348.0	0.030088	0.170844	0.000000	0.000000	0.000000	0.000000	1.000000
cent	6348.0	0.365942	0.481731	0.000000	0.000000	0.000000	1.000000	1.000000
ncent	6348.0	0.348929	0.476669	0.000000	0.000000	0.000000	1.000000	1.000000
fam1	6348.0	0.536232	0.498725	0.000000	0.000000	1.000000	1.000000	1.000000
fam2	6348.0	0.139256	0.346241	0.000000	0.000000	0.000000	0.000000	1.000000
fam3	6348.0	0.130277	0.336635	0.000000	0.000000	0.000000	0.000000	1.000000
child	6348.0	0.162571	0.369003	0.000000	0.000000	0.000000	0.000000	1.000000
fborn	6348.0	0.163989	0.370294	0.000000	0.000000	0.000000	0.000000	1.000000
cit	6348.0	0.907845	0.289267	0.000000	1.000000	1.000000	1.000000	1.000000
school	6348.0	0.148236	0.355361	0.000000	0.000000	0.000000	0.000000	1.000000
pens	6348.0	0.386421	0.486967	0.000000	0.000000	0.000000	1.000000	1.000000
fsize10	6348.0	0.109483	0.312269	0.000000	0.000000	0.000000	0.000000	1.000000
fsize100	6348.0	0.158633	0.365362	0.000000	0.000000	0.000000	0.000000	1.000000
health	6348.0	0.714713	0.451586	0.000000	0.000000	1.000000	1.000000	1.000000
age	6348.0	33.825142	11.485257	16.000000	25.000000	30.000000	40.000000	85.000000
exp1	6348.0	12.160996	11.535660	0.000000	3.000000	8.000000	19.000000	65.000000
exp2	6348.0	2.809403	4.602627	0.000000	0.090000	0.640000	3.610000	42.250000
exp3	6348.0	8.391547	20.015144	0.000000	0.027000	0.512000	6.859000	274.625000
exp4	6348.0	29.073584	97.415275	0.000000	0.008100	0.409600	13.032100	1785.062500
occ	6348.0	8466.476528	19570.421355	10.000000	2320.000000	4700.000000	6200.000000	100000.000000
occ2	6348.0	12.876812	6.688689	1.000000	8.000000	15.000000	17.000000	22.000000
ind	6348.0	7843.791273	11938.950959	170.000000	4950.000000	7380.000000	8270.000000	100000.000000
ind2	6348.0	13.232199	5.890789	1.000000	9.000000	14.000000	18.000000	23.000000

PREDICTING WAGES - 2¶

Importing the necessary libraries and overview of the dataset¶

Loading the data¶

Checking the info of the data¶

Univariate Analysis¶

Checking the summary statistics of the dataset¶

Now that we've completed our analysis, let's move on to modelling.¶

Linear Models¶

Non-Linear Models¶

Conclusion:¶