Case Study - Employee Attrition Prediction¶


Context¶


McCurr Health Consultancy is an MNC that has thousands of employees spread across the globe. The company believes in hiring the best talent available and retaining them for as long as possible. A huge amount of resources is spent on retaining existing employees through various initiatives. The Head of People Operations wants to bring down the cost of retaining employees. For this, he proposes limiting the incentives to only those employees who are at risk of attrition. As a recently hired Data Scientist in the People Operations Department, you have been asked to identify patterns in characteristics of employees who leave the organization. Also, you have to use this information to predict if an employee is at risk of attrition. This information will be used to target them with incentives.


Objective¶


  • To identify the different factors that drive attrition
  • To build a model to predict if an employee will attrite or not

Dataset Description¶


The data contains information on employees' demographic details, work-related metrics, and attrition flag.

  • EmployeeNumber - Unique Employee Identifier
  • Attrition - Did the employee attrite or not?
  • Age - Age of the employee
  • BusinessTravel - Travel commitments for the job
  • DailyRate - Data description not available
  • Department - Employee's Department
  • DistanceFromHome - Distance from work to home (in KM)
  • Education - Employee's Education. 1-Below College, 2-College, 3-Bachelor, 4-Master, 5-Doctor
  • EducationField - Field of Education
  • EnvironmentSatisfaction - 1-Low, 2-Medium, 3-High, 4-Very High
  • Gender - Employee's gender
  • HourlyRate - Data description not available
  • JobInvolvement - 1-Low, 2-Medium, 3-High, 4-Very High
  • JobLevel - Level of job (1 to 5)
  • JobRole - Job Roles
  • JobSatisfaction - 1-Low, 2-Medium, 3-High, 4-Very High
  • MaritalStatus - Marital Status
  • MonthlyIncome - Monthly Salary
  • MonthlyRate - Data description not available
  • NumCompaniesWorked - Number of companies worked at
  • Over18 - Whether the employee is over 18 years of age
  • OverTime - Whether the employee works overtime
  • PercentSalaryHike - The percentage increase in the salary last year
  • PerformanceRating - 1-Low, 2-Good, 3-Excellent, 4-Outstanding
  • RelationshipSatisfaction - 1-Low, 2-Medium, 3-High, 4-Very High
  • StandardHours - Standard Hours
  • StockOptionLevel - Stock Option Level
  • TotalWorkingYears - Total years worked
  • TrainingTimesLastYear - Number of training attended last year
  • WorkLifeBalance - 1-Low, 2-Good, 3-Excellent, 4-Outstanding
  • YearsAtCompany - Years at Company
  • YearsInCurrentRole - Years in the current role
  • YearsSinceLastPromotion - Years since the last promotion
  • YearsWithCurrManager - Years with the current manager

In the real world, you will not find definitions for some of your variables. It is part of the analysis to figure out what they might mean.

Importing the libraries and overview of the dataset¶

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# To scale the data using z-score
from sklearn.preprocessing import StandardScaler

# To split the data into train and test sets
from sklearn.model_selection import train_test_split

# Algorithms to use
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier

# Metrics to evaluate the model
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_curve, recall_score
from sklearn import tree

# For tuning the model
from sklearn.model_selection import GridSearchCV

# To ignore warnings
import warnings
warnings.filterwarnings("ignore")

# Connect collab
from google.colab import drive
drive.mount('/content/drive')
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

Loading the Dataset¶

In [3]:
# Loading the dataset
df = pd.read_excel('/content/drive/MyDrive/MIT - Data Sciences/Colab Notebooks/Week_Five_-_Classification_and_Hypothesis_Testing/Mentored_Learning_Session/HR_Employee_Attrition_Prediction_(Classification)/HR_Employee_Attrition_Dataset.xlsx')
In [4]:
df.head()
Out[4]:
EmployeeNumber Attrition Age BusinessTravel DailyRate Department DistanceFromHome Education EducationField EnvironmentSatisfaction ... RelationshipSatisfaction StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
0 1 Yes 41 Travel_Rarely 1102 Sales 1 2 Life Sciences 2 ... 1 80 0 8 0 1 6 4 0 5
1 2 No 49 Travel_Frequently 279 Research & Development 8 1 Life Sciences 3 ... 4 80 1 10 3 3 10 7 1 7
2 3 Yes 37 Travel_Rarely 1373 Research & Development 2 2 Other 4 ... 2 80 0 7 3 3 0 0 0 0
3 4 No 33 Travel_Frequently 1392 Research & Development 3 4 Life Sciences 4 ... 3 80 0 8 3 3 8 7 3 0
4 5 No 27 Travel_Rarely 591 Research & Development 2 1 Medical 1 ... 4 80 1 6 3 3 2 2 2 2

5 rows × 34 columns

Checking the info of the dataset¶

In [5]:
# Let us see the info of the data
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2940 entries, 0 to 2939
Data columns (total 34 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   EmployeeNumber            2940 non-null   int64 
 1   Attrition                 2940 non-null   object
 2   Age                       2940 non-null   int64 
 3   BusinessTravel            2940 non-null   object
 4   DailyRate                 2940 non-null   int64 
 5   Department                2940 non-null   object
 6   DistanceFromHome          2940 non-null   int64 
 7   Education                 2940 non-null   int64 
 8   EducationField            2940 non-null   object
 9   EnvironmentSatisfaction   2940 non-null   int64 
 10  Gender                    2940 non-null   object
 11  HourlyRate                2940 non-null   int64 
 12  JobInvolvement            2940 non-null   int64 
 13  JobLevel                  2940 non-null   int64 
 14  JobRole                   2940 non-null   object
 15  JobSatisfaction           2940 non-null   int64 
 16  MaritalStatus             2940 non-null   object
 17  MonthlyIncome             2940 non-null   int64 
 18  MonthlyRate               2940 non-null   int64 
 19  NumCompaniesWorked        2940 non-null   int64 
 20  Over18                    2940 non-null   object
 21  OverTime                  2940 non-null   object
 22  PercentSalaryHike         2940 non-null   int64 
 23  PerformanceRating         2940 non-null   int64 
 24  RelationshipSatisfaction  2940 non-null   int64 
 25  StandardHours             2940 non-null   int64 
 26  StockOptionLevel          2940 non-null   int64 
 27  TotalWorkingYears         2940 non-null   int64 
 28  TrainingTimesLastYear     2940 non-null   int64 
 29  WorkLifeBalance           2940 non-null   int64 
 30  YearsAtCompany            2940 non-null   int64 
 31  YearsInCurrentRole        2940 non-null   int64 
 32  YearsSinceLastPromotion   2940 non-null   int64 
 33  YearsWithCurrManager      2940 non-null   int64 
dtypes: int64(25), object(9)
memory usage: 781.1+ KB

Observations:

  • There are 2940 observations and 34 columns in the dataset.
  • All the columns have 2940 non-null values, i.e., there are no missing values in the data.

Let's check the unique values in each column

In [6]:
# Checking unique values in each column
df.nunique()
Out[6]:
0
EmployeeNumber 2940
Attrition 2
Age 43
BusinessTravel 3
DailyRate 886
Department 3
DistanceFromHome 29
Education 5
EducationField 6
EnvironmentSatisfaction 4
Gender 2
HourlyRate 71
JobInvolvement 4
JobLevel 5
JobRole 9
JobSatisfaction 4
MaritalStatus 3
MonthlyIncome 1349
MonthlyRate 1427
NumCompaniesWorked 10
Over18 1
OverTime 2
PercentSalaryHike 15
PerformanceRating 2
RelationshipSatisfaction 4
StandardHours 1
StockOptionLevel 4
TotalWorkingYears 40
TrainingTimesLastYear 7
WorkLifeBalance 4
YearsAtCompany 37
YearsInCurrentRole 19
YearsSinceLastPromotion 16
YearsWithCurrManager 18

Observations:

  • EmployeeNumber is an identifier that is unique for each employee. We can drop this column as it does not add any value to our analysis.
  • Over18 and StandardHours have only 1 unique value each. We can drop these columns as they will not add any value to our model.
  • Based on the number of unique values in each column and the data description, we can identify the continuous and categorical columns in the data.

Let's drop the columns mentioned above and define lists for numerical and categorical columns to explore them separately.

In [7]:
# Dropping the columns
df=df.drop(['EmployeeNumber','Over18','StandardHours'],axis=1)
In [8]:
# Creating numerical columns
num_cols=['DailyRate','Age','DistanceFromHome','MonthlyIncome','MonthlyRate','PercentSalaryHike','TotalWorkingYears',
          'YearsAtCompany','NumCompaniesWorked','HourlyRate',
          'YearsInCurrentRole','YearsSinceLastPromotion','YearsWithCurrManager','TrainingTimesLastYear']

# Creating categorical variables
cat_cols= ['Attrition','OverTime','BusinessTravel', 'Department','Education', 'EducationField','JobSatisfaction','EnvironmentSatisfaction','WorkLifeBalance',
           'StockOptionLevel','Gender', 'PerformanceRating', 'JobInvolvement','JobLevel', 'JobRole', 'MaritalStatus','RelationshipSatisfaction']

Exploratory Data Analysis and Data Preprocessing¶

Univariate analysis of numerical columns¶

In [9]:
# Checking summary statistics
df[num_cols].describe().T
Out[9]:
count mean std min 25% 50% 75% max
DailyRate 2940.0 802.485714 403.440447 102.0 465.0 802.0 1157.0 1499.0
Age 2940.0 36.923810 9.133819 18.0 30.0 36.0 43.0 60.0
DistanceFromHome 2940.0 9.192517 8.105485 1.0 2.0 7.0 14.0 29.0
MonthlyIncome 2940.0 6502.931293 4707.155770 1009.0 2911.0 4919.0 8380.0 19999.0
MonthlyRate 2940.0 14313.103401 7116.575021 2094.0 8045.0 14235.5 20462.0 26999.0
PercentSalaryHike 2940.0 15.209524 3.659315 11.0 12.0 14.0 18.0 25.0
TotalWorkingYears 2940.0 11.279592 7.779458 0.0 6.0 10.0 15.0 40.0
YearsAtCompany 2940.0 7.008163 6.125483 0.0 3.0 5.0 9.0 40.0
NumCompaniesWorked 2940.0 2.693197 2.497584 0.0 1.0 2.0 4.0 9.0
HourlyRate 2940.0 65.891156 20.325969 30.0 48.0 66.0 84.0 100.0
YearsInCurrentRole 2940.0 4.229252 3.622521 0.0 2.0 3.0 7.0 18.0
YearsSinceLastPromotion 2940.0 2.187755 3.221882 0.0 0.0 1.0 3.0 15.0
YearsWithCurrManager 2940.0 4.123129 3.567529 0.0 2.0 3.0 7.0 17.0
TrainingTimesLastYear 2940.0 2.799320 1.289051 0.0 2.0 3.0 3.0 6.0

Observations:

  • Average employee age is around 37 years. It has a high range, from 18 years to 60, indicating good age diversity in the organization.
  • At least 50% of the employees live within a 7 KM radius of the organization. However, there are some extreme values, given that the maximum value is 29 km.
  • The average monthly income of an employee is USD 6500. It has a high range of values from 1K-20K USD, which is to be expected for any organization's income distribution. There is a big difference between the 3rd quartile value (around USD 8400) and the maximum value (nearly USD 20000), showing that the company's highest earners have a disproportionately large income in comparison to the rest of the employees. Again, this is fairly common in most organizations.
  • The average salary hike of an employee is around 15%. At least 50% of employees got a salary hike of 14% or less, with the maximum salary hike being 25%.
  • The average number of years an employee is associated with the company is 7.
  • On average, the number of years since an employee's last promotion is ~2.19, while the median is 1, i.e., at least half of the employees were promoted within the last year.

Let's explore these variables in some more depth by observing their distributions

In [10]:
# Creating histograms
df[num_cols].hist(figsize=(14,14))
plt.show()
[Histograms of the numerical columns]

Observations:

  • The age distribution is close to a normal distribution with the majority of employees between the ages of 25 and 50.

  • The percentage salary hike is skewed to the right, implying that most employees receive relatively small salary increases.

  • MonthlyIncome and TotalWorkingYears are skewed to the right, indicating that the majority of workers are in entry / mid-level positions in the organization.

  • DistanceFromHome also has a right skewed distribution, meaning most employees live close to work but there are a few that live further away.

  • On average, an employee has worked at ~2.7 companies, though most employees have worked at only 1 company.

  • The YearsAtCompany variable distribution shows a good proportion of workers with 10+ years, indicating a significant number of loyal employees at the organization.

  • The YearsInCurrentRole distribution has three peaks at 0, 2, and 7. There are a few employees that have even stayed in the same role for 15 years and more.

  • The YearsSinceLastPromotion variable distribution indicates that some employees have not received a promotion in 10-15 years and are still working in the organization. These employees are assumed to be high work-experience employees in upper-management roles, such as co-founders, C-suite employees etc.

  • The distributions of DailyRate, HourlyRate and MonthlyRate appear to be uniform and do not provide much information. It could be that daily rate refers to the income earned per extra day worked while hourly rate could refer to the same concept applied for extra hours worked per day. Since these rates tend to be broadly similar for multiple employees in the same department, that explains the uniform distribution they show.

Univariate analysis for categorical variables¶

In [11]:
# Printing the % sub categories of each category
for i in cat_cols:
    print(df[i].value_counts(normalize=True))
    print('*'*40)
Attrition
No     0.838776
Yes    0.161224
Name: proportion, dtype: float64
****************************************
OverTime
No     0.717007
Yes    0.282993
Name: proportion, dtype: float64
****************************************
BusinessTravel
Travel_Rarely        0.709524
Travel_Frequently    0.188435
Non-Travel           0.102041
Name: proportion, dtype: float64
****************************************
Department
Research & Development    0.653741
Sales                     0.303401
Human Resources           0.042857
Name: proportion, dtype: float64
****************************************
Education
3    0.389116
4    0.270748
2    0.191837
1    0.115646
5    0.032653
Name: proportion, dtype: float64
****************************************
EducationField
Life Sciences       0.412245
Medical             0.315646
Marketing           0.108163
Technical Degree    0.089796
Other               0.055782
Human Resources     0.018367
Name: proportion, dtype: float64
****************************************
JobSatisfaction
4    0.312245
3    0.300680
1    0.196599
2    0.190476
Name: proportion, dtype: float64
****************************************
EnvironmentSatisfaction
3    0.308163
4    0.303401
2    0.195238
1    0.193197
Name: proportion, dtype: float64
****************************************
WorkLifeBalance
3    0.607483
2    0.234014
4    0.104082
1    0.054422
Name: proportion, dtype: float64
****************************************
StockOptionLevel
0    0.429252
1    0.405442
2    0.107483
3    0.057823
Name: proportion, dtype: float64
****************************************
Gender
Male      0.6
Female    0.4
Name: proportion, dtype: float64
****************************************
PerformanceRating
3    0.846259
4    0.153741
Name: proportion, dtype: float64
****************************************
JobInvolvement
3    0.590476
2    0.255102
4    0.097959
1    0.056463
Name: proportion, dtype: float64
****************************************
JobLevel
1    0.369388
2    0.363265
3    0.148299
4    0.072109
5    0.046939
Name: proportion, dtype: float64
****************************************
JobRole
Sales Executive              0.221769
Research Scientist           0.198639
Laboratory Technician        0.176190
Manufacturing Director       0.098639
Healthcare Representative    0.089116
Manager                      0.069388
Sales Representative         0.056463
Research Director            0.054422
Human Resources              0.035374
Name: proportion, dtype: float64
****************************************
MaritalStatus
Married     0.457823
Single      0.319728
Divorced    0.222449
Name: proportion, dtype: float64
****************************************
RelationshipSatisfaction
3    0.312245
4    0.293878
2    0.206122
1    0.187755
Name: proportion, dtype: float64
****************************************

Observations:

  • The employee attrition rate is 16%.
  • Around 28% of the employees are working overtime. This number appears to be on the higher side, and might indicate a stressed employee work-life.
  • 71% of the employees have traveled rarely, while around 19% have to travel frequently.
  • Around 73% of the employees come from an educational background in the Life Sciences and Medical fields.
  • Over 65% of employees work in the Research & Development department of the organization.
  • Nearly 40% of the employees have low (1) or medium-low (2) job satisfaction and environment satisfaction in the organization, indicating that the morale of the company appears to be somewhat low.
  • Over 30% of the employees show low (1) to medium-low (2) job involvement.
  • Over 80% of the employees either have no stock options or a low stock option level.
  • In terms of performance ratings, none of the employees have rated lower than 3 (excellent). About 85% of employees have a performance rating equal to 3 (excellent), while the remaining have a rating of 4 (outstanding). This could either mean that the majority of employees are top performers, or the more likely scenario is that the organization could be highly lenient with its performance appraisal process.

Bivariate and Multivariate analysis¶

We have analyzed different categorical and numerical variables. Let's now check how the attrition rate is related to other categorical variables.

In [12]:
for i in cat_cols:
    if i!='Attrition':
        (pd.crosstab(df[i],df['Attrition'],normalize='index')*100).plot(kind='bar',figsize=(8,4),stacked=True)
        plt.ylabel('Percentage Attrition %')
[Stacked bar charts of attrition percentage for each categorical variable]

Observations:

  • Employees working overtime have more than a 30% chance of attrition, which is very high compared to the 10% chance of attrition for employees who do not work extra hours.
  • As seen earlier, the majority of employees work in the R&D department, where the chance of attrition is ~15%.
  • Employees working as sales representatives have an attrition rate of around 40% while HRs and Technicians have an attrition rate of around 25%. The sales and HR departments have higher attrition rates in comparison to an academic department like Research & Development, an observation that makes intuitive sense keeping in mind the differences in those job profiles. The high-pressure and incentive-based nature of Sales and Marketing roles may be contributing to their higher attrition rates.
  • The lower the employee's job involvement, the higher their attrition chances appear to be, with 1-rated JobInvolvement employees attriting at 35%. The reason for this could be that employees with lower job involvement might feel left out or less valued and have already started to explore new options, leading to a higher attrition rate.
  • Employees at a lower job level also attrite more, with 1-rated JobLevel employees showing a nearly 25% chance of attrition. These may be young employees who tend to explore more options in the initial stages of their careers.
  • A low work-life balance rating is clearly associated with attrition: 30% of employees in the 1-rated category attrite.

Let's check the relationship between attrition and Numerical variables

In [13]:
# Mean of numerical variables grouped by attrition
df.groupby(['Attrition'])[num_cols].mean()
Out[13]:
DailyRate Age DistanceFromHome MonthlyIncome MonthlyRate PercentSalaryHike TotalWorkingYears YearsAtCompany NumCompaniesWorked HourlyRate YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager TrainingTimesLastYear
Attrition
No 812.504461 37.561233 8.915653 6832.739659 14265.779400 15.231144 11.862936 7.369019 2.645580 65.952149 4.484185 2.234388 4.367397 2.832928
Yes 750.362869 33.607595 10.632911 4787.092827 14559.308017 15.097046 8.244726 5.130802 2.940928 65.573840 2.902954 1.945148 2.852321 2.624473

Observations:

  • Employees who left the company had a nearly 30% lower average income and about 30% less work experience than those who stayed. These could be employees looking to explore new options and/or increase their salary with a company switch.
  • Employees who left also lived, on average, about 16% farther from the office than those who stayed. A longer commute means spending more time and money every day, which could contribute to job dissatisfaction and the desire to leave the organization.

We have found out what kind of employees are leaving the company.

Let's check the relationship between different numerical variables¶

In [14]:
# Plotting the correlation between numerical variables
plt.figure(figsize=(15,8))
sns.heatmap(df[num_cols].corr(),annot=True, fmt='0.2f', cmap='YlGnBu')
Out[14]:
<Axes: >
[Correlation heatmap of the numerical columns]

Observations:

  • Total work experience, monthly income, years at company, and years with current manager are highly correlated with each other and with employee age. This is easy to understand, as these variables tend to increase with age for most employees.
  • Years at company and years in current role are correlated with years since last promotion, which may indicate that the company is not giving promotions at the right time.
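The strongly correlated pairs called out above can also be extracted programmatically from the correlation matrix. A minimal sketch (using a hypothetical toy frame in place of df[num_cols]; the column values are invented for illustration):

```python
import pandas as pd
import numpy as np

# Hypothetical mini-frame standing in for df[num_cols]
toy = pd.DataFrame({
    "Age": [25, 30, 35, 40, 45],
    "TotalWorkingYears": [2, 7, 12, 17, 22],
    "DailyRate": [1100, 300, 900, 500, 700],
})

corr = toy.corr()

# Keep only the upper triangle (each pair once), then stack into
# (pair, correlation) rows sorted by absolute correlation
upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
pairs = upper.stack().sort_values(key=abs, ascending=False)
print(pairs.head(1))
```

Applied to the real df[num_cols], the same pattern would surface the Age/TotalWorkingYears-style pairs seen in the heatmap.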

Now that we have explored the data, let's build the model.

Model Building - Approach¶

  1. Data preparation.
  2. Partition the data into a train and test set.
  3. Build a model on the train data.
  4. Tune the model if required.
  5. Test the data on the test set.

Data preparation¶

Creating dummy variables for categorical Variables

In [15]:
# Creating list of dummy columns
to_get_dummies_for = ['BusinessTravel', 'Department','Education', 'EducationField','EnvironmentSatisfaction', 'Gender',  'JobInvolvement','JobLevel', 'JobRole', 'MaritalStatus' ]

# Creating dummy variables
df = pd.get_dummies(data = df, columns = to_get_dummies_for, drop_first = True)

# Mapping overtime and attrition
dict_OverTime = {'Yes': 1, 'No':0}
dict_attrition = {'Yes': 1, 'No': 0}


df['OverTime'] = df.OverTime.map(dict_OverTime)
df['Attrition'] = df.Attrition.map(dict_attrition)

Separating the independent variables (X) and the dependent variable (Y)

In [16]:
# Separating target variable and other variables

Y= df.Attrition
X= df.drop(columns = ['Attrition'])

Splitting the data into 70% train and 30% test set

Some classification problems can exhibit a large imbalance in the distribution of the target classes: for instance there could be several times more negative samples than positive samples. In such cases it is recommended to use the stratified sampling technique to ensure that relative class frequencies are approximately preserved in each train and validation fold.
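As a quick illustration of why stratification matters, the following sketch (with hypothetical toy data mirroring the ~16% attrition rate) shows that the class ratio is preserved in both splits:

```python
import pandas as pd
from sklearn.model_selection import train_test_split

# Hypothetical imbalanced target: ~16% positives, as with attrition
y = pd.Series([1] * 16 + [0] * 84)
X_toy = pd.DataFrame({"feature": range(100)})

X_tr, X_te, y_tr, y_te = train_test_split(
    X_toy, y, test_size=0.3, random_state=1, stratify=y
)

# Both splits retain roughly a 16% positive rate
print(round(y_tr.mean(), 2), round(y_te.mean(), 2))
```

Without `stratify=y`, a random split of a small minority class can leave the test set with a noticeably different positive rate.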

In [17]:
# Splitting the data
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.3, random_state = 1, stratify = Y)

Scaling the data¶

The independent variables in this dataset have different scales. When features have different scales from each other, there is a chance that a higher weightage will be given to features that have a higher magnitude, and they will dominate over other features whose magnitude changes may be smaller but whose percentage changes may be just as significant or even larger. This will impact the performance of our machine learning algorithm, and we do not want our algorithm to be biased towards one feature.

The solution to this issue is Feature Scaling, i.e. scaling the dataset so as to give every transformed variable a comparable scale.

Tree-based models such as Decision Trees and Random Forests do not require feature scaling, as they are not sensitive to the scale of the features.

We will scale the data for Logistic Regression and SVM. We will use the Standard Scaler method, which centers and scales the dataset using the Z-Score. It standardizes features by subtracting the mean and scaling it to have unit variance. The standard score of sample x is calculated as:

z = (x - u) / s

where u is the mean of the training samples and s is the standard deviation of the training samples.
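As a sanity check, the z-score formula above can be computed by hand and compared against StandardScaler (a minimal sketch; the toy values are assumed for illustration):

```python
import numpy as np
from sklearn.preprocessing import StandardScaler

# Toy feature with an arbitrary scale
x = np.array([[1.0], [3.0], [5.0], [7.0]])

# Manual z-score: subtract the mean, divide by the (population) std
z_manual = (x - x.mean()) / x.std()

# StandardScaler performs the same computation
z_scaler = StandardScaler().fit_transform(x)

print(np.allclose(z_manual, z_scaler))
```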

In [18]:
# Scaling the data
sc=StandardScaler()

# Fit_transform on train data
X_train_scaled=sc.fit_transform(X_train)
X_train_scaled=pd.DataFrame(X_train_scaled, columns=X.columns)

# Transform on test data
X_test_scaled=sc.transform(X_test)
X_test_scaled=pd.DataFrame(X_test_scaled, columns=X.columns)

Model evaluation criterion¶

The model can make two types of wrong predictions:

  1. Predicting an employee will attrite when the employee doesn't attrite
  2. Predicting an employee will not attrite when the employee actually attrites

Which case is more important?

  • Predicting that the employee will not attrite but the employee attrites, i.e., losing out on a valuable employee or asset. This would be considered a major miss for any employee attrition predictor and hence the more important case of wrong predictions.

How do we reduce this loss, i.e., reduce the number of False Negatives?

  • The company would want the Recall to be maximized, the greater the Recall, the higher the chances of minimizing false negatives. Hence, the focus should be on increasing the Recall (minimizing the false negatives) or, in other words, identifying the true positives (i.e. Class 1) very well, so that the company can provide incentives to control the attrition rate especially, for top-performers. This would help in optimizing the overall project cost towards retaining the best talent.
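The relationship between recall and false negatives can be made concrete with a small sketch (hypothetical labels, not drawn from the dataset):

```python
from sklearn.metrics import recall_score, confusion_matrix

# Hypothetical labels: 1 = attrite, 0 = stay
y_true = [1, 1, 1, 1, 0, 0, 0, 0]
y_pred = [1, 1, 0, 0, 0, 0, 0, 1]

tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()

# Recall = TP / (TP + FN): the share of actual attriters the model catches.
# Every false negative (a missed attriter) lowers recall directly.
print(tp / (tp + fn), recall_score(y_true, y_pred))  # 0.5 0.5
```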

Also, let's create a function to calculate and print the classification report and confusion matrix so that we don't have to rewrite the same code repeatedly for each model.

In [19]:
# Creating metric function
def metrics_score(actual, predicted):
    print(classification_report(actual, predicted))

    cm = confusion_matrix(actual, predicted)
    plt.figure(figsize=(8,5))

    sns.heatmap(cm, annot=True,  fmt='.2f', xticklabels=['Not Attrite', 'Attrite'], yticklabels=['Not Attrite', 'Attrite'])
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.show()

Building the model¶

We will be building 4 different models:

  • Logistic Regression
  • Support Vector Machine (SVM)
  • Decision Tree
  • Random Forest

Logistic Regression Model¶

  • Logistic Regression is a supervised learning algorithm used for binary classification problems, i.e., where the dependent variable is categorical and has only two possible values. In logistic regression, we use the sigmoid function to calculate the probability of an event y, given some features x, as:

                                        P(y) = 1 / (1 + exp(-x))
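A quick numeric check of the sigmoid (a minimal sketch, not part of the original notebook): x = 0 maps to a probability of 0.5, and large positive or negative x push the probability toward 1 or 0.

```python
import numpy as np

def sigmoid(x):
    # P(y) = 1 / (1 + exp(-x))
    return 1.0 / (1.0 + np.exp(-x))

print(sigmoid(0), round(sigmoid(4), 3), round(sigmoid(-4), 3))  # 0.5 0.982 0.018
```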
In [20]:
# Fitting logistic regression model

lg=LogisticRegression()
lg.fit(X_train_scaled,y_train)
Out[20]:
LogisticRegression()

Let's check the model performance

In [21]:
# Checking the performance on the training data
y_pred_train = lg.predict(X_train_scaled)

metrics_score(y_train, y_pred_train)
              precision    recall  f1-score   support

           0       0.91      0.98      0.94      1726
           1       0.81      0.50      0.62       332

    accuracy                           0.90      2058
   macro avg       0.86      0.74      0.78      2058
weighted avg       0.89      0.90      0.89      2058

[Confusion matrix heatmap for the training data]
  • The reported average includes the macro average which averages the unweighted mean per label, and the weighted average i.e. averaging the support-weighted mean per label.
  • In classification, the class of interest is considered the positive class. Here, the class of interest is 1 i.e. identifying the employees at risk of attrition.
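The macro vs. weighted averages in the report above can be reproduced by hand. A small sketch with hypothetical imbalanced labels (class 1 the minority, as with attrition):

```python
import numpy as np
from sklearn.metrics import precision_recall_fscore_support

# Hypothetical labels: class 1 is the minority
y_true = [0, 0, 0, 0, 0, 0, 1, 1]
y_pred = [0, 0, 0, 0, 0, 1, 1, 0]

prec, rec, f1, support = precision_recall_fscore_support(y_true, y_pred)

# Macro recall: unweighted mean of the per-class recalls
macro = rec.mean()

# Weighted recall: per-class recalls weighted by class support
weighted = np.average(rec, weights=support)

print(rec, macro, weighted)
```

With imbalanced classes the weighted average is pulled toward the majority class, which is why the macro average is the more honest summary for the minority "attrite" class.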

Reading the confusion matrix (clockwise):

  • True Negative (Actual=0, Predicted=0): Model predicts that an employee would not attrite and the employee does not attrite

  • False Positive (Actual=0, Predicted=1): Model predicts that an employee would attrite but the employee does not attrite

  • False Negative (Actual=1, Predicted=0): Model predicts that an employee would not attrite but the employee attrites

  • True Positive (Actual=1, Predicted=1): Model predicts that an employee would attrite and the employee actually attrites

In [22]:
# Checking the performance on the test dataset
y_pred_test = lg.predict(X_test_scaled)

metrics_score(y_test, y_pred_test)
              precision    recall  f1-score   support

           0       0.90      0.97      0.93       740
           1       0.73      0.46      0.57       142

    accuracy                           0.89       882
   macro avg       0.82      0.72      0.75       882
weighted avg       0.88      0.89      0.88       882

[Confusion matrix heatmap for the test data]

Observations:

  • We are getting an accuracy of around 90% on both the train and test datasets.
  • However, the recall for this model is only around 50% for class 1 on the train set and 46% on the test set.
  • As the recall is low, this model will not perform well in identifying employees who have a high chance of leaving the company, meaning it will ultimately not help in reducing the attrition rate.
  • As we can see from the confusion matrix, this model fails to identify the majority of employees who are at risk of attrition.

Let's check the coefficients to find which variables lead to attrition and which can help reduce it.

In [23]:
# Printing the coefficients of logistic regression
cols=X.columns

coef_lg=lg.coef_

pd.DataFrame(coef_lg,columns=cols).T.sort_values(by = 0,ascending = False)
Out[23]:
0
OverTime 0.959813
BusinessTravel_Travel_Frequently 0.715594
MaritalStatus_Single 0.612941
YearsSinceLastPromotion 0.548358
YearsAtCompany 0.518351
NumCompaniesWorked 0.499555
Department_Research & Development 0.443697
BusinessTravel_Travel_Rarely 0.440205
Department_Sales 0.436376
JobRole_Sales Executive 0.397936
DistanceFromHome 0.385084
MaritalStatus_Married 0.286715
JobLevel_5 0.279558
JobRole_Human Resources 0.264813
JobRole_Laboratory Technician 0.187027
JobRole_Sales Representative 0.185754
Gender_Male 0.165196
Education_3 0.158683
Education_2 0.130788
JobRole_Manufacturing Director 0.113799
Education_4 0.113649
Education_5 0.092047
EducationField_Technical Degree 0.074374
MonthlyRate 0.059012
HourlyRate 0.048250
JobLevel_3 0.015339
EducationField_Marketing -0.023921
JobRole_Manager -0.030842
PerformanceRating -0.033681
PercentSalaryHike -0.073709
DailyRate -0.095693
StockOptionLevel -0.111651
EducationField_Other -0.141616
JobLevel_4 -0.150793
WorkLifeBalance -0.212193
JobRole_Research Scientist -0.224776
TrainingTimesLastYear -0.242203
Age -0.272715
RelationshipSatisfaction -0.312702
EducationField_Life Sciences -0.334055
JobRole_Research Director -0.349921
EducationField_Medical -0.367708
JobSatisfaction -0.374684
YearsWithCurrManager -0.382831
YearsInCurrentRole -0.429374
EnvironmentSatisfaction_2 -0.450471
JobInvolvement_2 -0.478873
EnvironmentSatisfaction_3 -0.496994
TotalWorkingYears -0.504693
MonthlyIncome -0.602496
EnvironmentSatisfaction_4 -0.650321
JobInvolvement_4 -0.652201
JobLevel_2 -0.705429
JobInvolvement_3 -0.746196

Observations:

Features which positively affect the attrition rate are:

  • OverTime
  • BusinessTravel_Travel_Frequently
  • Department_Research & Development
  • JobRole_Sales Executive
  • MaritalStatus_Single
  • Department_Sales
  • NumCompaniesWorked
  • YearsSinceLastPromotion
  • JobLevel_5
  • BusinessTravel_Travel_Rarely
  • DistanceFromHome
  • YearsAtCompany
  • JobRole_Human Resources
  • JobRole_Sales Representative

Features which negatively affect the attrition rate are:

  • MonthlyIncome
  • JobInvolvement_3
  • JobLevel_2
  • EnvironmentSatisfaction_4
  • JobInvolvement_4
  • JobInvolvement_2
  • EnvironmentSatisfaction_3
  • EducationField_Life Sciences
  • EnvironmentSatisfaction_2
  • YearsWithCurrManager
  • JobRole_Research Director
  • TotalWorkingYears
  • JobSatisfaction

Observations:

  • Based on the Logistic Regression model, Overtime is the most important feature in detecting whether an employee would attrite or not.
  • This model also suggests that attrition depends on the employee's department. Belonging to Sales or HR is shown to have a higher attrition rate, which is understandable, but the model also suggests that belonging to R&D contributes to higher attrition, which is counter-intuitive. This could be because more than 65% of the employees work in R&D, so the absolute number of attriting employees from R&D is significant even at a lower percentage. This resembles Simpson's paradox, and is evidence that a more powerful non-linear model may be necessary to accurately map the relationship between Department_Research & Development and the target variable.
  • Business travel is an important variable in predicting the attrition rate. Employees who either travel a lot or travel rarely both show a high attrition rate. Those who travel often might feel overworked and dissatisfied with their role, whereas traveling rarely (in an organization where nearly 90% of employees travel) could be a sign of feeling undervalued and disengaged, and hence attriting more.
  • The number of companies the employee has worked for in the past also appears to impact the likelihood of attrition: the greater the number, the higher the chance the employee will attrite. This suggests that employees who have worked for many companies may not stay loyal and may continue switching companies.
  • Other features which appear to affect the chances of attrition are the number of years at the current company and the distance from home, both with positive correlations to attrition likelihood.
  • The Job Involvement features being negatively correlated with attrition signify that employees who are more involved in their jobs tend to attrite less. This could probably be because a high degree of job involvement might make employees feel they are more important to the company, and hence discourage them from attrition.
  • The model also captures the inverse relation between income and attrition - suggesting attrition rates can be controlled by increasing employee salary.
  • Employees who are satisfied with the environment and culture of the organization show a lower chance of attrition, a conclusion that makes sense since a good work environment is likely to keep employees happy and prevent them from attriting.
  • Employees with higher total work experience and a good position in the organization are also less likely to attrite, probably because working at the organization for several years and/or occupying a good position tends to promote job stability and discourages volatility.

The coefficients of the logistic regression model give us the log of odds, which is hard to interpret in the real world. We can convert the log of odds into real odds by taking its exponential.

In [24]:
# Finding the odds
odds = np.exp(lg.coef_[0])

# Adding the odds to a dataframe and sorting the values
pd.DataFrame(odds, X_train_scaled.columns, columns = ['odds']).sort_values(by ='odds', ascending = False)
Out[24]:
odds
OverTime 2.611209
BusinessTravel_Travel_Frequently 2.045400
MaritalStatus_Single 1.845852
YearsSinceLastPromotion 1.730410
YearsAtCompany 1.679255
NumCompaniesWorked 1.647989
Department_Research & Development 1.558459
BusinessTravel_Travel_Rarely 1.553026
Department_Sales 1.547090
JobRole_Sales Executive 1.488748
DistanceFromHome 1.469737
MaritalStatus_Married 1.332045
JobLevel_5 1.322545
JobRole_Human Resources 1.303188
JobRole_Laboratory Technician 1.205660
JobRole_Sales Representative 1.204126
Gender_Male 1.179624
Education_3 1.171966
Education_2 1.139726
JobRole_Manufacturing Director 1.120527
Education_4 1.120359
Education_5 1.096416
EducationField_Technical Degree 1.077210
MonthlyRate 1.060788
HourlyRate 1.049433
JobLevel_3 1.015457
EducationField_Marketing 0.976363
JobRole_Manager 0.969629
PerformanceRating 0.966880
PercentSalaryHike 0.928942
DailyRate 0.908743
StockOptionLevel 0.894356
EducationField_Other 0.867955
JobLevel_4 0.860025
WorkLifeBalance 0.808809
JobRole_Research Scientist 0.798695
TrainingTimesLastYear 0.784896
Age 0.761310
RelationshipSatisfaction 0.731468
EducationField_Life Sciences 0.716014
JobRole_Research Director 0.704744
EducationField_Medical 0.692320
JobSatisfaction 0.687506
YearsWithCurrManager 0.681928
YearsInCurrentRole 0.650916
EnvironmentSatisfaction_2 0.637328
JobInvolvement_2 0.619481
EnvironmentSatisfaction_3 0.608357
TotalWorkingYears 0.603691
MonthlyIncome 0.547444
EnvironmentSatisfaction_4 0.521878
JobInvolvement_4 0.520898
JobLevel_2 0.493897
JobInvolvement_3 0.474167

Observations

  • The odds of an employee working overtime to attrite are 2.6 times the odds of one who is not, probably due to the fact that working overtime is not sustainable for an extended duration for any employee, and may lead to burnout and job dissatisfaction.
  • The odds of an employee traveling frequently to attrite are double the odds of an employee who doesn't travel as often.
  • The odds of single employees attriting are 1.8 times (80% higher than) the odds of an employee with another marital status.
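A quick sketch of how to read these odds ratios, using the OverTime coefficient from the table above; the baseline odds value here is a made-up illustration, not an output of the model:

```python
import numpy as np

coef_overtime = 0.959813             # log-odds coefficient for OverTime
odds_ratio = np.exp(coef_overtime)   # ~2.61: odds multiply by about 2.6

# Converting odds to probability: p = odds / (1 + odds).
# The baseline odds below are hypothetical, for illustration only.
baseline_odds = 0.2
odds_with_overtime = baseline_odds * odds_ratio
p_with_overtime = odds_with_overtime / (1 + odds_with_overtime)
```

So a feature with an odds ratio above 1 pushes the attrition probability up, and one below 1 pushes it down.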

Precision-Recall Curve for logistic regression¶

Precision-Recall curves summarize the trade-off between recall (the true positive rate) and precision (the positive predictive value) for a predictive model across different probability thresholds.

In [25]:
# Predict_proba gives the probability of each observation belonging to each class
y_scores_lg=lg.predict_proba(X_train_scaled)

precisions_lg, recalls_lg, thresholds_lg = precision_recall_curve(y_train, y_scores_lg[:,1])

# Plot values of precisions, recalls, and thresholds
plt.figure(figsize=(10,7))
plt.plot(thresholds_lg, precisions_lg[:-1], 'b--', label='precision')
plt.plot(thresholds_lg, recalls_lg[:-1], 'g--', label = 'recall')
plt.xlabel('Threshold')
plt.legend(loc='upper left')
plt.ylim([0,1])
plt.show()
(Image: precision and recall vs. threshold for logistic regression)

Observation:

  • We can see that precision and recall are balanced at a threshold of about 0.35.
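The crossover can also be located programmatically instead of being read off the plot; a sketch on synthetic data (the dataset below is illustrative, standing in for the notebook's scaled training set):

```python
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_curve

# Illustrative imbalanced data standing in for X_train_scaled / y_train
X, y = make_classification(n_samples=500, weights=[0.84], random_state=1)
probs = LogisticRegression().fit(X, y).predict_proba(X)[:, 1]

prec, rec, thr = precision_recall_curve(y, probs)
# precision/recall arrays have one more entry than thresholds, so drop the last
balanced_threshold = thr[np.argmin(np.abs(prec[:-1] - rec[:-1]))]
```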

Let's find out the performance of the model at this threshold

In [26]:
optimal_threshold=.35
y_pred_train = lg.predict_proba(X_train_scaled)

metrics_score(y_train, y_pred_train[:,1]>optimal_threshold)
              precision    recall  f1-score   support

           0       0.93      0.93      0.93      1726
           1       0.65      0.64      0.64       332

    accuracy                           0.89      2058
   macro avg       0.79      0.79      0.79      2058
weighted avg       0.89      0.89      0.89      2058

(Image: confusion matrix on the train set at the 0.35 threshold)

Observations

  • The model performance has improved. The recall has increased significantly for class 1.
  • Let's check the performance on the test data.
In [27]:
optimal_threshold=.35
y_pred_test = lg.predict_proba(X_test_scaled)

metrics_score(y_test, y_pred_test[:,1]>optimal_threshold)
              precision    recall  f1-score   support

           0       0.93      0.93      0.93       740
           1       0.62      0.63      0.63       142

    accuracy                           0.88       882
   macro avg       0.78      0.78      0.78       882
weighted avg       0.88      0.88      0.88       882

(Image: confusion matrix on the test set at the 0.35 threshold)

Observation:

  • The model gives similar performance on the train and test data, i.e., it generalizes well.
  • The recall on the test data has increased significantly while the precision has decreased, which is to be expected when lowering the threshold.
  • The average recall and precision for the model are good, but let's see if we can get better performance using other algorithms.

Support Vector Machines¶

Let's build models using two widely used kernel functions:

  1. Linear Kernel
  2. RBF Kernel

Linear Kernel¶

In [28]:
# Fitting SVM
svm = SVC(kernel='linear') # Linear kernel, i.e., a linear decision boundary
model = svm.fit(X= X_train_scaled, y = y_train)
In [29]:
y_pred_train_svm = model.predict(X_train_scaled)

metrics_score(y_train, y_pred_train_svm)
              precision    recall  f1-score   support

           0       0.91      0.98      0.94      1726
           1       0.81      0.50      0.62       332

    accuracy                           0.90      2058
   macro avg       0.86      0.74      0.78      2058
weighted avg       0.89      0.90      0.89      2058

(Image: confusion matrix for the linear-kernel SVM on the train set)
In [30]:
# Checking performance on the test data
y_pred_test_svm = model.predict(X_test_scaled)

metrics_score(y_test, y_pred_test_svm)
              precision    recall  f1-score   support

           0       0.90      0.97      0.94       740
           1       0.76      0.44      0.56       142

    accuracy                           0.89       882
   macro avg       0.83      0.71      0.75       882
weighted avg       0.88      0.89      0.88       882

(Image: confusion matrix for the linear-kernel SVM on the test set)
  • The SVM model with a linear kernel is not overfitting, as the accuracy is around 90% for both the train and test datasets
  • However, the recall for class 1 is only around 50%, implying that the model will miss many of the employees who are at risk of attrition

RBF Kernel¶

In [31]:
svm_rbf=SVC(kernel='rbf',probability=True)
# Fit the model
svm_rbf.fit(X_train_scaled,y_train)
# Predict on train data
y_scores_svm=svm_rbf.predict_proba(X_train_scaled)

precisions_svm, recalls_svm, thresholds_svm = precision_recall_curve(y_train, y_scores_svm[:,1])

# Plot values of precisions, recalls, and thresholds
plt.figure(figsize=(10,7))
plt.plot(thresholds_svm, precisions_svm[:-1], 'b--', label='precision')
plt.plot(thresholds_svm, recalls_svm[:-1], 'g--', label = 'recall')
plt.xlabel('Threshold')
plt.legend(loc='upper left')
plt.ylim([0,1])
plt.show()
(Image: precision and recall vs. threshold for the RBF-kernel SVM)
In [32]:
optimal_threshold_svm=.35
y_pred_train = svm_rbf.predict_proba(X_train_scaled)

metrics_score(y_train, y_pred_train[:,1]>optimal_threshold_svm)
              precision    recall  f1-score   support

           0       0.98      0.99      0.98      1726
           1       0.93      0.89      0.91       332

    accuracy                           0.97      2058
   macro avg       0.95      0.94      0.95      2058
weighted avg       0.97      0.97      0.97      2058

(Image: confusion matrix for the RBF-kernel SVM on the train set at the 0.35 threshold)
In [33]:
optimal_threshold_svm=.35
y_pred_test = svm_rbf.predict_proba(X_test_scaled)
metrics_score(y_test, y_pred_test[:,1]>optimal_threshold_svm)
              precision    recall  f1-score   support

           0       0.96      0.95      0.95       740
           1       0.74      0.80      0.77       142

    accuracy                           0.92       882
   macro avg       0.85      0.87      0.86       882
weighted avg       0.93      0.92      0.92       882

(Image: confusion matrix for the RBF-kernel SVM on the test set at the 0.35 threshold)
  • At the chosen threshold of 0.35, the model performance has improved significantly. The test recall for class 1 has risen from 0.44 (linear kernel) to 0.80, nearly double, and the model gives good generalized results.
  • Moreover, the RBF kernel is non-linear, so the model performs well with a non-linear decision boundary.
  • As the recall is high, this model will do well at singling out employees who have a high chance of leaving the company, which will ultimately help in reducing the attrition rate.

Decision Tree¶

  • We will build our model using the DecisionTreeClassifier function.
  • If the frequency of class A is 17% and the frequency of class B is 83%, then class B becomes the dominant class and the decision tree becomes biased toward it.
  • class_weight is a hyperparameter of the decision tree classifier. Here we can pass the dictionary {0: 0.17, 1: 0.83} to the model to specify the weight of each class, so that the decision tree gives more weight to class 1.
In [34]:
# Building decision tree model
dt = DecisionTreeClassifier(class_weight = {0: 0.17, 1: 0.83}, random_state = 1)
In [35]:
# Fitting decision tree model
dt.fit(X_train, y_train)
Out[35]:
DecisionTreeClassifier(class_weight={0: 0.17, 1: 0.83}, random_state=1)

Let's check the model performance of the decision tree

In [36]:
# Checking performance on the training dataset
y_train_pred_dt = dt.predict(X_train)

metrics_score(y_train, y_train_pred_dt)
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1726
           1       1.00      1.00      1.00       332

    accuracy                           1.00      2058
   macro avg       1.00      1.00      1.00      2058
weighted avg       1.00      1.00      1.00      2058

(Image: confusion matrix for the decision tree on the train set)

Observation:

  • The Decision tree is giving a 100% score for all metrics on the training dataset.
In [37]:
# Checking performance on the test dataset
y_test_pred_dt = dt.predict(X_test)

metrics_score(y_test, y_test_pred_dt)
              precision    recall  f1-score   support

           0       0.96      0.94      0.95       740
           1       0.73      0.80      0.77       142

    accuracy                           0.92       882
   macro avg       0.85      0.87      0.86       882
weighted avg       0.92      0.92      0.92       882

(Image: confusion matrix for the decision tree on the test set)

Observations:

  • The Decision Tree works well on the training data but not so well on the test data as the recall is 0.80 in comparison to 1 for the training dataset, i.e., the Decision Tree is overfitting the training data.
  • The precision on the test data suggests that there's a 27% (1 - 0.73) chance that the model will predict that a person is going to leave even though he/she would not, and the company may waste their time and energy on these employees who are not at risk of attrition.

Let's plot the feature importance and check the most important features.

In [38]:
# Plot the feature importance

importances = dt.feature_importances_
columns = X.columns
importance_df = pd.DataFrame(importances, index = columns, columns = ['Importance']).sort_values(by = 'Importance', ascending = False)
plt.figure(figsize = (13, 13))
sns.barplot(data = importance_df, x = importance_df.Importance, y = importance_df.index)
Out[38]:
<Axes: xlabel='Importance', ylabel='None'>
(Image: decision tree feature importances)

Observations:

  • According to the Decision Tree, Overtime is the most important feature, followed by Monthly income, Age, total working years, and MonthlyRate.
  • This might signify that people who are at risk of attrition have low income, are doing overtime and have less experience.
  • The other important features are DailyRate, NumCompaniesWorked, JobRole_Sales Executive, and PercentSalaryHike.

Let's plot the tree and check:

The decision tree keeps growing until its nodes are homogeneous, i.e., each contains only one class, and this dataset has many features, so the full tree would be hard to visualize. Therefore, we only visualize the tree up to max_depth = 4.

In [39]:
features = list(X.columns)

plt.figure(figsize = (30, 20))

tree.plot_tree(dt, max_depth = 4, feature_names = features, filled = True, fontsize = 12, node_ids = True, class_names = True)

plt.show()
(Image: decision tree visualized up to depth 4)

Note:¶

Blue leaves represent attrition, i.e., y[1], and orange leaves represent non-attrition, i.e., y[0]. The purer the class distribution in a node, the darker its color.

Random Forest¶

  • Random Forest is a bagging algorithm whose base models are Decision Trees. Bootstrap samples are drawn from the training data, and a decision tree is fit on each sample.
  • The results from all the decision trees are combined, and the final prediction is made by voting (for classification problems) or averaging (for regression problems).
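The voting described above can be made visible by querying each fitted tree individually; a sketch on synthetic data (note that scikit-learn's RandomForestClassifier actually averages the trees' predicted probabilities, which usually, but not always, coincides with a hard majority vote):

```python
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

# Illustrative data, not the case-study dataset
X, y = make_classification(n_samples=300, random_state=1)
rf = RandomForestClassifier(n_estimators=25, random_state=1).fit(X, y)

# One row of hard 0/1 predictions per base decision tree
per_tree = np.stack([est.predict(X) for est in rf.estimators_])
# Majority vote: class 1 wins when at least half the trees predict it
majority_vote = (per_tree.mean(axis=0) >= 0.5).astype(int)
```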
In [40]:
# Fitting the Random Forest classifier on the training data
rf_estimator = RandomForestClassifier(class_weight = {0: 0.17, 1: 0.83}, random_state = 1)

rf_estimator.fit(X_train, y_train)
Out[40]:
RandomForestClassifier(class_weight={0: 0.17, 1: 0.83}, random_state=1)
In [41]:
# Checking performance on the training data
y_pred_train_rf = rf_estimator.predict(X_train)

metrics_score(y_train, y_pred_train_rf)
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1726
           1       1.00      1.00      1.00       332

    accuracy                           1.00      2058
   macro avg       1.00      1.00      1.00      2058
weighted avg       1.00      1.00      1.00      2058

(Image: confusion matrix for the random forest on the train set)

Observation:

  • For all the metrics in the training dataset, the Random Forest gives a 100% score.
In [42]:
# Checking performance on the testing data
y_pred_test_rf = rf_estimator.predict(X_test)

metrics_score(y_test, y_pred_test_rf)
              precision    recall  f1-score   support

           0       0.96      0.99      0.98       740
           1       0.95      0.79      0.86       142

    accuracy                           0.96       882
   macro avg       0.95      0.89      0.92       882
weighted avg       0.96      0.96      0.96       882

(Image: confusion matrix for the random forest on the test set)

Observations:

  • The Random Forest classifier seems to be overfitting the training data. The recall on the training data is 1, while the recall on the test data is only ~ 0.80 for class 1.
  • Precision is high for the test data as well.

Let's check the feature importance of the Random Forest

In [43]:
importances = rf_estimator.feature_importances_
columns = X.columns
importance_df = pd.DataFrame(importances, index = columns, columns = ['Importance']).sort_values(by = 'Importance', ascending = False)
plt.figure(figsize = (13, 13))
sns.barplot(data = importance_df, x = importance_df.Importance, y = importance_df.index)
Out[43]:
<Axes: xlabel='Importance', ylabel='None'>
(Image: random forest feature importances)

Observations:

  • The Random Forest largely corroborates the Decision Tree: the most important features include MonthlyIncome, Age, and OverTime.
  • This might signify that employees at risk of attrition are doing overtime without being compensated accordingly; these might be mostly junior- and mid-level employees with less experience.
  • Distance from home is also a key feature, probably because employees living far from the office have long commutes that make their schedules hectic.
  • Not having stock options is also a driver of attrition; this feature shows good importance in both the decision tree and random forest models. Combined with the additional burden of a lower salary and working overtime, junior-level employees without stock options could be attriting more.
  • Other features like the number of companies worked for and percent salary hike are also intuitive in explaining attrition likelihood: employees who have worked for many companies may not stay loyal to the current organization, while an insufficient salary hike can demotivate an employee and raise their likelihood of attriting.
  • Features such as job satisfaction, environment satisfaction, and job level also play a crucial role in predicting whether an employee will attrite.

Conclusions:¶

  • We have tried multiple models and were able to identify the key factors involved with high attrition in the organization.
  • The SVM with RBF kernel has the best recall among all the models. The Random Forest has lower recall than the SVM, but better F1-score, accuracy, and precision. It may be possible to tune these models further, and the HR department can use such a model to predict whether an employee is at risk of attrition.

Recommendations:¶

  • We saw that working overtime is the most important driver of attrition. The organization should manage their work more efficiently so that employees don't have to work overtime and can manage to have a work-life balance, or failing this, the company could provide some additional incentives to employees who are working overtime in order to retain them.
  • The organization should focus on the employees who are working in sales and marketing as the attrition rate is quite high for these departments. Perhaps the organization could look into their incentive schemes and try to come up with better ideas to retain these employees.
  • As observed earlier, the organization has a lower percentage salary hike and promotions are given less frequently. The company might be able to focus on giving promotions more frequently or it can increase the annual appraisal hike to incentivize employees to stay.
  • A higher monthly income might lower the odds of an employee attriting. The company should make sure that all its employees are compensated at least based on industry standards.
  • We observed that approximately 40% of employees have given a poor rating on job satisfaction and environment satisfaction, possibly contributing to a higher attrition rate. The organization should focus on improving its culture and environment by coming up with new ideas to make the office environment more open and friendly.
  • Distance from home is also an important factor for attrition - employees traveling a greater distance to reach the workplace are more likely to attrite. For such employees, the company could provide shuttle facilities so that the commute for such employees gets easier.
  • The data and the model suggest that lower job involvement leads to a higher likelihood of attrition. This might be due to a lack of growth opportunities or a poor management style. A more pro-active, hands-on approach may be required from the managers in the organization.
  • Young and relatively new/inexperienced employees tend to show a higher attrition rate. The organization might be able to keep track of the problems that employees with less experience face in a better manner and come up with better ideas on how the management might help them. This may help create a healthier, more welcoming environment for younger employees.
  • The organization could come up with a revised CTC plan that includes stock options for a larger proportion of the employees in order to keep them motivated.

Additional Content - Hyperparameter Tuning¶

Hyperparameters are the parameters that govern the entire training process. Their values are set before the learning process begins. They have a significant effect on the model’s performance. The process of finding optimal hyperparameters for a model is known as hyperparameter tuning. Choosing optimal hyperparameters can lead to improvements in the overall model’s performance and can help in reducing both overfitting and underfitting.

Types of Hyperparameter Tuning¶

Some models consist of a huge number of hyperparameters, and finding the optimal set of hyperparameters can be a very time-consuming process. To make the process efficient, the most commonly used methods are,

  • Grid Search
  • Random Search

Grid Search¶

Grid search is a technique used to find the optimal set of hyperparameters for a model from the provided search space.

Let’s understand how grid search works, with an example

(Illustration: grid search evaluating every point in the hyperparameter search space)

  • Let the grey box above represent the set of all possible hyperparameters
  • The black circles indicate the search space
  • Grid search iterates over all the black circles in sequence
  • Finally, it returns the best set of hyperparameters based on the best score obtained

Grid search doesn’t work well on large search spaces: it will find the best set of hyperparameters from the grid, but at a high cost. Grid search is best used when we have a small search space. It can get the best possible results when we have no time constraints; under time constraints, it’s better to go with random search.
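The cost grows multiplicatively with every hyperparameter added. For example, the decision-tree grid used later in this notebook (5 depths x 2 criteria x 4 leaf sizes) enumerates 40 candidates, and with 10-fold cross-validation that means 400 model fits; sklearn's ParameterGrid makes the count explicit:

```python
from sklearn.model_selection import ParameterGrid

# The same grid that the notebook's decision-tree GridSearchCV uses
grid = {"max_depth": [2, 3, 4, 5, 6],
        "criterion": ["gini", "entropy"],
        "min_samples_leaf": [5, 10, 20, 25]}

n_candidates = len(ParameterGrid(grid))  # Cartesian product: 5 * 2 * 4
n_fits = n_candidates * 10               # one fit per candidate per CV fold
print(n_candidates, n_fits)  # → 40 400
```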

Random Search¶

Random search is another technique for finding the best set of hyperparameters, and it takes less time than grid search.

Random search is very similar to grid search; the difference is that in random search,

(Illustration: random search sampling only some points of the hyperparameter search space)

we define the number of iterations to search. Not all parameter values are tried out; instead, a fixed number of parameter settings is sampled from the specified distributions. The sets of hyperparameters are also not searched sequentially: out of the entire search space, only a certain number of randomly chosen sets will be evaluated.

Random search works well on large search spaces and is much cheaper than grid search, but it doesn’t guarantee finding the best set of hyperparameters.
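Since the tuning below uses only grid search, here is a hedged sketch of the random-search alternative with scikit-learn's RandomizedSearchCV; the data and parameter distributions are illustrative assumptions, not the notebook's:

```python
from scipy.stats import randint
from sklearn.datasets import make_classification
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier

# Illustrative data, not the case-study dataset
X, y = make_classification(n_samples=300, random_state=1)

# Distributions to sample from (assumed ranges for illustration):
# randint(a, b) samples integers in [a, b)
param_dist = {"max_depth": randint(2, 7),
              "min_samples_leaf": randint(5, 26)}

# n_iter fixes how many random parameter settings are sampled
search = RandomizedSearchCV(DecisionTreeClassifier(random_state=1),
                            param_dist, n_iter=8, cv=5, random_state=1)
search.fit(X, y)
best = search.best_params_
```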

Tuning Models¶

We will tune Decision Trees and Random Forest models. For tuning, we will only use Grid Search.

Decision Tree¶

Please refer to the Scikit-learn documentation to learn more about the hyperparameters and values that the algorithm can take: https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html.

Here are some of the parameters:

  1. criterion {“gini”, “entropy”}

The function to measure the quality of a split. Supported criteria are “gini” for the Gini impurity and “entropy” for the information gain.

  2. max_depth

The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain fewer than min_samples_split samples.

  3. min_samples_leaf

The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least min_samples_leaf training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression.

In [44]:
# Choose the type of classifier
dtree_estimator = DecisionTreeClassifier(class_weight = {0: 0.17, 1: 0.83}, random_state = 1)

# Grid of parameters to choose from
parameters = {'max_depth': np.arange(2, 7),
              'criterion': ['gini', 'entropy'],
              'min_samples_leaf': [5, 10, 20, 25]
             }

# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(recall_score, pos_label = 1)

# Run the grid search
gridCV = GridSearchCV(dtree_estimator, parameters, scoring = scorer, cv = 10)

# Fitting the grid search on the train data
gridCV = gridCV.fit(X_train, y_train)

# Set the classifier to the best combination of parameters
dtree_estimator = gridCV.best_estimator_

# Fit the best estimator to the data
dtree_estimator.fit(X_train, y_train)
Out[44]:
DecisionTreeClassifier(class_weight={0: 0.17, 1: 0.83}, max_depth=2,
                       min_samples_leaf=5, random_state=1)
In [45]:
# Checking performance on the training dataset
y_train_pred_dt = dtree_estimator.predict(X_train)

metrics_score(y_train, y_train_pred_dt)
              precision    recall  f1-score   support

           0       0.92      0.72      0.81      1726
           1       0.32      0.68      0.43       332

    accuracy                           0.71      2058
   macro avg       0.62      0.70      0.62      2058
weighted avg       0.82      0.71      0.75      2058

(Image: confusion matrix for the tuned decision tree on the train set)

Observation:

  • In comparison to the model with default values of hyperparameters, the performance on the training set has gone down significantly. This makes sense because we are trying to reduce overfitting.
In [46]:
# Checking performance on the test dataset
y_test_pred_dt = dtree_estimator.predict(X_test)

metrics_score(y_test, y_test_pred_dt)
              precision    recall  f1-score   support

           0       0.91      0.71      0.80       740
           1       0.29      0.61      0.39       142

    accuracy                           0.70       882
   macro avg       0.60      0.66      0.59       882
weighted avg       0.81      0.70      0.73       882

(Image: confusion matrix for the tuned decision tree on the test set)

Observations:

  • The tuned model is not performing well in comparison to the model with default values of hyperparameters.
  • This model is not overfitting the training data and gives approximately the same result on the test and train datasets.
  • Precision has dropped significantly, from 0.73 to 0.29, in comparison to the previous model, which means the tuned model will give a high number of false positives, i.e., it will predict that an employee is going to leave even when they won't, and this will cost the company time and effort.

Let's look at the feature importance of this model and try to analyze why this is happening.

In [47]:
importances = dtree_estimator.feature_importances_
columns = X.columns

importance_df = pd.DataFrame(importances, index=columns, columns=['Importance']).sort_values(by='Importance', ascending=False)

plt.figure(figsize=(13, 13))
sns.barplot(data=importance_df, x='Importance', y=importance_df.index)
plt.show()

Observations:

  • After tuning the model, we found that only 3 features are important. It seems the model has high bias, as it has over-simplified the problem and is not capturing the patterns associated with other variables.
  • According to this model too, OverTime, TotalWorkingYears, and MonthlyIncome are the 3 most important features that explain why an employee leaves the organization, which might imply that employees doing overtime feel that their remuneration is not enough for their efforts.
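One way to see this over-simplification is to print the tree's rules: a heavily pruned tree has very few split nodes, so only a handful of features can ever receive non-zero importance. A minimal sketch on synthetic data (`shallow_tree` stands in for `dtree_estimator`; the data and names are illustrative):

```python
# A shallow, pruned tree printed as rules shows which features it actually uses
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier, export_text

X_demo, y_demo = make_classification(n_samples=500, n_features=8,
                                     n_informative=3, random_state=1)
shallow_tree = DecisionTreeClassifier(max_depth=2, min_samples_leaf=5,
                                      random_state=1)
shallow_tree.fit(X_demo, y_demo)

# With depth 2 the tree has at most 3 internal nodes, so at most 3 features
# can receive non-zero importance - the same effect seen in the plot above
print(export_text(shallow_tree))
used = (shallow_tree.feature_importances_ > 0).sum()
print(f"features with non-zero importance: {used}")
```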

Random Forest¶

Please refer to the Scikit-learn documentation to learn more about the parameters and values that the algorithm can take: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html.

Here are some of the parameters,

n_estimators: The number of trees in the forest.

min_samples_split: The minimum number of samples required to split an internal node.

min_samples_leaf: The minimum number of samples required to be at a leaf node.

max_features {"auto", "sqrt", "log2", None}: The number of features to consider when looking for the best split.

  • If “auto”, then max_features=sqrt(n_features).

  • If “sqrt”, then max_features=sqrt(n_features) (same as “auto”).

  • If “log2”, then max_features=log2(n_features).

  • If None, then max_features=n_features.
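The options above differ only in how many candidate features are sampled at each split. A quick sketch of the resulting counts (`n_features = 30` is illustrative, roughly the width of `X` after encoding; a float is treated as a fraction of the features):

```python
import numpy as np

n_features = 30  # illustrative feature count

print("'sqrt' ->", int(np.sqrt(n_features)))   # also what 'auto' meant
print("'log2' ->", int(np.log2(n_features)))
print("0.7    ->", int(0.7 * n_features))      # float = fraction of features
print("None   ->", n_features)                 # every feature at every split
```

Sampling fewer features per split decorrelates the trees in the forest, which is the main source of a random forest's variance reduction over a single tree.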

In [48]:
# Choose the type of classifier
rf_estimator_tuned = RandomForestClassifier(class_weight = {0: 0.17, 1: 0.83}, random_state = 1)

# Grid of parameters to choose from
params_rf = {
        "n_estimators": [100, 250, 500],
        "min_samples_leaf": np.arange(1, 4, 1),
        "max_features": [0.7, 0.9, 'auto'],
}


# Type of scoring used to compare parameter combinations - recall score for class 1
scorer = metrics.make_scorer(recall_score, pos_label = 1)

# Run the grid search
grid_obj = GridSearchCV(rf_estimator_tuned, params_rf, scoring = scorer, cv = 5)

grid_obj = grid_obj.fit(X_train, y_train)

# Set the classifier to the best combination of parameters
rf_estimator_tuned = grid_obj.best_estimator_
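Once fitted, the grid-search object also reports which combination won and its cross-validated score. A minimal, self-contained sketch on synthetic data (the grid mirrors a subset of `params_rf` above; `X_demo`, `y_demo` are illustrative):

```python
# Inspecting what GridSearchCV selected and how well it scored in CV
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, recall_score
from sklearn.model_selection import GridSearchCV

X_demo, y_demo = make_classification(n_samples=300, weights=[0.8, 0.2],
                                     random_state=1)
grid = GridSearchCV(
    RandomForestClassifier(random_state=1),
    {"n_estimators": [50, 100], "min_samples_leaf": [1, 3]},
    scoring=make_scorer(recall_score, pos_label=1),  # recall for class 1
    cv=3,
)
grid.fit(X_demo, y_demo)

print(grid.best_params_)           # winning hyperparameter combination
print(round(grid.best_score_, 3))  # mean cross-validated recall for class 1
```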
In [49]:
rf_estimator_tuned.fit(X_train, y_train)
Out[49]:
RandomForestClassifier(class_weight={0: 0.17, 1: 0.83}, max_features=0.9,
                       min_samples_leaf=3, n_estimators=250, random_state=1)
In [50]:
# Checking performance on the training data
y_pred_train_rf_tuned = rf_estimator_tuned.predict(X_train)

metrics_score(y_train, y_pred_train_rf_tuned)
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1726
           1       0.99      1.00      1.00       332

    accuracy                           1.00      2058
   macro avg       1.00      1.00      1.00      2058
weighted avg       1.00      1.00      1.00      2058

In [51]:
# Checking performance on the test data
y_pred_test_rf_tuned = rf_estimator_tuned.predict(X_test)

metrics_score(y_test, y_pred_test_rf_tuned)
              precision    recall  f1-score   support

           0       0.97      0.98      0.97       740
           1       0.89      0.82      0.85       142

    accuracy                           0.95       882
   macro avg       0.93      0.90      0.91       882
weighted avg       0.95      0.95      0.95       882


Observations:

  • The tuned model is also slightly overfitting the training dataset, but it shows a good performance on the test dataset.
  • The recall for class 1 has improved with a small decrease in precision.
  • This model is the best-performing one among all the models so far, and is giving us good precision and recall scores on the test dataset.
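If the business later wants even fewer missed leavers (or fewer false alarms), the decision threshold on `predict_proba` can be moved instead of retraining. A sketch on synthetic data (`rf` is a stand-in for `rf_estimator_tuned`; data and names are illustrative):

```python
# Shifting the precision/recall balance by thresholding predicted probabilities
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import train_test_split

X_demo, y_demo = make_classification(n_samples=600, weights=[0.8, 0.2],
                                     random_state=1)
X_tr, X_te, y_tr, y_te = train_test_split(X_demo, y_demo, stratify=y_demo,
                                          random_state=1)

rf = RandomForestClassifier(n_estimators=100, random_state=1).fit(X_tr, y_tr)
proba = rf.predict_proba(X_te)[:, 1]  # estimated P(attrition) per employee

# Lowering the threshold flags more employees: recall rises, precision falls
for threshold in (0.3, 0.5, 0.7):
    pred = (proba >= threshold).astype(int)
    print(threshold,
          "precision:", round(precision_score(y_te, pred, zero_division=0), 2),
          "recall:", round(recall_score(y_te, pred), 2))
```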
In [52]:
# Plotting feature importance
importances = rf_estimator_tuned.feature_importances_
columns = X.columns
importance_df = pd.DataFrame(importances, index = columns, columns = ['Importance']).sort_values(by = 'Importance', ascending = False)
plt.figure(figsize = (13, 13))
sns.barplot(data = importance_df, x = 'Importance', y = importance_df.index)
Out[52]:
<Axes: xlabel='Importance', ylabel='None'>

Observations:

  • The feature importance plots for the base model and the tuned model are quite similar. The model suggests that OverTime, MonthlyIncome, Age, TotalWorkingYears, and DailyRate are the most important features.
  • Other important features are DistanceFromHome, StockOptionLevel, YearsAtCompany, and NumCompaniesWorked.
In [ ]:
# Convert notebook to html
!jupyter nbconvert --to html "/content/drive/MyDrive/MIT - Data Sciences/Colab Notebooks/Week_Five_-_Classification_and_Hypothesis_Testing/Mentored_Learning_Session/HR_Employee_Attrition_Prediction_(Classification)/Case_Study_Employee_Attrition.ipynb"