# Importing the basic libraries we will require for the project

# Libraries to help with reading and manipulating data
import pandas as pd
import numpy as np

# Libaries to help with data visualization
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.patches as mpatches # For creating plot independent legends

# Libraries for statistical analysis
from scipy import stats

# Library for label encoding (for 3D plotting functions)
from Scikit-learn.preprocessing import LabelEncoder

# Collinearity checks
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Importing the Machine Learning models we require from Scikit-Learn
from Scikit-learn.linear_model import LogisticRegression
from Scikit-learn.svm import SVC
from Scikit-learn import tree

from Scikit-learn.tree import (
    DecisionTreeClassifier,
)

from Scikit-learn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
)

# Importing the other functions we may require from Scikit-Learn
from Scikit-learn.model_selection import (
    train_test_split,
    GridSearchCV,
    cross_val_score,
)

from Scikit-learn.preprocessing import (
    MinMaxScaler,
    LabelEncoder,
    OneHotEncoder,
    StandardScaler,
)

from Scikit-learn.impute import (
    SimpleImputer,
)

# To get diferent metric scores
import Scikit-learn.metrics as metrics
from Scikit-learn.metrics import (
    f1_score,
    accuracy_score,
    recall_score,
    precision_score,
    confusion_matrix,
    classification_report,
    roc_auc_score,
    precision_recall_curve,
    roc_curve,
    make_scorer,
)

#Importing PCA and TSNE
from Scikit-learn.decomposition import PCA

# Importing class weights
from Scikit-learn.utils.class_weight import compute_class_weight

# Importing Advanced Analysis Libraries
from xgboost import XGBClassifier
from Scikit-learn.model_selection import RandomizedSearchCV

# Code to ignore warnings from function usage
import warnings;
import numpy as np
warnings.filterwarnings('ignore')

# Comment formatting
from IPython.display import display, HTML

# Connect collab
from google.colab import drive
drive.mount('/content/drive')

# Load data from csv file
dataset = pd.read_csv('/content/drive/MyDrive/MIT - Data Sciences/Colab Notebooks/Week_Five_-_Classification_and_Hypothesis_Testing/Project_-_Classification_and_Hypothesis_Testing/ExtraaLearn.csv')

# Make a working copy of the data
data = dataset.copy()

Mounted at /content/drive


def enhanced_histogram_boxplot(
    data,
    feature,
    figsize=(12, 8),
    kde=True,
    bins=None,
    box_color="violet",
    mean_color="green",
    median_color="black",
    hist_palette="tab10",  # Updated palette to 'tab10'
    show_title=True,
    custom_title=None
):
    """
    Enhanced boxplot and histogram combined with outlier detection and statistical summary

    Parameters:
    data (pd.DataFrame): Input dataframe
    feature (str): Column name of the dataframe to plot
    figsize (tuple): Size of figure (default (12,8)) - Adjusted for removal of Q-Q plot
    kde (bool): Whether to show the density curve (default True)
    bins (int): Number of bins for histogram (default None)
    box_color (str): Color of the boxplot (default "violet")
    mean_color (str): Color of the mean line (default "green")
    median_color (str): Color of the median line (default "black")
    hist_palette (str): Color palette for the histogram (default "tab10")
    show_title (bool): Whether to show the plot title (default True)
    custom_title (str): Custom title for the plot (default None)
    """
    if not isinstance(data, pd.DataFrame):
        raise TypeError("data must be a pandas DataFrame")

    if feature not in data.columns:
        raise ValueError(f"Feature '{feature}' not found in the dataframe")

    f2, (ax_box2, ax_hist2) = plt.subplots(
        nrows=2,
        sharex=False,
        gridspec_kw={"height_ratios": (0.25, 0.75)},
        figsize=figsize,
    )

    # Boxplot
    sns.boxplot(data=data, x=feature, ax=ax_box2, showmeans=True, color=box_color)

    # Histogram
    if bins:
        sns.histplot(data=data, x=feature, kde=kde, ax=ax_hist2, bins=bins, palette=sns.color_palette(hist_palette))
    else:
        sns.histplot(data=data, x=feature, kde=kde, ax=ax_hist2, palette=sns.color_palette(hist_palette))

    # Add mean and median lines
    mean = data[feature].mean()
    median = data[feature].median()
    ax_hist2.axvline(mean, color=mean_color, linestyle="--", label="Mean")
    ax_hist2.axvline(median, color=median_color, linestyle="-", label="Median")

    # Add legend
    ax_hist2.legend()

    # Add labels
    ax_hist2.set_xlabel(feature)
    ax_hist2.set_ylabel("Count")
    ax_box2.set_ylabel("")

    if show_title:
        if custom_title:
            title = f"{custom_title} Distribution"
        else:
            title = f"{feature} Distribution"
        plt.suptitle(title, fontsize=16)

    plt.tight_layout()

    # Calculate statistics
    std = data[feature].std()
    skew = data[feature].skew()
    kurtosis = data[feature].kurtosis()

    # Outlier detection
    Q1 = data[feature].quantile(0.25)
    Q3 = data[feature].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = data[(data[feature] < lower_bound) | (data[feature] > upper_bound)][feature]

    # Z-score outliers
    z_scores = np.abs(stats.zscore(data[feature]))
    z_score_outliers = data[feature][z_scores > 3]

    # Print statistical summary
    print(f"\nStatistical Summary for {feature}:")
    print(f"Mean: {mean:.2f}")
    print(f"Median: {median:.2f}")
    print(f"Standard Deviation: {std:.2f}")
    print(f"Skewness: {skew:.2f}")
    print(f"Kurtosis: {kurtosis:.2f}")
    print(f"\nOutlier Analysis:")
    print(f"IQR method - Number of outliers: {len(outliers)}")
    print(f"IQR method - Percentage of outliers: {(len(outliers) / len(data[feature])) * 100:.2f}%")
    print(f"IQR method - Outlier range: < {lower_bound:.2f} or > {upper_bound:.2f}")
    print(f"Z-score method - Number of outliers (|z| > 3): {len(z_score_outliers)}")
    print(f"Z-score method - Percentage of outliers: {(len(z_score_outliers) / len(data[feature])) * 100:.2f}%")

    plt.show()

# Usage example:
# enhanced_histogram_boxplot(data, 'age')


# Define binary feature plotting function

def plot_binary_feature(
                            data,
                            feature,
                            figsize=(12, 7),
                            colors=['#0073a3', '#5e6d77'],
                            show_title=True,
                            custom_title=None):
    """
    Bar plot and pie chart for binary features

    Parameters:
    data (pd.DataFrame): Input dataframe
    feature (str): Column name of the dataframe to plot
    figsize (tuple): Size of figure (default (12,7))
    colors (list): List of two colors for the plots (default ['#FFA07A', '#98FB98'])
    show_title (bool): Whether to show the plot title (default True)
    custom_title (str): Custom title for the plot (default None)
    """
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=figsize)

    # Calculate the value counts and percentages
    value_counts = data[feature].value_counts().sort_index()
    percentages = value_counts / len(data) * 100

    # Bar plot
    sns.barplot(x=value_counts.index, y=value_counts.values, ax=ax1, palette=colors)
    ax1.set_title('Bar Plot')
    ax1.set_xlabel(feature)
    ax1.set_ylabel('Count')

    # Add percentage labels on the bars
    for i, v in enumerate(value_counts.values):
        ax1.text(i, v, f'{percentages[i]:.1f}%', ha='center', va='bottom')

    # Pie chart
    ax2.pie(value_counts.values, labels=value_counts.index, autopct='%1.1f%%', colors=colors, startangle=90)
    ax2.set_title('Pie Chart')

    if show_title:
        title = f"{custom_title} Distribution" if custom_title else f"{feature} Distribution"
        plt.suptitle(title, fontsize=16)

    plt.tight_layout()
    plt.show()


# Define categorical feature plotting function

def plot_categorical(data, feature, figsize=(12, 6), show_title=True, custom_title=None, top_n=None):
    """
    Plot categorical variables with appropriate chart types based on the number of categories.
    Includes value counts and percentages as a "legend" on the right side.

    Parameters:
    data (pd.DataFrame): Input dataframe
    feature (str): Column name of the dataframe to plot
    figsize (tuple): Size of figure (default (12,6))
    show_title (bool): Whether to show the plot title (default True)
    custom_title (str): Custom title for the plot (default None)
    top_n (int): Number of top categories to show, others will be grouped as 'Other' (default None)
    """
    if not isinstance(data, pd.DataFrame):
        raise TypeError("data must be a pandas DataFrame")

    if feature not in data.columns:
        raise ValueError(f"Feature '{feature}' not found in the dataframe")

    value_counts = data[feature].value_counts()
    n_categories = len(value_counts)

    if top_n and n_categories > top_n:
        top_values = value_counts.nlargest(top_n)
        other = pd.Series({'Other': value_counts.nsmallest(n_categories - top_n).sum()})
        value_counts = pd.concat([top_values, other])
        n_categories = top_n + 1

    percentages = value_counts / len(data) * 100

    # Determine the title
    title = custom_title if custom_title else f"{feature} Distribution"

    if n_categories == 2:
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=figsize, gridspec_kw={'width_ratios': [2, 1]})

        # Bar plot
        sns.barplot(x=value_counts.index, y=value_counts.values, ax=ax1)
        ax1.set_title('Bar Plot')
        ax1.set_ylabel('Count')

        # Add percentage labels on the bars
        for i, v in enumerate(value_counts.values):
            ax1.text(i, v, f'{percentages[i]:.1f}%', ha='center', va='bottom')

        # Pie chart
        ax2.pie(value_counts.values, labels=value_counts.index, autopct='%1.1f%%', startangle=90)
        ax2.set_title('Pie Chart')

    else:
        fig, (ax, ax_legend) = plt.subplots(1, 2, figsize=figsize, gridspec_kw={'width_ratios': [3, 1]})

        # Horizontal bar plot
        bars = ax.barh(value_counts.index, value_counts.values)
        ax.set_title('Horizontal Bar Plot')
        ax.set_xlabel('Count')

        # Add percentage labels on the bars
        for i, (value, name) in enumerate(zip(value_counts.values, value_counts.index)):
            ax.text(value, i, f'{percentages[name]:.1f}%', va='center')

        # Add value counts as "legend"
        ax_legend.axis('off')
        legend_text = "Value Counts:\n\n"
        for index, value in value_counts.items():
            legend_text += f"{index}: {value} ({percentages[index]:.1f}%)\n"
        ax_legend.text(0, 0.9, legend_text, verticalalignment='top', wrap=True)

    if show_title:
        plt.suptitle(title, fontsize=16)

    plt.tight_layout()
    plt.show()

# Example usage:
# plot_categorical(data, 'current_occupation', custom_title="Occupation Distribution")
# plot_categorical(data, 'print_media_type1', custom_title="Print Media Type 1 Usage")
# plot_categorical(data, 'educational_channels', top_n=5, custom_title="Top 5 Educational Channels")


# Define scatter plot function

def plot_scatter(x_column, y_column, data, title=None, x_label=None, y_label=None, palette='tab10'):
    """
    Creates a scatter plot for the specified x and y columns from the given dataset.

    Parameters:
    x_column (str): The name of the column for the x-axis.
    y_column (str): The name of the column for the y-axis.
    data (DataFrame): The pandas DataFrame containing the data.
    title (str, optional): The title of the plot. Defaults to None.
    x_label (str, optional): Label for the x-axis. Defaults to None.
    y_label (str, optional): Label for the y-axis. Defaults to None.

    Returns:
    None: Displays the scatter plot.
    """
    plt.figure(figsize=(10, 6))
    sns.scatterplot(x=x_column, y=y_column, data=data, palette='palette')
    plt.title(title if title else f'Scatter Plot: {x_column} vs {y_column}')
    plt.xlabel(x_label if x_label else x_column)
    plt.ylabel(y_label if y_label else y_column)
    plt.grid(True)
    plt.show()

# Example of how to call the function
#plot_scatter('age', 'website_visits', data, title='Age vs Website Visits')


# Defining an abstracted function for box plot visualization

def plot_boxplot(x_column, y_column, data, title=None, x_label=None, y_label=None, palette='tab10', figsize=(10, 6)):
    """
    Creates a box plot for the specified x and y columns from the given dataset.

    Parameters:
    x_column (str): The name of the column for the x-axis (typically categorical).
    y_column (str): The name of the column for the y-axis (typically continuous).
    data (DataFrame): The pandas DataFrame containing the data.
    title (str, optional): The title of the plot. Defaults to None.
    x_label (str, optional): Label for the x-axis. Defaults to None.
    y_label (str, optional): Label for the y-axis. Defaults to None.
    palette (str, optional): Color palette for the box plot. Defaults to 'tab10'.
    figsize (tuple, optional): Figure size for the plot (width, height). Defaults to (10, 6).

    Returns:
    None: Displays the box plot.
    """
    # Set the figure size based on the figsize parameter
    plt.figure(figsize=figsize)
    sns.boxplot(x=x_column, y=y_column, data=data, palette=palette)
    plt.title(title if title else f'Box Plot: {x_column} vs {y_column}')
    plt.xlabel(x_label if x_label else x_column)
    plt.ylabel(y_label if y_label else y_column)
    plt.grid(True)
    plt.show()

# Example of how to call the function with a custom figure size
# plot_boxplot(x_column='status', y_column='age', data=dataset, title='Status vs Age', figsize=(15, 8))


# Defining a generic function for count plot visualization

def plot_countplot(x_column, hue_column, data, title=None, x_label=None, y_label=None, palette='tab10', hue_order=None):
    """
    Creates a count plot for the specified x and hue columns from the given dataset.

    Parameters:
    x_column (str): The name of the column for the x-axis (typically categorical).
    hue_column (str): The name of the column for hue (typically categorical).
    data (DataFrame): The pandas DataFrame containing the data.
    title (str, optional): The title of the plot. Defaults to None.
    x_label (str, optional): Label for the x-axis. Defaults to None.
    y_label (str, optional): Label for the y-axis. Defaults to None.
    palette (str, optional): Color palette for the count plot. Defaults to 'tab10'.
    hue_order (list, optional): The order of the hues to be plotted. Defaults to None.

    Returns:
    None: Displays the count plot.
    """
    plt.figure(figsize=(10, 6))
    sns.countplot(x=x_column, hue=hue_column, data=data, palette=palette, hue_order=hue_order)
    plt.title(title if title else f'Count Plot: {x_column} vs {hue_column}')
    plt.xlabel(x_label if x_label else x_column)
    plt.ylabel(y_label if y_label else 'Count')
    plt.grid(True)
    plt.show()

# Example usage
# plot_countplot(x_column='status', hue_column='current_occupation', data=dataset, title='Status vs Current Occupation', hue_order=['Employed', 'Unemployed', 'Student'])


# Defining a generic function for creating 3D scatter plots

# Set Seaborn style to 'darkgrid'
sns.set(style="whitegrid")

def plot_3d_scatter_with_color(x_column, y_column, z_column, color_column, data, title=None, x_label=None, y_label=None, z_label=None, figsize=(10, 8)):
    """
    Creates a 3D scatter plot for the specified x, y, and z columns from the given dataset.
    Adds the 'color_column' to color the points based on a categorical variable.
    """
    fig = plt.figure(figsize=figsize)           # Pass figsize argument here
    ax = fig.add_subplot(111, projection='3d')

    # Scatter plot with color dimension based on the 'color_column'
    p = ax.scatter(data[x_column], data[y_column], data[z_column], c=data[color_column], cmap='coolwarm', marker='o')

    # Setting labels with padding
    ax.set_xlabel(x_label if x_label else x_column, labelpad=20)
    ax.set_ylabel(y_label if y_label else y_column, labelpad=20)
    ax.set_zlabel(z_label if z_label else z_column, labelpad=20)

    # Add color bar for reference
    cbar = fig.colorbar(p, ax=ax) # Assign color bar object to cbar
    cbar.set_label('Status') # Add this line

    # Title
    ax.set_title(title if title else f'3D Scatter Plot: {x_column}, {y_column}, {z_column}')

    plt.tight_layout()
    plt.show()

# Call the modified function with 'status' as the color dimension
#plot_3d_scatter_with_color('website_visits', 'time_spent_on_website', 'status', 'status', data, title='Website Visits, Time Spent on Website, and Status (with color dimension)')


# Creating metric function for regresion model evaluation

def metrics_score(actual, predicted):
    print(classification_report(actual, predicted))

    cm = confusion_matrix(actual, predicted)
    plt.figure(figsize=(8,5))

    sns.heatmap(cm, annot=True,  fmt='.2f', xticklabels=['Not Converted', 'Converted'], yticklabels=['Not Converted', 'Converted'])
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.show()


# returns the first 5 rows
data.head()


# returns the last 5 rows
data.tail()


# Determine the number of rows and columns by calling data.shape
print(data.shape[0])
print(data.shape[1])

4612
15


data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4612 entries, 0 to 4611
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   ID                     4612 non-null   object 
 1   age                    4612 non-null   int64  
 2   current_occupation     4612 non-null   object 
 3   first_interaction      4612 non-null   object 
 4   profile_completed      4612 non-null   object 
 5   website_visits         4612 non-null   int64  
 6   time_spent_on_website  4612 non-null   int64  
 7   page_views_per_visit   4612 non-null   float64
 8   last_activity          4612 non-null   object 
 9   print_media_type1      4612 non-null   object 
 10  print_media_type2      4612 non-null   object 
 11  digital_media          4612 non-null   object 
 12  educational_channels   4612 non-null   object 
 13  referral               4612 non-null   object 
 14  status                 4612 non-null   int64  
dtypes: float64(1), int64(4), object(10)
memory usage: 540.6+ KB


# Changing datatypes of categorical features

categorical_features = ['current_occupation', 'first_interaction', 'profile_completed', 'last_activity',
                        'print_media_type1', 'print_media_type2', 'digital_media', 'educational_channels',
                        'referral', 'status']

for feature in categorical_features:
    data[feature] = data[feature].astype('category')  # Step 1: Convert to categorical
    data[feature] = data[feature].cat.codes           # Step 2: Encode categories as integer codes
    data[feature] = data[feature].astype('category')  # Step 3: Re-convert back to categorical


data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4612 entries, 0 to 4611
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype   
---  ------                 --------------  -----   
 0   ID                     4612 non-null   object  
 1   age                    4612 non-null   int64   
 2   current_occupation     4612 non-null   category
 3   first_interaction      4612 non-null   category
 4   profile_completed      4612 non-null   category
 5   website_visits         4612 non-null   int64   
 6   time_spent_on_website  4612 non-null   int64   
 7   page_views_per_visit   4612 non-null   float64 
 8   last_activity          4612 non-null   category
 9   print_media_type1      4612 non-null   category
 10  print_media_type2      4612 non-null   category
 11  digital_media          4612 non-null   category
 12  educational_channels   4612 non-null   category
 13  referral               4612 non-null   category
 14  status                 4612 non-null   category
dtypes: category(10), float64(1), int64(3), object(1)
memory usage: 226.1+ KB


data.describe(include = "all").T


# Get nunique values for each column
nunique_df = data.nunique().reset_index()
nunique_df.columns = ['Feature', 'Unique_Count']

# Features to find unique values for (categorical only)
categorical_features = [
    'current_occupation', 'first_interaction', 'profile_completed',
    'last_activity', 'print_media_type1', 'print_media_type2',
    'digital_media', 'educational_channels', 'referral', 'status'
]

# Create an empty list to store unique value strings
unique_value_list = []

# Loop over the categorical list to get all unique values and store them as formatted strings
for feature in categorical_features:
    unique_values = data[feature].dropna().unique()  # Ensure no NaN values are included
    unique_values_sorted = sorted(unique_values, key=lambda x: (str(x).lower()))  # Sort for consistency
    formatted_list = ', '.join([str(value) for value in unique_values_sorted])
    unique_value_list.append(formatted_list)

# Create a DataFrame with the features and their corresponding unique values
unique_values_df = pd.DataFrame({
    'Feature': categorical_features,
    'Unique_Values': unique_value_list
})

# Merge nunique_df with unique_values_df
merged_df = pd.merge(nunique_df, unique_values_df, on='Feature', how='left')

# Display the combined DataFrame
display(merged_df)


# Missing value check
pd.DataFrame(data={'% of Missing Values':round(data.isna().sum()/data.isna().count()*100,2)})


# plot the distribution of the age feature
enhanced_histogram_boxplot(data, 'age', kde = True, bins = 24, custom_title = "Age")

Statistical Summary for age:
Mean: 46.20
Median: 51.00
Standard Deviation: 13.16
Skewness: -0.72
Kurtosis: -0.80

Outlier Analysis:
IQR method - Number of outliers: 0
IQR method - Percentage of outliers: 0.00%
IQR method - Outlier range: < 4.50 or > 88.50
Z-score method - Number of outliers (|z| > 3): 0
Z-score method - Percentage of outliers: 0.00%


# plot the distribution of the website_visits feature
enhanced_histogram_boxplot(data, "website_visits", kde = True, custom_title = "Website Visits")

Statistical Summary for website_visits:
Mean: 3.57
Median: 3.00
Standard Deviation: 2.83
Skewness: 2.16
Kurtosis: 9.35

Outlier Analysis:
IQR method - Number of outliers: 154
IQR method - Percentage of outliers: 3.34%
IQR method - Outlier range: < -2.50 or > 9.50
Z-score method - Number of outliers (|z| > 3): 66
Z-score method - Percentage of outliers: 1.43%


# plot the distribution of the page_views_per_visit feature
enhanced_histogram_boxplot(data, "page_views_per_visit", kde = True, custom_title = "Page Views per Visit")

Statistical Summary for page_views_per_visit:
Mean: 3.03
Median: 2.79
Standard Deviation: 1.97
Skewness: 1.27
Kurtosis: 4.22

Outlier Analysis:
IQR method - Number of outliers: 257
IQR method - Percentage of outliers: 5.57%
IQR method - Outlier range: < -0.44 or > 6.27
Z-score method - Number of outliers (|z| > 3): 40
Z-score method - Percentage of outliers: 0.87%


# plot the distribution of the time_spent_on_website feature
enhanced_histogram_boxplot(data, "time_spent_on_website", kde = True, custom_title = "Time Spent on Website")

Statistical Summary for time_spent_on_website:
Mean: 724.01
Median: 376.00
Standard Deviation: 743.83
Skewness: 0.95
Kurtosis: -0.58

Outlier Analysis:
IQR method - Number of outliers: 0
IQR method - Percentage of outliers: 0.00%
IQR method - Outlier range: < -1633.25 or > 3118.75
Z-score method - Number of outliers (|z| > 3): 0
Z-score method - Percentage of outliers: 0.00%


# plot the distribution of the status feature
plot_binary_feature(data, "status", colors=['#0073a3', '#5e6d77'], show_title=True, custom_title="Status")


# plot the distribution of the current_occupation feature
plot_binary_feature(data, "current_occupation", colors=['#0073a3', '#5e6d77'], show_title=True, custom_title="Status")


# plot the distribution of the first_interaction feature
plot_binary_feature(data, "first_interaction", colors=['#0073a3', '#5e6d77'], show_title=True, custom_title="First Interaction")


# plot the distribution of the profile_completed feature
plot_binary_feature(data, "profile_completed", colors=['#0073a3', '#5e6d77'], show_title=True, custom_title="Profile Completed")


# plot the distribution of the last_activity feature
plot_binary_feature(data, "last_activity", colors=['#0073a3', '#5e6d77'], show_title=True, custom_title="Last Activity")


# plot the distribution of the print_media_type1 feature
plot_binary_feature(data, "print_media_type1", colors=['#0073a3', '#5e6d77'], show_title=True, custom_title="Print Media Type 1")


# plot the distribution of the print_media_type2 feature
plot_binary_feature(data, "print_media_type2", colors=['#0073a3', '#5e6d77'], show_title=True, custom_title="Print Media Type 2")


# plot the distribution of the digital_media feature
plot_binary_feature(data, "digital_media", colors=['#0073a3', '#5e6d77'], show_title=True, custom_title="Digital Media")


# plot the distribution of the educational_channels feature
plot_binary_feature(data, "educational_channels", colors=['#0073a3', '#5e6d77'], show_title=True, custom_title="Educational Channels")


# plot the distribution of the referral feature
plot_binary_feature(data, "referral", colors=['#0073a3', '#5e6d77'], show_title=True, custom_title="Referral")


# Plot status counts

plt.figure(figsize = (10, 6))
ax = sns.countplot(x = 'status', data = data, palette='tab10')

# Annotating the exact count on the top of the bar for each category
for p in ax.patches:
    ax.annotate('{:.1f}'.format(p.get_height()), (p.get_x(), p.get_height()+ 0.35))


# Calculate conversion percentages
conversion_percentages = round(data['status'].value_counts(normalize=True) * 100, 2)
print(conversion_percentages)

status
0    70.14
1    29.86
Name: proportion, dtype: float64


# Group by 'current_occupation' and 'status' to get the counts for each occupation
occupation_counts = data.groupby(['current_occupation', 'status']).size().unstack()

# Define the correct x-axis labels
occupation_labels = ['Professional', 'Student', 'Unemployed']

# Create the figure for the stacked bar plot with proper labels
plt.figure(figsize=(10, 6))

# Plotting status 0 at the bottom and status 1 on top for each occupation with overridden x-axis labels
plt.bar(occupation_labels, occupation_counts[0], label='0', color='blue')
plt.bar(occupation_labels, occupation_counts[1],
        bottom=occupation_counts[0], label='1', color='orange')

# Adding labels and title
plt.title('Lead Status by Current Occupation')
plt.xlabel('Current Occupation')
plt.ylabel('Count of Leads')
plt.legend(title='Lead Status')

# Adding annotations for each bar (status 0 and status 1 counts)
for i, occupation in enumerate(occupation_labels):
    plt.text(i, occupation_counts[0][i] / 2, f'{occupation_counts[0][i]}', ha='center', color='white')
    plt.text(i, occupation_counts[0][i] + (occupation_counts[1][i] / 2), f'{occupation_counts[1][i]}', ha='center', color='black')

plt.tight_layout()
plt.show()


# Group the data by first_interaction and status, and calculate the counts
interaction_counts = data.groupby(['first_interaction', 'status']).size().unstack()

# Create a stacked bar plot
ax = interaction_counts.plot(kind='bar', stacked=True, figsize=(10, 6))

plt.title('Lead Status by First Interaction Channel')
plt.xlabel('First Interaction Channel')
plt.ylabel('Count of Leads')

# Override x-axis labels explicitly with desired labels
ax.set_xticks([0, 1])  # Set positions for the ticks (number of unique values in 'first_interaction')
ax.set_xticklabels(['Mobile App', 'Website'], rotation=45)  # Manually set labels

# Add the legend with proper labels for lead status
plt.legend(title='Lead Status', labels=['0', '1'])

# Add annotations on top of each bar
for container in ax.containers:
    ax.bar_label(container, label_type='center')

plt.tight_layout()

# Show the plot
plt.show()


# Group by 'current_occupation' and 'status' to get the counts for each occupation
interaction_counts = data.groupby(['last_activity', 'status']).size().unstack()

# Define the correct x-axis labels
occupation_labels = ['Website Activity', 'Email Activity', 'Phone Activity']

# Create the figure for the stacked bar plot with proper labels
plt.figure(figsize=(10, 6))

# Plotting status 0 at the bottom and status 1 on top for each occupation with overridden x-axis labels
plt.bar(occupation_labels, interaction_counts[0], label='0', color='blue')
plt.bar(occupation_labels, interaction_counts[1],
        bottom=interaction_counts[0], label='1', color='orange')

# Adding labels and title
plt.title('Lead Status by Mode of Communication')
plt.xlabel('Mode of Communication')
plt.ylabel('Count of Leads')
plt.legend(title='Lead Status')

# Adding annotations for each bar (status 0 and status 1 counts)
for i, occupation in enumerate(occupation_labels):
    plt.text(i, interaction_counts[0][i] / 2, f'{interaction_counts[0][i]}', ha='center', color='white')
    plt.text(i, interaction_counts[0][i] + (interaction_counts[1][i] / 2), f'{interaction_counts[1][i]}', ha='center', color='black')

plt.tight_layout()
plt.show()


# Calculate conversion rates
conversion_rates = round(interaction_counts.apply(lambda row: row[1] / (row[0] + row[1]), axis=1) * 100, 2)
conversion_rates


# Define lead sources
sources = ['print_media_type1', 'print_media_type2', 'digital_media', 'educational_channels', 'referral']

# Group by 'status'. Select the source columns and convert them to numeric type before summing.
media_sources = data.groupby('status')[sources].apply(lambda x: x.apply(pd.to_numeric, errors='coerce').sum()).T

# Define the correct x-axis labels
source_labels = ['Print Media Type 1', 'Print Media Type 2', 'Digital Media', 'Educational Channels', 'Referral']

# Create the figure for the stacked bar plot with proper labels
plt.figure(figsize=(10, 6))

# Plotting status 0 at the bottom and status 1 on top for each source with overridden x-axis labels
plt.bar(source_labels, media_sources[0], label='0', color='blue')
plt.bar(source_labels, media_sources[1],
        bottom=media_sources[0], label='1', color='orange')

# Adding labels and title
plt.title('Lead Status by Mode of Communication')
plt.xlabel('Mode of Communication')
plt.ylabel('Count of Leads')
plt.legend(title='Lead Status')

# Adding annotations for each bar (status 0 and status 1 counts)
for i, source in enumerate(source_labels):
    plt.text(i, media_sources[0][i] / 2, f'{media_sources[0][i]}', ha='center', color='white')
    plt.text(i, media_sources[0][i] + (media_sources[1][i] / 2), f'{media_sources[1][i]}', ha='center', color='black')

plt.tight_layout()
plt.show()


# Calculate conversion rates
conversion_rates = round(media_sources.apply(lambda row: row[1] / (row[0] + row[1]), axis=1) * 100, 2)
conversion_rates


# Group by 'profile_completed' and 'status' to get the counts for each profile completion level
profile = data.groupby(['profile_completed', 'status']).size().unstack()

# Calculate conversion rates for sorting
# Assuming 'status' == 1 indicates a conversion
profile['conversion_rate'] = profile[1] / (profile[0] + profile[1])

# Sort the profile_completed levels by conversion_rate in descending order
profile_sorted = profile.sort_values(by='conversion_rate', ascending=False)

# Extract the sorted profile_completed labels
profile_labels = profile_sorted.index.tolist()

# Create the figure for the stacked bar plot with proper labels
plt.figure(figsize=(10, 6))

# Plotting status 0 at the bottom and status 1 on top for each profile_completed with sorted labels
plt.bar(profile_labels, profile_sorted[0], label='0', color='blue')
plt.bar(profile_labels, profile_sorted[1],
        bottom=profile_sorted[0], label='1', color='orange')

# Adding labels and title
plt.title('Lead Conversion Status by Profile Completion Level')
plt.xlabel('Profile Completion Level')
plt.ylabel('Count of Leads')
plt.legend(title='Lead Status')

# Adding annotations for each bar (status 0 and status 1 counts)
for i, label in enumerate(profile_labels):
    plt.text(i, profile_sorted[0][i] / 2, f'{profile_sorted[0][i]}', ha='center', color='white')
    plt.text(i, profile_sorted[0][i] + (profile_sorted[1][i] / 2), f'{profile_sorted[1][i]}', ha='center', color='black')

plt.tight_layout()
plt.show()


# Calculate conversion rates
conversion_rates = round(profile.apply(lambda row: row[1] / (row[0] + row[1]), axis=1) * 100, 2)
conversion_rates


# Create a correlation matrix (numerical vlaues only)

# Exclude non-numeric columns
numeric_data = data.select_dtypes(include=np.number)

# Create heatmap
plt.figure(figsize=(15, 7))
sns.heatmap(numeric_data.corr(), annot=True, vmin=-1, vmax=1, fmt=".2f", cmap="vlag")
plt.title("Correlation Heatmap")
plt.show()


# Generate boxplot 'status' vs 'age'
plot_boxplot(
            x_column='status',
            y_column='age',
            data=dataset,
            title='Status vs Age'
            )


# Generate count plot of 'status' vs 'current_occupation'
plot_countplot(
              x_column='status',
              hue_column='current_occupation',
              data=dataset,
              title='Lead Status vs Current Occupation'
              )


# Generate count plot of 'status' vs 'first_interaction'
plot_countplot(
              x_column='status',
              hue_column='first_interaction',
              data=dataset,
              title='Lead Status vs First Interaction'
              )


# Generate count plot of 'status' vs 'profile_completed'
plot_countplot(
              x_column='status',
              hue_column='profile_completed',
              data=dataset,
              title='Lead Status vs Profile Completed'
              )


# Generate boxplot with 'status' vs 'website_visits'
plot_boxplot(
            x_column='status',
            y_column='website_visits',
            data=dataset,
            title='Status vs Website Visits'
            )


# Generate boxplot with 'status' vs 'time_spent_on_website' on the y-axis
plot_boxplot(
            x_column='status',
            y_column='time_spent_on_website',
            data=dataset,
            title='Status vs Time Spent on Website'
            )


# Generate the boxplot with 'status' vs 'page_views_per_visit'
plot_boxplot(
            x_column='status',
            y_column='page_views_per_visit',
            data=dataset,
            title='Status vs Page Views per Visit'
            )


# Generate count plot of 'status' vs 'last_activity'
plot_countplot(
              x_column='status',
              hue_column='last_activity',
              data=dataset,
              title='Lead Status vs Last Activity'
              )


# Generate count plot of 'status' vs 'print_media_type1'
plot_countplot(
              x_column='status',
              hue_column='print_media_type1',
              data=dataset,
              title='Status vs Print Media Type 1',
              hue_order=['Yes', 'No']
              )


# Generate count plot of 'status' vs 'print_media_type2'
plot_countplot(
              x_column='status',
              hue_column='print_media_type2',
              data=dataset,
              title='Status vs Print Media Type 2',
              hue_order=['Yes', 'No']
              )


# Generate count plot of 'status' vs 'digital_media'
plot_countplot(
              x_column='status',
              hue_column='digital_media',
              data=dataset,
              title='Status vs Digital Media',
              hue_order=['Yes', 'No']
              )


# Generate count plot of 'status' vs 'educational_channels'
plot_countplot(
              x_column='status',
              hue_column='educational_channels',
              data=dataset,
              title='Status vs Educational Channels',
              hue_order=['Yes', 'No']
              )


# Generate count plot of 'status' vs 'referral'
plot_countplot(
              x_column='status',
              hue_column='referral',
              data=dataset,
              title='Status vs Referal',
              hue_order=['Yes', 'No']
              )


# plot 'age' vs 'website_visits'

# Create age bins for better grouping
dataset['age_group'] = pd.cut(dataset['age'], bins=[10, 20, 30, 40, 50, 60, 70], labels=['10-20', '20-30', '30-40', '40-50', '50-60', '60-70'])

# Generate boxplot
plot_boxplot(
            x_column='age_group',
            y_column='website_visits',
            data=dataset,
            title='Website Visits by Age Group'
            )


# Generate the boxplot for 'age' vs 'website_visits'

# Create age bins for better grouping
dataset['age_group'] = pd.cut(dataset['age'], bins=[10, 20, 30, 40, 50, 60, 70], labels=['10-20', '20-30', '30-40', '40-50', '50-60', '60-70'])

# Generate boxplot
plot_boxplot(
            x_column='age_group',
            y_column='website_visits',
            data=dataset,
            title='Website Visits by Age Group'
            )


# Generate the boxplot for 'age' vs 'page_views_per_visit'

# Create age bins for better grouping
dataset['age_group'] = pd.cut(dataset['age'], bins=[10, 20, 30, 40, 50, 60, 70], labels=['10-20', '20-30', '30-40', '40-50', '50-60', '60-70'])

# Generate boxplot
plot_boxplot(
            x_column='age_group',
            y_column='page_views_per_visit',
            data=dataset,
            title='Page Views per Visits by Age Group'
            )


# Generate boxplot for 'website_visits' vs 'time_spent_on_website'
plot_boxplot(
            x_column='website_visits',
            y_column='time_spent_on_website',
            data=dataset,
            title='Website Visits by Time Spetn on Website'
            )


# plot for 'website_visits' vs 'page_views_per_visit'
plot_boxplot(
            figsize=(15, 10),
            x_column='website_visits',
            y_column='page_views_per_visit',
            data=dataset,
            title='Website Visits by Page Views per Visit'
            )


# Perform multicollinearity check

# Select numeric columns for analysis
numeric_cols = ['age', 'website_visits', 'time_spent_on_website', 'page_views_per_visit']
data_numeric = data[numeric_cols]

# Calculate VIF
def calculate_vif(data):
    vif_data = pd.DataFrame()
    vif_data["Variable"] = data.columns
    vif_data["VIF"] = [variance_inflation_factor(data.values, i) for i in range(data.shape[1])]
    return vif_data

vif_results = calculate_vif(data_numeric)


# Check dataset for missing values
data.isnull().sum()


# Check dataset for duplicated data
data.duplicated().sum()

0


# Get the feature names before making any changes
feature_names = data.columns.tolist()
print(feature_names)

['ID', 'age', 'current_occupation', 'first_interaction', 'profile_completed', 'website_visits', 'time_spent_on_website', 'page_views_per_visit', 'last_activity', 'print_media_type1', 'print_media_type2', 'digital_media', 'educational_channels', 'referral', 'status']


# Drop the Index
data.drop('ID', axis=1, inplace=True)


# Get the feature names after fropping the index
feature_names = data.columns.tolist()
print(feature_names)

['age', 'current_occupation', 'first_interaction', 'profile_completed', 'website_visits', 'time_spent_on_website', 'page_views_per_visit', 'last_activity', 'print_media_type1', 'print_media_type2', 'digital_media', 'educational_channels', 'referral', 'status']


# Perform one hot encoding on categorical features

# Select only the categoricalcolumns
categorical_cols = [
                    'current_occupation',
                    'first_interaction',
                    'profile_completed',
                    'last_activity',
                    'print_media_type1',
                    'print_media_type2',
                    'digital_media',
                    'educational_channels',
                    'referral'
                  ]

# Apply one-hot encoding to categorical variables
# data_encoded = pd.get_dummies(data[categorical_cols], drop_first=True) # What happens to the decision tree if I dont drop the first item
data_encoded = pd.get_dummies(data[categorical_cols])

'''

  The current_occupation_professional is a key indicator of conversion, but that is the feature that is removed IF drop_first=True,
  I will keep this as is, and remove one of the less important ones during feature engineering.

  It's interesting to note that this actually did not make a significant difference in the performance of a few of the model, namely the decision tree.

'''

# Convert boolean values (True/False) to integers (1/0)
data_encoded = data_encoded.astype(int)

# Combine the encoded categorical features with the original numerical columns
numerical_cols = ['age', 'website_visits', 'time_spent_on_website', 'page_views_per_visit', 'status']

data_encoded = pd.concat([data[numerical_cols], data_encoded], axis=1)


# Get the feature names after encoding
feature_names = data_encoded.columns.tolist()
print(feature_names)

['age', 'website_visits', 'time_spent_on_website', 'page_views_per_visit', 'status', 'current_occupation_0', 'current_occupation_1', 'current_occupation_2', 'first_interaction_0', 'first_interaction_1', 'profile_completed_0', 'profile_completed_1', 'profile_completed_2', 'last_activity_0', 'last_activity_1', 'last_activity_2', 'print_media_type1_0', 'print_media_type1_1', 'print_media_type2_0', 'print_media_type2_1', 'digital_media_0', 'digital_media_1', 'educational_channels_0', 'educational_channels_1', 'referral_0', 'referral_1']


# count the number of values in feature_names
len(feature_names)

26


# calculate the change in the number of columns between data and data_encoded
columns_added = data_encoded.shape[1] - data.shape[1]
print(f"Number of columns added due to encoding: {columns_added}")

Number of columns added due to encoding: 12


# List of numerical features previously identified as having outliers
potential_outliers = ['website_visits', 'time_spent_on_website', 'page_views_per_visit']

# Set up the figure with the appropriate number of subplots
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 5))  # 1 row and 3 columns since we have 3 plots

# Create boxplots for each numerical column
for i, col in enumerate(potential_outliers):
    sns.boxplot(x=data_encoded[col], ax=axes[i], palette='tab10')  # Apply 'tab10' color scheme
    axes[i].set_title(f'Boxplot of {col}')

# Adjust layout to prevent overlap
plt.tight_layout()
plt.show()


# Detect and remove outliers using the IQR method
def remove_outliers(df, columns):
    for col in columns:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]
    return df

# Remove outliers
data_cleaned = remove_outliers(data_encoded, potential_outliers)


# List of numerical features previously identified as having outliers
potential_outliers = ['website_visits', 'time_spent_on_website', 'page_views_per_visit']

# Set up the figure with the appropriate number of subplots
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 5))  # 1 row and 3 columns since we have 3 plots

# Create boxplots for each numerical column
for i, col in enumerate(potential_outliers):
    sns.boxplot(x=data_cleaned[col], ax=axes[i], palette='tab10')  # Apply 'tab10' color scheme
    axes[i].set_title(f'Boxplot of {col}')

# Adjust layout to prevent overlap
plt.tight_layout()
plt.show()


# Calculate the number of rows removed
rows_removed = len(data_encoded) - len(data_cleaned)
rows_removed

404


# returns the first 5 rows
data_cleaned.head()


# returns the last 5 rows
data_cleaned.tail()


# Determine the number of rows and columns by calling data_cleaned.shape
print(f"{data_cleaned.shape[0]} {data_cleaned.shape[1]}")

4208 26


data_cleaned.describe(include = "all").T


for col in ['age', 'website_visits', 'time_spent_on_website', 'page_views_per_visit']:
    print(col)
    enhanced_histogram_boxplot(data_cleaned, col, kde = True, custom_title = col) # Pass the column name as a string

age

Statistical Summary for age:
Mean: 46.36
Median: 51.00
Standard Deviation: 13.08
Skewness: -0.74
Kurtosis: -0.76

Outlier Analysis:
IQR method - Number of outliers: 0
IQR method - Percentage of outliers: 0.00%
IQR method - Outlier range: < 4.50 or > 88.50
Z-score method - Number of outliers (|z| > 3): 0
Z-score method - Percentage of outliers: 0.00%

website_visits

Statistical Summary for website_visits:
Mean: 3.23
Median: 3.00
Standard Deviation: 2.16
Skewness: 0.80
Kurtosis: -0.14

Outlier Analysis:
IQR method - Number of outliers: 0
IQR method - Percentage of outliers: 0.00%
IQR method - Outlier range: < -2.50 or > 9.50
Z-score method - Number of outliers (|z| > 3): 0
Z-score method - Percentage of outliers: 0.00%

time_spent_on_website

Statistical Summary for time_spent_on_website:
Mean: 718.40
Median: 375.00
Standard Deviation: 742.65
Skewness: 0.97
Kurtosis: -0.54

Outlier Analysis:
IQR method - Number of outliers: 0
IQR method - Percentage of outliers: 0.00%
IQR method - Outlier range: < -1611.12 or > 3063.88
Z-score method - Number of outliers (|z| > 3): 0
Z-score method - Percentage of outliers: 0.00%

page_views_per_visit

Statistical Summary for page_views_per_visit:
Mean: 2.72
Median: 2.29
Standard Deviation: 1.49
Skewness: 0.02
Kurtosis: -0.32

Outlier Analysis:
IQR method - Number of outliers: 11
IQR method - Percentage of outliers: 0.26%
IQR method - Outlier range: < -0.31 or > 6.02
Z-score method - Number of outliers (|z| > 3): 0
Z-score method - Percentage of outliers: 0.00%


# Original Categories (w/Manual Mapping)
original_categories = {
    'current_occupation': ['Professional', 'Unemployed', 'Student'],
    'first_interaction': ['Website', 'Mobile App'],
    'profile_completed':  ['Low', 'Medium', 'High'],
    'last_activity': ['Email Activity', 'Phone Activity', 'Website Activity'],
    'print_media_type1': ['No', 'Yes'],
    'print_media_type2': ['No', 'Yes'],
    'digital_media': ['No', 'Yes'],
    'educational_channels': ['No', 'Yes'],
    'referral': ['No', 'Yes']
    # Add other variables and their categories here
}

# Initialize the mapping dictionary
ohe_mapping = {}

# Get the columns from the DataFrame
columns = data_cleaned.columns

for col in columns:
    try:
        variable, category_idx = col.rsplit('_', 1)
        category_label = original_categories.get(variable, [])[int(category_idx)] if variable in original_categories else category_idx
    except (ValueError, IndexError):
        variable = col
        category_label = None  # Handle unexpected naming patterns

    if variable not in ohe_mapping:
        ohe_mapping[variable] = {}

    ohe_mapping[variable][col] = category_label

# Iterate and print value counts with original labels
for variable, cols in ohe_mapping.items():
    print(f"Variable: {variable}")
    for ohe_col, category_label in cols.items():
        print(f"  Category: {category_label}")
        print(data_cleaned[ohe_col].value_counts(normalize=True))
        print('*' * 40)
    print('=' * 60)

Variable: age
  Category: None
age
57    0.084601
58    0.082700
56    0.072719
59    0.071055
60    0.052281
55    0.042063
32    0.039686
53    0.020437
50    0.019724
43    0.019487
48    0.019487
54    0.019487
51    0.019011
49    0.018774
46    0.018774
21    0.018298
52    0.018061
42    0.017823
24    0.017823
23    0.017348
45    0.017348
19    0.017110
47    0.017110
34    0.016873
44    0.016635
33    0.016160
20    0.015447
22    0.015209
41    0.015209
35    0.014971
18    0.014496
40    0.013783
38    0.013070
37    0.012595
36    0.012357
39    0.011644
63    0.010932
62    0.010456
30    0.009743
29    0.007842
61    0.007842
31    0.007842
28    0.005703
25    0.003802
26    0.003327
27    0.002852
Name: proportion, dtype: float64
****************************************
============================================================
Variable: website
  Category: visits
website_visits
2    0.272338
1    0.171578
3    0.144487
4    0.110266
5    0.092681
6    0.063926
7    0.051331
0    0.041350
8    0.033983
9    0.018061
Name: proportion, dtype: float64
****************************************
============================================================
Variable: time_spent_on
  Category: website
time_spent_on_website
0       0.041350
1       0.015922
65      0.004515
83      0.004040
76      0.004040
          ...   
2500    0.000238
1540    0.000238
1862    0.000238
1397    0.000238
2290    0.000238
Name: proportion, Length: 1541, dtype: float64
****************************************
============================================================
Variable: page_views_per
  Category: visit
page_views_per_visit
0.000    0.043013
2.168    0.003327
2.154    0.003089
2.192    0.002614
2.188    0.002376
           ...   
1.826    0.000238
4.954    0.000238
4.295    0.000238
5.577    0.000238
2.692    0.000238
Name: proportion, Length: 2126, dtype: float64
****************************************
============================================================
Variable: status
  Category: None
status
0    0.698432
1    0.301568
Name: proportion, dtype: float64
****************************************
============================================================
Variable: current_occupation
  Category: Professional
current_occupation_0
1    0.568679
0    0.431321
Name: proportion, dtype: float64
****************************************
  Category: Unemployed
current_occupation_1
0    0.88403
1    0.11597
Name: proportion, dtype: float64
****************************************
  Category: Student
current_occupation_2
0    0.684648
1    0.315352
Name: proportion, dtype: float64
****************************************
============================================================
Variable: first_interaction
  Category: Website
first_interaction_0
0    0.55347
1    0.44653
Name: proportion, dtype: float64
****************************************
  Category: Mobile App
first_interaction_1
1    0.55347
0    0.44653
Name: proportion, dtype: float64
****************************************
============================================================
Variable: profile_completed
  Category: Low
profile_completed_0
0    0.505703
1    0.494297
Name: proportion, dtype: float64
****************************************
  Category: Medium
profile_completed_1
0    0.976949
1    0.023051
Name: proportion, dtype: float64
****************************************
  Category: High
profile_completed_2
0    0.517348
1    0.482652
Name: proportion, dtype: float64
****************************************
============================================================
Variable: last_activity
  Category: Email Activity
last_activity_0
0    0.503327
1    0.496673
Name: proportion, dtype: float64
****************************************
  Category: Phone Activity
last_activity_1
0    0.736217
1    0.263783
Name: proportion, dtype: float64
****************************************
  Category: Website Activity
last_activity_2
0    0.760456
1    0.239544
Name: proportion, dtype: float64
****************************************
============================================================
Variable: print_media_type1
  Category: No
print_media_type1_0
1    0.891635
0    0.108365
Name: proportion, dtype: float64
****************************************
  Category: Yes
print_media_type1_1
0    0.891635
1    0.108365
Name: proportion, dtype: float64
****************************************
============================================================
Variable: print_media_type2
  Category: No
print_media_type2_0
1    0.948669
0    0.051331
Name: proportion, dtype: float64
****************************************
  Category: Yes
print_media_type2_1
0    0.948669
1    0.051331
Name: proportion, dtype: float64
****************************************
============================================================
Variable: digital_media
  Category: No
digital_media_0
1    0.886882
0    0.113118
Name: proportion, dtype: float64
****************************************
  Category: Yes
digital_media_1
0    0.886882
1    0.113118
Name: proportion, dtype: float64
****************************************
============================================================
Variable: educational_channels
  Category: No
educational_channels_0
1    0.848859
0    0.151141
Name: proportion, dtype: float64
****************************************
  Category: Yes
educational_channels_1
0    0.848859
1    0.151141
Name: proportion, dtype: float64
****************************************
============================================================
Variable: referral
  Category: No
referral_0
1    0.980513
0    0.019487
Name: proportion, dtype: float64
****************************************
  Category: Yes
referral_1
0    0.980513
1    0.019487
Name: proportion, dtype: float64
****************************************
============================================================


# Create a correlation matrix (numerical vlaues only)

# Exclude non-numeric columns
numeric_data = data_cleaned.select_dtypes(include=np.number)

# Create heatmap
plt.figure(figsize=(15, 7))
sns.heatmap(numeric_data.corr(), annot=True, vmin=-1, vmax=1, fmt=".2f", cmap="vlag")
plt.title("Correlation Heatmap")
plt.show()


# Perform multicollinearity check for numeric columns only

# Select numeric columns for VIF analysis
numeric_cols = ['age', 'website_visits', 'time_spent_on_website', 'page_views_per_visit']
data_numeric = data[numeric_cols]

# Function to calculate VIF
from statsmodels.stats.outliers_influence import variance_inflation_factor

def calculate_vif(data):
    vif_data = pd.DataFrame()
    vif_data["Variable"] = data.columns
    vif_data["VIF"] = [variance_inflation_factor(data.values, i) for i in range(data.shape[1])]
    return vif_data

# Calculate VIF for numeric columns only
vif_results_numeric = calculate_vif(data_numeric)


# Separating the target variable and other variables
X = data_cleaned.drop('status', axis=1)   # Create a copy of the data with 'status' removed
y = data_cleaned['status']                # Create a new variable with only 'status'


# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)


# Check the shape of both the train and test data

#Collect the data for the table
data_ = {
    "Data Split": ["Training Set", "Test Set"],
    "Shape": [X_train.shape, X_test.shape],
    "Class Percentage": [
        y_train.value_counts(normalize=True).to_dict(),
        y_test.value_counts(normalize=True).to_dict()
    ]
}

# Create the table using pandas
shape = pd.DataFrame(data_)

# Display the table
print(shape)

     Data Split       Shape                                Class Percentage
0  Training Set  (2945, 25)  {0: 0.6937181663837012, 1: 0.3062818336162988}
1      Test Set  (1263, 25)  {0: 0.7094220110847189, 1: 0.2905779889152811}


# Missing value check for X_train and X_test as a percentage

# Create DataFrames for % of missing values in each dataset
train_missing = pd.DataFrame({
    '% of Missing Values (Train)': round(X_train.isna().sum() / X_train.isna().count() * 100, 2)
})

test_missing = pd.DataFrame({
    '% of Missing Values (Test)': round(X_test.isna().sum() / X_test.isna().count() * 100, 2)
})

# Concatenate the two DataFrames to have a nice table comparing both
missing_values_table = pd.concat([train_missing, test_missing], axis=1)

# Display the table
missing_values_table


# Standardize Numeric Features

# List of numeric columns that need to be scaled
numeric_cols = ['age', 'website_visits', 'time_spent_on_website', 'page_views_per_visit']

# Identify non-numeric columns
non_numeric_cols = [col for col in X_train.columns if col not in numeric_cols]

# Initialize the scaler
scaler = StandardScaler()

# Scale the numeric columns in the training set
X_train_scaled_numeric = pd.DataFrame(
                                      scaler.fit_transform(X_train[numeric_cols]),
                                      columns=numeric_cols,
                                      index=X_train.index  # Retain original index
                                      )

# Scale the numeric columns in the test set
X_test_scaled_numeric = pd.DataFrame(
                                      scaler.transform(X_test[numeric_cols]),
                                      columns=numeric_cols,
                                      index=X_test.index  # Retain original index
                                      )

# Extract the non-numeric columns from the training set without resetting the index
X_train_non_numeric = X_train[non_numeric_cols]

# Extract the non-numeric columns from the test set without resetting the index
X_test_non_numeric = X_test[non_numeric_cols]

# Concatenate the scaled numeric columns with the non-numeric columns for the training set
X_train_final = pd.concat([X_train_scaled_numeric, X_train_non_numeric], axis=1)

# Concatenate the scaled numeric columns with the non-numeric columns for the test set
X_test_final = pd.concat([X_test_scaled_numeric, X_test_non_numeric], axis=1)

# Preserve the original column order
X_train_scaled = X_train_final[X_train.columns]
X_test_scaled  = X_test_final[X_test.columns]


# Initialize and fit the logistic regression model
logreg = LogisticRegression(max_iter=1000,random_state=1)
logreg.fit(X_train_scaled, y_train)

LogisticRegression(max_iter=1000, random_state=1)

LogisticRegression(max_iter=1000, random_state=1)


# Evaluate the Model on the Training Data
y_train_pred = logreg.predict(X_train_scaled)

# Evaluate performance on training data
metrics_score(y_train, y_train_pred)

              precision    recall  f1-score   support

           0       0.86      0.90      0.88      2043
           1       0.74      0.66      0.70       902

    accuracy                           0.82      2945
   macro avg       0.80      0.78      0.79      2945
weighted avg       0.82      0.82      0.82      2945


# Evaluate the Model on the Training Data
y_test_pred = logreg.predict(X_test_scaled)

# Evaluate performance on training data
metrics_score(y_test, y_test_pred)

              precision    recall  f1-score   support

           0       0.86      0.89      0.88       896
           1       0.72      0.66      0.68       367

    accuracy                           0.82      1263
   macro avg       0.79      0.77      0.78      1263
weighted avg       0.82      0.82      0.82      1263


# Apply PCA to the entire feature set after scaling (including numeric + other features)
pca = PCA(n_components=0.95)  # Retain 95% of the variance
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Print the number of components chosen by PCA
print(f"Number of components retained: {X_train_pca.shape[1]}")

Number of components retained: 12


# Initialize and fit the decision tree on the PCA-transformed training data
dt_model = DecisionTreeClassifier(random_state=1)
dt_model.fit(X_train_pca, y_train)

DecisionTreeClassifier(random_state=1)

DecisionTreeClassifier(random_state=1)


# Generate predictions on the training data
y_train_pred_pca = dt_model.predict(X_train_pca)

# Evaluate the model's performance on the training data
train_accuracy_pca = accuracy_score(y_train, y_train_pred_pca)
metrics_score(y_train, y_train_pred_pca)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2043
           1       1.00      1.00      1.00       902

    accuracy                           1.00      2945
   macro avg       1.00      1.00      1.00      2945
weighted avg       1.00      1.00      1.00      2945


# Generate predictions on the test data
y_test_pred_pca = dt_model.predict(X_test_pca)

# Evaluate the model's performance on the test data
test_accuracy_pca = accuracy_score(y_test, y_test_pred_pca)
metrics_score(y_test, y_test_pred_pca)

              precision    recall  f1-score   support

           0       0.85      0.83      0.84       896
           1       0.60      0.64      0.62       367

    accuracy                           0.77      1263
   macro avg       0.72      0.73      0.73      1263
weighted avg       0.78      0.77      0.77      1263


# Fitting the decision tree classifier on the training data
d_tree =  DecisionTreeClassifier(random_state=1)
d_tree.fit(X_train,y_train)

DecisionTreeClassifier(random_state=1)

DecisionTreeClassifier(random_state=1)


# Checking performance on the training data
y_pred_train = d_tree.predict(X_train)

# Instantiate the metrics_score function
metrics_score(y_train, y_pred_train)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2043
           1       1.00      1.00      1.00       902

    accuracy                           1.00      2945
   macro avg       1.00      1.00      1.00      2945
weighted avg       1.00      1.00      1.00      2945


# Checking performance on the test data
y_pred_test = d_tree.predict(X_test)

# Instantiate the metrics_score function
metrics_score(y_test, y_pred_test)

              precision    recall  f1-score   support

           0       0.86      0.85      0.86       896
           1       0.65      0.67      0.66       367

    accuracy                           0.80      1263
   macro avg       0.76      0.76      0.76      1263
weighted avg       0.80      0.80      0.80      1263


# Plot the feature importance

importances = d_tree.feature_importances_  # Use d_tree
columns = X.columns  # Use columns from the DataFrame used with d_tree
importance_df = pd.DataFrame(importances, index=columns, columns=['Importance']).sort_values(by='Importance', ascending=False)
print(importance_df)

# Plot
plt.figure(figsize=(13, 13))
sns.barplot(data=importance_df, x='Importance', y=importance_df.index, palette='tab10')
plt.title('Feature Importance in Decision Tree')
plt.show()

                        Importance
time_spent_on_website     0.286512
first_interaction_0       0.154206
profile_completed_0       0.134657
page_views_per_visit      0.095233
age                       0.091493
current_occupation_0      0.066631
last_activity_1           0.047065
website_visits            0.039448
last_activity_2           0.024113
digital_media_0           0.008002
current_occupation_2      0.007526
last_activity_0           0.005211
referral_0                0.005136
educational_channels_0    0.004923
profile_completed_1       0.004662
print_media_type1_0       0.004083
profile_completed_2       0.003841
digital_media_1           0.003714
print_media_type2_0       0.003433
educational_channels_1    0.003226
print_media_type1_1       0.002581
print_media_type2_1       0.002551
current_occupation_1      0.001505
referral_1                0.000248
first_interaction_1       0.000000


# Set the size of the plot
plt.figure(figsize=(20, 20))

# Plot the tree using plot_tree from Scikit-learn
tree.plot_tree(
              d_tree,
              feature_names=X.columns,  # Column names for feature names
              class_names=True,         # Display class names
              filled=True,              # Color nodes by class
              rounded=True,             # Rounded boxes for nodes
              proportion=False,         # Not scaled to the proportion of samples at each node
              fontsize=8,               # Font size for labels
              max_depth = 3             # Limit the depth of the tree to 3 levels
              )

# Display the plot
plt.show()


# Determine the optimum depth to prune

# Define the range of depths to test
depth_range = range(1, 21)  # Test depths from 1 to 20
cv_scores = []  # List to store cross-validation scores

# Loop over the depths and perform cross-validation
for depth in depth_range:
    # Create a decision tree with the current max_depth
    pruned_tree = DecisionTreeClassifier(random_state=1, max_depth=depth)

    # Perform cross-validation (5-fold cross-validation in this case)
    scores = cross_val_score(pruned_tree, X_train, y_train, cv=5, scoring='accuracy')  # You can change scoring to 'f1', 'precision', 'recall', etc.

    # Append the mean score for this depth
    cv_scores.append(np.mean(scores))

# Find the depth with the best cross-validation score
optimal_depth = depth_range[np.argmax(cv_scores)]
print(f"The optimal max_depth is: {optimal_depth}")

# Plot the results to visualize
plt.figure(figsize=(10, 6))
plt.plot(depth_range, cv_scores, marker='o')
plt.xlabel('Max Depth')
plt.ylabel('Cross-Validation Accuracy')
plt.title('Cross-Validation Accuracy vs Max Depth')
plt.show()

The optimal max_depth is: 6


# Build the model
pruned_tree_depth_max = DecisionTreeClassifier(random_state=1, max_depth=optimal_depth)
pruned_tree_depth_max.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=6, random_state=1)

DecisionTreeClassifier(max_depth=6, random_state=1)


# Train the final decision tree with the optimal depth
y_pred_train_depth_max = pruned_tree_depth_max.predict(X_train)

# Instantiate the metrics_score function
metrics_score(y_train, y_pred_train_depth_max)

              precision    recall  f1-score   support

           0       0.90      0.95      0.92      2043
           1       0.87      0.75      0.80       902

    accuracy                           0.89      2945
   macro avg       0.88      0.85      0.86      2945
weighted avg       0.89      0.89      0.89      2945


# Checking performance on the training data
y_pred_test_depth_max = pruned_tree_depth_max.predict(X_test)

# Instantiate the metrics_score function
metrics_score(y_test, y_pred_test_depth_max)

              precision    recall  f1-score   support

           0       0.86      0.92      0.89       896
           1       0.77      0.63      0.69       367

    accuracy                           0.84      1263
   macro avg       0.81      0.78      0.79      1263
weighted avg       0.83      0.84      0.83      1263


# Create a decision tree with pruning by limiting the depth
pruned_tree_depth_5 = DecisionTreeClassifier(random_state=1, max_depth=5)
pruned_tree_depth_5.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=5, random_state=1)

DecisionTreeClassifier(max_depth=5, random_state=1)


# Checking performance on the training data
y_pred_train_pruned_depth_5 = pruned_tree_depth_5.predict(X_train)
metrics_score(y_train, y_pred_train_pruned_depth_5)

              precision    recall  f1-score   support

           0       0.89      0.94      0.92      2043
           1       0.84      0.75      0.79       902

    accuracy                           0.88      2945
   macro avg       0.87      0.84      0.85      2945
weighted avg       0.88      0.88      0.88      2945


# Checking performance on the training data
y_pred_test_pruned_depth_5 = pruned_tree_depth_5.predict(X_test)
metrics_score(y_test, y_pred_test_pruned_depth_5)

              precision    recall  f1-score   support

           0       0.89      0.92      0.90       896
           1       0.79      0.71      0.75       367

    accuracy                           0.86      1263
   macro avg       0.84      0.82      0.83      1263
weighted avg       0.86      0.86      0.86      1263


# Create a decision tree with pruning by limiting the depth
pruned_tree_depth_7 = DecisionTreeClassifier(random_state=1, max_depth=7)
pruned_tree_depth_7.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=7, random_state=1)

DecisionTreeClassifier(max_depth=7, random_state=1)


# Check the performance after pruning
y_pred_train_pruned_depth_7 = pruned_tree_depth_7.predict(X_train)
metrics_score(y_train, y_pred_train_pruned_depth_7)

              precision    recall  f1-score   support

           0       0.92      0.95      0.93      2043
           1       0.87      0.81      0.84       902

    accuracy                           0.90      2945
   macro avg       0.89      0.88      0.89      2945
weighted avg       0.90      0.90      0.90      2945


# Check the performance after pruning
y_pred_test_pruned_depth_7 = pruned_tree_depth_7.predict(X_test)
metrics_score(y_test, y_pred_test_pruned_depth_7)

              precision    recall  f1-score   support

           0       0.87      0.90      0.89       896
           1       0.74      0.68      0.71       367

    accuracy                           0.84      1263
   macro avg       0.81      0.79      0.80      1263
weighted avg       0.83      0.84      0.84      1263


# Create a decision tree with pruning by limiting minimum samples per leaf
pruned_tree_leaf = DecisionTreeClassifier(random_state=1, min_samples_leaf=5)  # Set min_samples_leaf to 5 as an example
pruned_tree_leaf.fit(X_train, y_train)

DecisionTreeClassifier(min_samples_leaf=5, random_state=1)

DecisionTreeClassifier(min_samples_leaf=5, random_state=1)


# Check the performance after pruning
y_pred_train_pruned_leaf = pruned_tree_leaf.predict(X_train)
metrics_score(y_train, y_pred_train_pruned_leaf)

              precision    recall  f1-score   support

           0       0.94      0.95      0.95      2043
           1       0.89      0.86      0.88       902

    accuracy                           0.93      2945
   macro avg       0.92      0.91      0.91      2945
weighted avg       0.92      0.93      0.92      2945


# Check the performance after pruning
y_pred_test_pruned_leaf = pruned_tree_leaf.predict(X_test)
metrics_score(y_test, y_pred_test_pruned_leaf)

              precision    recall  f1-score   support

           0       0.86      0.86      0.86       896
           1       0.66      0.67      0.66       367

    accuracy                           0.80      1263
   macro avg       0.76      0.76      0.76      1263
weighted avg       0.80      0.80      0.80      1263


# Import necessary libraries
from Scikit-learn.model_selection import cross_val_score
import numpy as np

# Define the range of values for min_samples_split to test
split_range = range(2, 51, 2)  # Testing even values from 2 to 50
cv_scores_split = []  # List to store cross-validation scores

# Loop over the different min_samples_split values
for split in split_range:
    # Create a decision tree with the current min_samples_split value
    pruned_tree_split = DecisionTreeClassifier(random_state=1, min_samples_split=split)

    # Perform cross-validation (5-fold cross-validation in this case)
    scores = cross_val_score(pruned_tree_split, X_train, y_train, cv=5, scoring='accuracy')

    # Append the mean score for this split value
    cv_scores_split.append(np.mean(scores))

# Find the min_samples_split value with the best cross-validation score
optimal_split = split_range[np.argmax(cv_scores_split)]
print(f"The optimal min_samples_split is: {optimal_split}")

# Plot the results to visualize
plt.figure(figsize=(10, 6))
plt.plot(split_range, cv_scores_split, marker='o')
plt.xlabel('Min Samples Split')
plt.ylabel('Cross-Validation Accuracy')
plt.title('Cross-Validation Accuracy vs Min Samples Split')
plt.show()

The optimal min_samples_split is: 48


# Create a decision tree with pruning by limiting the minimum number of samples required to split a node
pruned_tree_split = DecisionTreeClassifier(random_state=1, min_samples_split=48)
pruned_tree_split.fit(X_train, y_train)

DecisionTreeClassifier(min_samples_split=48, random_state=1)

DecisionTreeClassifier(min_samples_split=48, random_state=1)


# Check the performance after pruning
y_pred_train_pruned_split = pruned_tree_split.predict(X_train)
metrics_score(y_train, y_pred_train_pruned_split)

              precision    recall  f1-score   support

           0       0.91      0.94      0.92      2043
           1       0.85      0.79      0.82       902

    accuracy                           0.89      2945
   macro avg       0.88      0.86      0.87      2945
weighted avg       0.89      0.89      0.89      2945


# Check the performance after pruning
y_pred_test_pruned_split = pruned_tree_split.predict(X_test)
metrics_score(y_test, y_pred_test_pruned_split)

              precision    recall  f1-score   support

           0       0.87      0.90      0.88       896
           1       0.73      0.67      0.70       367

    accuracy                           0.83      1263
   macro avg       0.80      0.79      0.79      1263
weighted avg       0.83      0.83      0.83      1263


# Create a results table for pruned decision trees

# List of pruning methods and their corresponding predictions
pruning_methods = [
    ("Max Depth (Optimal)", y_pred_test_depth_max),
    ("Max Depth (5)", y_pred_test_pruned_depth_5),
    ("Max Depth (7)", y_pred_test_pruned_depth_7),
    ("Min Samples Leaf (5)", y_pred_test_pruned_leaf),
    ("Min Samples Split (48)", y_pred_test_pruned_split)
]

# Initialize a list to store the results
results = []

# Loop through each pruning method and calculate the metrics
for method_name, y_pred in pruning_methods:
    accuracy = round(accuracy_score(y_test, y_pred), 2)
    precision_0 = round(precision_score(y_test, y_pred, pos_label=0), 2)
    precision_1 = round(precision_score(y_test, y_pred, pos_label=1), 2)
    recall_0 = round(recall_score(y_test, y_pred, pos_label=0), 2)
    recall_1 = round(recall_score(y_test, y_pred, pos_label=1), 2)
    f1_0 = round(f1_score(y_test, y_pred, pos_label=0), 2)
    f1_1 = round(f1_score(y_test, y_pred, pos_label=1), 2)

    # Append the results to the list
    results.append([method_name, accuracy, precision_0, precision_1, recall_0, recall_1, f1_0, f1_1])

# Create a DataFrame for better visualization
pruned_results = pd.DataFrame(results, columns=[
    "Pruning Method",
    "Accuracy",
    "Precision Status 0",
    "Precision Status 1",
    "Recall Status 0",
    "Recall Status 1",
    "F1-Score Status 0",
    "F1-Score Status 1"
])

# Display the DataFrame
pruned_results


# Check the distribution of the target variable
class_distribution = y_train.value_counts(normalize=True)

# Display the class distribution in percentage
class_distribution_percentage = class_distribution * 100
class_distribution_percentage


# Alternatively, we can use visualization to check the imbalance:
import matplotlib.pyplot as plt

# Plot the class distribution
class_distribution.plot(kind='bar', color=['skyblue', 'orange'])
plt.title('Status Distribution in y_train')
plt.xlabel('Status')
plt.ylabel('Frequency')
plt.show()


# Calculate class weights manually
classes = np.unique(y_train)
class_weights = compute_class_weight('balanced', classes=classes, y=y_train)
class_weight_dict = dict(zip(classes, class_weights))
print(class_weight_dict)

{0: 0.7207537934410181, 1: 1.6324833702882484}


# Use the computed class weights in the decision tree
tuned_tree_tuned_rand = DecisionTreeClassifier(class_weight=class_weight_dict)

# Fit the model
tuned_tree_tuned_rand.fit(X_train, y_train)

DecisionTreeClassifier(class_weight={0: 0.7207537934410181,
                                     1: 1.6324833702882484})

DecisionTreeClassifier(class_weight={0: 0.7207537934410181,
                                     1: 1.6324833702882484})


# Predict on the test set
y_pred_class_weight = tuned_tree_tuned_rand.predict(X_train)
metrics_score(y_train, y_pred_class_weight)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2043
           1       1.00      1.00      1.00       902

    accuracy                           1.00      2945
   macro avg       1.00      1.00      1.00      2945
weighted avg       1.00      1.00      1.00      2945


# Predict on the test set
y_pred_class_weight = tuned_tree_tuned_rand.predict(X_test)
metrics_score(y_test, y_pred_class_weight)

              precision    recall  f1-score   support

           0       0.87      0.86      0.86       896
           1       0.66      0.68      0.67       367

    accuracy                           0.81      1263
   macro avg       0.76      0.77      0.77      1263
weighted avg       0.81      0.81      0.81      1263


# Set up the decision tree with class weights
dt_model = DecisionTreeClassifier()

# Define a grid of class weights to search through
param_grid = {'class_weight': [{0: 1, 1: 2}, {0: 1, 1: 3}, {0: 1, 1: 5}]}

# Perform a grid search to find the best class weights
grid_search = GridSearchCV(dt_model, param_grid, cv=5, scoring='f1_weighted')
grid_search.fit(X_train, y_train)

# Output the best parameters (weights) found
print("Best class weights:", grid_search.best_params_)

Best class weights: {'class_weight': {0: 1, 1: 2}}


# Set up the decision tree with class weights
dt_model = DecisionTreeClassifier()

# Define a grid of class weights to search through
param_grid = {'class_weight': [{0: 1, 1: 2}, {0: 1, 1: 3}, {0: 1, 1: 5}]}

# Perform a grid search to find the best class weights
grid_search_class_weight = GridSearchCV(dt_model, param_grid, cv=5, scoring='f1_weighted', return_train_score=True)
grid_search_class_weight.fit(X_train, y_train)

# Iterate over the results and print evaluation metrics
for i, params in enumerate(grid_search.cv_results_['params']):
    print(f"\nEvaluation for class weights: {params}")

    # Refit the model on the training set with the current parameters
    dt_model_class_weight = DecisionTreeClassifier(class_weight=params['class_weight'])
    dt_model_class_weight.fit(X_train, y_train)

    # Predict on the test set
    y_pred__class_weight = dt_model_class_weight.predict(X_test)

    # Calculate accuracy, precision, recall, and F1-score
    accuracy = accuracy_score(y_test, y_pred__class_weight)
    report = classification_report(y_test, y_pred__class_weight, target_names=['Class 0', 'Class 1'])

    # Output metrics for each set of weights
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Classification Report:\n{report}")

# Output the best parameters (weights) found by GridSearchCV
print("\nBest class weights found by GridSearchCV:", grid_search.best_params_)

Evaluation for class weights: {'class_weight': {0: 1, 1: 2}}
Accuracy: 0.8124
Classification Report:
              precision    recall  f1-score   support

     Class 0       0.87      0.87      0.87       896
     Class 1       0.68      0.68      0.68       367

    accuracy                           0.81      1263
   macro avg       0.77      0.77      0.77      1263
weighted avg       0.81      0.81      0.81      1263


Evaluation for class weights: {'class_weight': {0: 1, 1: 3}}
Accuracy: 0.7965
Classification Report:
              precision    recall  f1-score   support

     Class 0       0.86      0.85      0.86       896
     Class 1       0.65      0.66      0.65       367

    accuracy                           0.80      1263
   macro avg       0.75      0.76      0.75      1263
weighted avg       0.80      0.80      0.80      1263


Evaluation for class weights: {'class_weight': {0: 1, 1: 5}}
Accuracy: 0.8052
Classification Report:
              precision    recall  f1-score   support

     Class 0       0.87      0.85      0.86       896
     Class 1       0.65      0.70      0.68       367

    accuracy                           0.81      1263
   macro avg       0.76      0.77      0.77      1263
weighted avg       0.81      0.81      0.81      1263


Best class weights found by GridSearchCV: {'class_weight': {0: 1, 1: 2}}


# Initialize the Random Forest Classifier
rf_model = RandomForestClassifier(random_state=1)

# Train the model
rf_model.fit(X_train, y_train)

RandomForestClassifier(random_state=1)

RandomForestClassifier(random_state=1)


# Predict on the test set
y_pred_rf = rf_model.predict(X_train)
metrics_score(y_train, y_pred_rf)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2043
           1       1.00      1.00      1.00       902

    accuracy                           1.00      2945
   macro avg       1.00      1.00      1.00      2945
weighted avg       1.00      1.00      1.00      2945


# Predict on the test set
y_pred_rf = rf_model.predict(X_test)
metrics_score(y_test, y_pred_rf)

              precision    recall  f1-score   support

           0       0.88      0.90      0.89       896
           1       0.73      0.70      0.72       367

    accuracy                           0.84      1263
   macro avg       0.81      0.80      0.80      1263
weighted avg       0.84      0.84      0.84      1263


# Define the model
rf_model = RandomForestClassifier(random_state=1)

# Define a grid of hyperparameters for "pruning"
param_grid = {
    'max_depth': [4, 6, 8, 10],              # Pruning by limiting depth
    'min_samples_split': [2, 5, 10],         # Minimum samples to split
    'min_samples_leaf': [1, 2, 5],           # Minimum samples in each leaf
    'max_features': ['sqrt', 'log2', None],  # Number of features to consider for splitting
    'n_estimators': [50, 100, 200],          # Number of trees in the forest
}

# Perform a grid search to find the best hyperparameters
grid_search = GridSearchCV(rf_model, param_grid, cv=5, scoring='f1_weighted', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Output the best parameters (pruning configuration) found
print("Best parameters:", grid_search.best_params_)

Best parameters: {'max_depth': 6, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}


# Build the model
best_pruned_rf_model = grid_search.best_estimator_
best_pruned_rf_model.fit(X_train, y_train)

RandomForestClassifier(max_depth=6, max_features=None, min_samples_split=5,
                       n_estimators=200, random_state=1)

RandomForestClassifier(max_depth=6, max_features=None, min_samples_split=5,
                       n_estimators=200, random_state=1)


# Train and evaluate the pruned random forest model
y_pred = best_pruned_rf_model.predict(X_train)
metrics_score(y_train, y_pred)

              precision    recall  f1-score   support

           0       0.91      0.94      0.92      2043
           1       0.85      0.79      0.82       902

    accuracy                           0.89      2945
   macro avg       0.88      0.86      0.87      2945
weighted avg       0.89      0.89      0.89      2945


# Test and evaluate the pruned random forest model
y_pred = best_pruned_rf_model.predict(X_test)
metrics_score(y_test, y_pred)

              precision    recall  f1-score   support

           0       0.89      0.91      0.90       896
           1       0.78      0.74      0.76       367

    accuracy                           0.86      1263
   macro avg       0.83      0.82      0.83      1263
weighted avg       0.86      0.86      0.86      1263


# Initialize the Random Forest model
rf_model = RandomForestClassifier(random_state=1)

# Define the hyperparameter grid
param_grid = {
    'n_estimators': [100, 200],  # Number of trees
    'max_depth': [4, 6, 8, 10],  # Limiting depth (pruning)
    'min_samples_split': [2, 5, 10],  # Control for tree growth
    'min_samples_leaf': [1, 2, 4],  # Minimum number of samples in a leaf node
    'max_features': ['sqrt', 'log2']  # Features to consider when splitting
}

# Set up GridSearchCV for hypertuning
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, scoring='accuracy', verbose=2, n_jobs=-1)

# Fit the model
grid_search.fit(X_train, y_train)

# Best hyperparameters
print("Best parameters:", grid_search.best_params_)

Fitting 5 folds for each of 144 candidates, totalling 720 fits
Best parameters: {'max_depth': 8, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}


# Train and evaluate the pruned random forest model
y_pred = grid_search.best_estimator_.predict(X_train)
metrics_score(y_train, y_pred)

              precision    recall  f1-score   support

           0       0.91      0.95      0.93      2043
           1       0.86      0.79      0.82       902

    accuracy                           0.90      2945
   macro avg       0.89      0.87      0.88      2945
weighted avg       0.90      0.90      0.90      2945


# Test and evaluate the pruned random forest model
y_pred = grid_search.best_estimator_.predict(X_test)
metrics_score(y_test, y_pred)

              precision    recall  f1-score   support

           0       0.88      0.92      0.90       896
           1       0.77      0.70      0.73       367

    accuracy                           0.85      1263
   macro avg       0.83      0.81      0.82      1263
weighted avg       0.85      0.85      0.85      1263


# Train a Random Forest model with class weight adjustment
rf_class_weight = RandomForestClassifier(class_weight='balanced', random_state=1, n_estimators=100)
rf_class_weight.fit(X_train, y_train)

# Predict on the test set
y_pred_class_weight = rf_class_weight.predict(X_test)

# Generate the classification report
class_weight_report = classification_report(y_test, y_pred_class_weight, output_dict=True)
class_weight_report_df = pd.DataFrame(class_weight_report).transpose()

# Display the classification report
print(class_weight_report_df)

              precision    recall  f1-score      support
0              0.880694  0.906250  0.893289   896.000000
1              0.753666  0.700272  0.725989   367.000000
accuracy       0.846397  0.846397  0.846397     0.846397
macro avg      0.817180  0.803261  0.809639  1263.000000
weighted avg   0.843782  0.846397  0.844675  1263.000000


pip install imblearn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Requirement already satisfied: imbalanced-learn in /usr/local/lib/python3.10/dist-packages (from imblearn) (0.12.3)
Requirement already satisfied: numpy>=1.17.3 in /usr/local/lib/python3.10/dist-packages (from imbalanced-learn->imblearn) (1.26.4)
Requirement already satisfied: scipy>=1.5.0 in /usr/local/lib/python3.10/dist-packages (from imbalanced-learn->imblearn) (1.13.1)
Requirement already satisfied: scikit-learn>=1.0.2 in /usr/local/lib/python3.10/dist-packages (from imbalanced-learn->imblearn) (1.5.2)
Requirement already satisfied: joblib>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from imbalanced-learn->imblearn) (1.4.2)
Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from imbalanced-learn->imblearn) (3.5.0)
Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Installing collected packages: imblearn
Successfully installed imblearn-0.0


from imblearn.over_sampling import SMOTE

# Apply SMOTE to the training data to oversample the minority class
smote = SMOTE(random_state=1)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Train the Random Forest model on SMOTE-resampled data
rf_smote = RandomForestClassifier(random_state=1, n_estimators=100)
rf_smote.fit(X_train_smote, y_train_smote)

# Predict on the test set
y_pred_smote = rf_smote.predict(X_test)

# Generate the classification report
smote_report = classification_report(y_test, y_pred_smote, output_dict=True)
smote_report_df = pd.DataFrame(smote_report).transpose()

# Display the SMOTE classification report
print(smote_report_df)

              precision    recall  f1-score      support
0              0.888765  0.891741  0.890251   896.000000
1              0.733516  0.727520  0.730506   367.000000
accuracy       0.844022  0.844022  0.844022     0.844022
macro avg      0.811141  0.809631  0.810378  1263.000000
weighted avg   0.843653  0.844022  0.843832  1263.000000


from imblearn.under_sampling import RandomUnderSampler

# Apply undersampling to the training data
undersample = RandomUnderSampler(random_state=1)
X_train_under, y_train_under = undersample.fit_resample(X_train, y_train)

# Train the Random Forest model on the undersampled data
rf_undersample = RandomForestClassifier(random_state=1, n_estimators=100)
rf_undersample.fit(X_train_under, y_train_under)

# Predict on the test set
y_pred_undersample = rf_undersample.predict(X_test)

# Generate the classification report
undersample_report = classification_report(y_test, y_pred_undersample, output_dict=True)
undersample_report_df = pd.DataFrame(undersample_report).transpose()

# Display the classification report
print(undersample_report_df)

              precision    recall  f1-score      support
0              0.923267  0.832589  0.875587   896.000000
1              0.670330  0.831063  0.742092   367.000000
accuracy       0.832146  0.832146  0.832146     0.832146
macro avg      0.796798  0.831826  0.808840  1263.000000
weighted avg   0.849769  0.832146  0.836796  1263.000000


from imblearn.combine import SMOTEENN

# Apply SMOTE + ENN (hybrid oversampling and undersampling) to the training data
smote_enn = SMOTEENN(random_state=1)
X_train_hybrid, y_train_hybrid = smote_enn.fit_resample(X_train, y_train)

# Train the Random Forest model on the hybrid resampled data
rf_hybrid = RandomForestClassifier(random_state=1, n_estimators=100)
rf_hybrid.fit(X_train_hybrid, y_train_hybrid)

# Predict on the test set
y_pred_hybrid = rf_hybrid.predict(X_test)

# Generate the classification report
hybrid_report = classification_report(y_test, y_pred_hybrid, output_dict=True)
hybrid_report_df = pd.DataFrame(hybrid_report).transpose()

# Display the classification report
print(hybrid_report_df)

              precision    recall  f1-score     support
0              0.923913  0.758929  0.833333   896.00000
1              0.590133  0.847411  0.695749   367.00000
accuracy       0.784640  0.784640  0.784640     0.78464
macro avg      0.757023  0.803170  0.764541  1263.00000
weighted avg   0.826924  0.784640  0.793354  1263.00000


# Generate a table for easy comparion of all methods used in analaysis

# Define the columns for the table
columns = ['Model', 'Dataset', 'Accuracy', 'Precision Class 0', 'Precision Class 1',
           'Recall Class 0', 'Recall Class 1', 'F1-Score Class 0', 'F1-Score Class 1']

# Data for each model (Train and Test results for each model)
data = [
    ['Logistic Regression', 'Train', 0.82, 0.86, 0.74, 0.90, 0.66, 0.88, 0.70],
    ['Logistic Regression', 'Test', 0.82, 0.86, 0.72, 0.89, 0.66, 0.88, 0.68],

    ['PCA', 'Train', 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00],
    ['PCA', 'Test', 0.77, 0.85, 0.60, 0.83, 0.64, 0.84, 0.62],

    ['Decision Tree', 'Train', 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00],
    ['Decision Tree', 'Test', 0.80, 0.86, 0.65, 0.85, 0.67, 0.86, 0.66],

    ['Max Depth (Pruning)', 'Train', 0.89, 0.90, 0.87, 0.95, 0.75, 0.92, 0.80],
    ['Max Depth (Pruning)', 'Test', 0.84, 0.86, 0.77, 0.92, 0.63, 0.89, 0.69],

    ['Limited Depth (Depth=5)', 'Train', 0.88, 0.89, 0.84, 0.94, 0.75, 0.92, 0.79],
    ['Limited Depth (Depth=5)', 'Test', 0.86, 0.89, 0.79, 0.92, 0.71, 0.90, 0.75],

    ['Limited Depth (Depth=7)', 'Train', 0.90, 0.92, 0.87, 0.95, 0.81, 0.93, 0.84],
    ['Limited Depth (Depth=7)', 'Test', 0.84, 0.87, 0.74, 0.90, 0.68, 0.89, 0.71],

    ['Min Samples per Leaf', 'Train', 0.93, 0.94, 0.89, 0.95, 0.86, 0.95, 0.88],
    ['Min Samples per Leaf', 'Test', 0.80, 0.86, 0.66, 0.86, 0.67, 0.86, 0.66],

    ['Min Samples to Split', 'Train', 0.89, 0.91, 0.85, 0.94, 0.79, 0.92, 0.82],
    ['Min Samples to Split', 'Test', 0.83, 0.87, 0.73, 0.90, 0.67, 0.88, 0.70],

    ['Random Forest', 'Train', 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00],
    ['Random Forest', 'Test', 0.84, 0.88, 0.73, 0.90, 0.70, 0.89, 0.72],

    ['Pruned Random Forest', 'Train', 0.90, 0.91, 0.86, 0.95, 0.79, 0.93, 0.82],
    ['Pruned Random Forest', 'Test', 0.85, 0.88, 0.77, 0.92, 0.70, 0.90, 0.73],

    ['Random Forest (Tuned)', 'Train', 0.90, 0.91, 0.86, 0.95, 0.79, 0.93, 0.82],
    ['Random Forest (Tuned)', 'Test', 0.85, 0.88, 0.77, 0.92, 0.70, 0.90, 0.73]
]

# Create a DataFrame
model_performance_df = pd.DataFrame(data, columns=columns)

# Display the table
model_performance_df


import pandas as pd

# Define the columns for the table
columns = ['Method', 'Accuracy', 'Precision Class 0', 'Precision Class 1',
           'Recall Class 0', 'Recall Class 1', 'F1-Score Class 0', 'F1-Score Class 1']

# Data for each method
data = [
    ['Class Weighting', 0.8464, 0.8807, 0.7537, 0.9063, 0.7003, 0.8933, 0.7260],
    ['SMOTE', 0.8440, 0.8888, 0.7335, 0.8917, 0.7275, 0.8903, 0.7305],
    ['Undersampling', 0.8321, 0.9233, 0.6703, 0.8326, 0.8311, 0.8756, 0.7421],
    ['Hybrid (SMOTE + Undersampling)', 0.7846, 0.9239, 0.5901, 0.7589, 0.8474, 0.8333, 0.6957]
]

# Create a DataFrame
imbalance_handling_df = pd.DataFrame(data, columns=columns)

# Display the table
imbalance_handling_df


from Scikit-learn.preprocessing import PolynomialFeatures

# Generate interaction terms for existing features
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Create a DataFrame with the new features
# Use get_feature_names_out() instead of get_feature_names()
X_train_poly_df = pd.DataFrame(X_train_poly, columns=poly.get_feature_names_out(X_train.columns))
X_test_poly_df = pd.DataFrame(X_test_poly, columns=poly.get_feature_names_out(X_test.columns))

# Display the new features
X_train_poly_df.head()


# Defining models to be tested
models = {
    'RandomForest': RandomForestClassifier(random_state=1)
}

# Simplified hyperparameter search space for each model
param_grids = {
    'RandomForest': {
        'n_estimators': [100, 200],
        'max_depth': [5, 10],
        'min_samples_split': [2, 5],
    },
}

# Running RandomizedSearchCV for the model
for model_name, model in models.items():
    param_grid = param_grids[model_name]
    randomized_search = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=5, scoring='f1', cv=5, random_state=1)
    randomized_search.fit(X_train_poly_df, y_train)

# Train the model
test_model = randomized_search.best_estimator_
test_model.fit(X_train_poly_df, y_train)

# Test the model
y_pred_train = test_model.predict(X_test_poly_df)

# Evaluate the model on the test set
y_pred_test = test_model.predict(X_test_poly_df)
metrics_score(y_test, y_pred_test)

              precision    recall  f1-score   support

           0       0.88      0.92      0.90       896
           1       0.78      0.69      0.73       367

    accuracy                           0.85      1263
   macro avg       0.83      0.81      0.82      1263
weighted avg       0.85      0.85      0.85      1263


# Defining models to be tested
models = {
    'GradientBoosting': GradientBoostingClassifier(random_state=1)
}

# Simplified hyperparameter search space for each model
param_grids = {
    'GradientBoosting': {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.1],
        'max_depth': [3, 5],
    }
}

# Running RandomizedSearchCV for the model
for model_name, model in models.items():
    param_grid = param_grids[model_name]
    randomized_search = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=5, scoring='f1', cv=5, random_state=1)
    randomized_search.fit(X_train_poly_df, y_train)

# Train the model
test_model = randomized_search.best_estimator_
test_model.fit(X_train_poly_df, y_train)

# Test the model
y_pred_train = test_model.predict(X_test_poly_df)

# Evaluate the model on the test set
y_pred_test = test_model.predict(X_test_poly_df)
metrics_score(y_test, y_pred_test)

              precision    recall  f1-score   support

           0       0.88      0.92      0.90       896
           1       0.77      0.68      0.72       367

    accuracy                           0.85      1263
   macro avg       0.82      0.80      0.81      1263
weighted avg       0.84      0.85      0.85      1263


# Calculate the class weights to balance the classes
class_weights = compute_class_weight(class_weight='balanced',
                                     classes=np.unique(y_train),
                                     y=y_train)

# Create a dictionary for class weights (0 for non-converted, 1 for converted)
weight_dict = {0: class_weights[0], 1: class_weights[1]}
print(f"Class Weights: {weight_dict}")

# Set the scale_pos_weight for XGBoost
scale_pos_weight = weight_dict[1] / weight_dict[0]

# Initialize the XGBoost classifier with the calculated scale_pos_weight
xgb_model = XGBClassifier(random_state=1, scale_pos_weight=scale_pos_weight)

# Train the XGBoost model
xgb_model.fit(X_train_poly_df, y_train)

# Make predictions on the test set
y_pred_test = xgb_model.predict(X_test_poly_df)

metrics_score(y_test, y_pred_test)

Class Weights: {0: 0.7207537934410181, 1: 1.6324833702882484}
              precision    recall  f1-score   support

           0       0.89      0.86      0.88       896
           1       0.69      0.75      0.72       367

    accuracy                           0.83      1263
   macro avg       0.79      0.80      0.80      1263
weighted avg       0.83      0.83      0.83      1263


from Scikit-learn.metrics import classification_report, roc_curve

# Initialize and train the XGBoost model (without class weights for simplicity here)
xgb_model = XGBClassifier(random_state=1)
xgb_model.fit(X_train_poly_df, y_train)

# Predict probabilities instead of labels
y_pred_proba = xgb_model.predict_proba(X_test_poly_df)[:, 1]  # We only need probabilities for class 1

# Tune the threshold - Here, we set the threshold lower than 0.5 to improve recall
threshold = 0.3   # Lowering the threshold to improve recall
y_pred_custom_threshold = (y_pred_proba >= threshold).astype(int)

# Evaluate the model's performance at the new threshold
metrics_score(y_test, y_pred_custom_threshold)

# Optionally, plot the ROC curve to analyze thresholds
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.show()

              precision    recall  f1-score   support

           0       0.91      0.85      0.88       896
           1       0.68      0.80      0.73       367

    accuracy                           0.83      1263
   macro avg       0.80      0.82      0.81      1263
weighted avg       0.84      0.83      0.84      1263


from imblearn.under_sampling import RandomUnderSampler

# Apply Random Undersampling to balance the classes
undersampler = RandomUnderSampler(random_state=1)
X_train_under, y_train_under = undersampler.fit_resample(X_train_poly_df, y_train)

# Train the XGBoost model on the undersampled data
xgb_model = XGBClassifier(random_state=1)
xgb_model.fit(X_train_under, y_train_under)

# Predict on the test set (Note: Test set is not undersampled, as we want to evaluate performance on the original data)
y_pred_test = xgb_model.predict(X_test_poly_df)

# Evaluate the model's performance using classification report
metrics_score(y_test, y_pred_test)

              precision    recall  f1-score   support

           0       0.92      0.80      0.86       896
           1       0.63      0.83      0.72       367

    accuracy                           0.81      1263
   macro avg       0.78      0.82      0.79      1263
weighted avg       0.84      0.81      0.82      1263


from imblearn.combine import SMOTETomek

# Apply SMOTE and Random Undersampling (SMOTE + Tomek Links for cleaning)
smote_undersampler = SMOTETomek(random_state=1)
X_train_smote, y_train_smote = smote_undersampler.fit_resample(X_train_poly_df, y_train)

# Train the XGBoost model on the combined SMOTE + undersampled data
xgb_model = XGBClassifier(random_state=1)
xgb_model.fit(X_train_smote, y_train_smote)

# Predict on the test set (Note: Test set is not resampled)
y_pred_test = xgb_model.predict(X_test_poly_df)

# Evaluate the model's performance using classification report
metrics_score(y_test, y_pred_test)

              precision    recall  f1-score   support

           0       0.88      0.88      0.88       896
           1       0.71      0.72      0.71       367

    accuracy                           0.83      1263
   macro avg       0.80      0.80      0.80      1263
weighted avg       0.83      0.83      0.83      1263


from xgboost import XGBClassifier
from Scikit-learn.model_selection import RandomizedSearchCV
from Scikit-learn.metrics import classification_report

# Define hyperparameter grid for XGBoost
param_grid = {
              'n_estimators': [100, 200, 300],    # Number of boosting rounds
              'learning_rate': [0.01, 0.1, 0.2],  # Step size at each boosting step
              'max_depth': [3, 5, 7],             # Maximum depth of a tree
              'colsample_bytree': [0.7, 1.0],     # Subsample ratio of columns when constructing each tree
              'subsample': [0.8, 1.0],            # Subsample ratio of the training instance
              'gamma': [0, 0.1, 0.2]              # Minimum loss reduction required to make a further partition on a leaf node
            }

# Initialize the XGBoost classifier
xgb_model = XGBClassifier(random_state=1)

# Set up RandomizedSearchCV to optimize for recall
random_search = RandomizedSearchCV(
                                    estimator=xgb_model,
                                    param_distributions=param_grid,
                                    n_iter=10,         # Number of different parameter settings to try
                                    scoring='recall',  # Optimize for recall
                                    cv=5,              # 5-fold cross-validation
                                    random_state=1,
                                    verbose=1
                                  )

# Fit the RandomizedSearchCV on the training data
random_search.fit(X_train_poly_df, y_train)

# Get the best model after hyperparameter tuning
xgb_model = random_search.best_estimator_

# Evaluate the best model on the test set
y_pred_test = xgb_model.predict(X_test_poly_df)

# Print classification report (to check recall performance)
metrics_score(y_test, y_pred_test)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
              precision    recall  f1-score   support

           0       0.88      0.89      0.89       896
           1       0.72      0.72      0.72       367

    accuracy                           0.84      1263
   macro avg       0.80      0.80      0.80      1263
weighted avg       0.84      0.84      0.84      1263


# Extract feature importances from the pruned_tree_depth_5 model
importances = pruned_tree_depth_5.feature_importances_
feature_names = X_train.columns  # Assuming X_train is your feature set

# Create a DataFrame to show the importance of each feature
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Display the top 10 most important features
print("Top 10 Important Features:")
print(feature_importance_df.head(10))

Top 10 Important Features:
                  Feature  Importance
2   time_spent_on_website    0.267570
7     first_interaction_0    0.259239
9     profile_completed_0    0.221637
4    current_occupation_0    0.103402
13        last_activity_1    0.062585
1          website_visits    0.016099
5    current_occupation_1    0.015881
14        last_activity_2    0.014340
0                     age    0.008964
10    profile_completed_1    0.007838


import os
os.getcwd()

# Convert notebook to html
!jupyter nbconvert --to html "/content/drive/MyDrive/MIT - Data Sciences/Colab Notebooks/Week_Five_-_Classification_and_Hypothesis_Testing/Project_-_Classification_and_Hypothesis_Testing/Learner+Notebook+-+Full+Code+Version+-+Potential+Customers+Prediction.ipynb"

[NbConvertApp] Converting notebook /content/drive/MyDrive/MIT - Data Sciences/Colab Notebooks/Week_Five_-_Classification_and_Hypothesis_Testing/Project_-_Classification_and_Hypothesis_Testing/Learner+Notebook+-+Full+Code+Version+-+Potential+Customers+Prediction.ipynb to html
[NbConvertApp] Writing 5704577 bytes to /content/drive/MyDrive/MIT - Data Sciences/Colab Notebooks/Week_Five_-_Classification_and_Hypothesis_Testing/Project_-_Classification_and_Hypothesis_Testing/Learner+Notebook+-+Full+Code+Version+-+Potential+Customers+Prediction.html

	0
last_activity
0	30.33
1	21.31
2	38.45

	0
profile_completed
0	41.78
1	7.48
2	18.88

	ID	age	current_occupation	first_interaction	profile_completed	website_visits	time_spent_on_website	page_views_per_visit	last_activity	print_media_type1	print_media_type2	digital_media	educational_channels	referral	status
0	EXT001	57	Unemployed	Website	High	7	1639	1.861	Website Activity	Yes	No	Yes	No	No	1
1	EXT002	56	Professional	Mobile App	Medium	2	83	0.320	Website Activity	No	No	No	Yes	No	0
2	EXT003	52	Professional	Website	Medium	3	330	0.074	Website Activity	No	No	Yes	No	No	0
3	EXT004	53	Unemployed	Website	High	4	464	2.057	Website Activity	No	No	No	No	No	1
4	EXT005	23	Student	Website	High	4	600	16.914	Email Activity	No	No	No	No	No	0

	ID	age	current_occupation	first_interaction	profile_completed	website_visits	time_spent_on_website	page_views_per_visit	last_activity	print_media_type1	print_media_type2	digital_media	educational_channels	referral	status
4607	EXT4608	35	Unemployed	Mobile App	Medium	15	360	2.170	Phone Activity	No	No	No	Yes	No	0
4608	EXT4609	55	Professional	Mobile App	Medium	8	2327	5.393	Email Activity	No	No	No	No	No	0
4609	EXT4610	58	Professional	Website	High	2	212	2.692	Email Activity	No	No	No	No	No	1
4610	EXT4611	57	Professional	Mobile App	Medium	1	154	3.879	Website Activity	Yes	No	No	No	No	0
4611	EXT4612	55	Professional	Website	Medium	4	2290	2.075	Phone Activity	No	No	No	No	No	0

	count	unique	top	freq	mean	std	min	25%	50%	75%	max
ID	4612	4612	EXT001	1	NaN	NaN	NaN	NaN	NaN	NaN	NaN
age	4612.0	NaN	NaN	NaN	46.201214	13.161454	18.0	36.0	51.0	57.0	63.0
current_occupation	4612.0	3.0	0.0	2616.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN
first_interaction	4612.0	2.0	1.0	2542.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN
profile_completed	4612.0	3.0	0.0	2264.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN
website_visits	4612.0	NaN	NaN	NaN	3.566782	2.829134	0.0	2.0	3.0	5.0	30.0
time_spent_on_website	4612.0	NaN	NaN	NaN	724.011275	743.828683	0.0	148.75	376.0	1336.75	2537.0
page_views_per_visit	4612.0	NaN	NaN	NaN	3.026126	1.968125	0.0	2.07775	2.792	3.75625	18.434
last_activity	4612.0	3.0	0.0	2278.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN
print_media_type1	4612.0	2.0	0.0	4115.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN
print_media_type2	4612.0	2.0	0.0	4379.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN
digital_media	4612.0	2.0	0.0	4085.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN
educational_channels	4612.0	2.0	0.0	3907.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN
referral	4612.0	2.0	0.0	4519.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN
status	4612.0	2.0	0.0	3235.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN

	Feature	Unique_Count	Unique_Values
0	ID	4612	NaN
1	age	46	NaN
2	current_occupation	3	0, 1, 2
3	first_interaction	2	0, 1
4	profile_completed	3	0, 1, 2
5	website_visits	27	NaN
6	time_spent_on_website	1623	NaN
7	page_views_per_visit	2414	NaN
8	last_activity	3	0, 1, 2
9	print_media_type1	2	0, 1
10	print_media_type2	2	0, 1
11	digital_media	2	0, 1
12	educational_channels	2	0, 1
13	referral	2	0, 1
14	status	2	0, 1

	0
print_media_type1	31.99
print_media_type2	32.19
digital_media	31.88
educational_channels	27.94
referral	67.74

Variable	VIF
age	3.94
website_visits	2.43
time_spent_on_website	1.91
page_views_per_visit	2.96

	age	website_visits	time_spent_on_website	page_views_per_visit	status	current_occupation_0	current_occupation_2	first_interaction_0	first_interaction_1	...	print_media_type1_0	print_media_type1_1	print_media_type2_0	digital_media_0	educational_channels_0	referral_0
4606	58	7	210	3.598	0	0	1	1	0	...	1	0	1	1	1	1
4608	55	8	2327	5.393	0	1	0	1	0	...	1	0	1	1	1	1
4609	58	2	212	2.692	1	1	0	0	1	...	1	0	1	1	1	1
4610	57	1	154	3.879	0	1	0	1	0	...	0	1	1	1	1	1
4611	55	4	2290	2.075	0	1	0	0	1	...	1	0	1	1	1	1

	count	unique	top	freq	mean	std	min	25%	50%	75%	max
age	4208.0	NaN	NaN	NaN	46.360741	13.079381	18.0	36.00000	51.0000	57.00000	63.000
website_visits	4208.0	NaN	NaN	NaN	3.231464	2.157708	0.0	2.00000	3.0000	5.00000	9.000
time_spent_on_website	4208.0	NaN	NaN	NaN	718.396150	742.652073	0.0	142.00000	375.0000	1310.75000	2537.000
page_views_per_visit	4208.0	NaN	NaN	NaN	2.716066	1.490995	0.0	2.06075	2.2855	3.64325	6.266
status	4208.0	2.0	0.0	2939.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN
current_occupation_0	4208.0	NaN	NaN	NaN	0.568679	0.495320	0.0	0.00000	1.0000	1.00000	1.000
current_occupation_1	4208.0	NaN	NaN	NaN	0.115970	0.320226	0.0	0.00000	0.0000	0.00000	1.000
current_occupation_2	4208.0	NaN	NaN	NaN	0.315352	0.464711	0.0	0.00000	0.0000	1.00000	1.000
first_interaction_0	4208.0	NaN	NaN	NaN	0.446530	0.497192	0.0	0.00000	0.0000	1.00000	1.000
first_interaction_1	4208.0	NaN	NaN	NaN	0.553470	0.497192	0.0	0.00000	1.0000	1.00000	1.000
profile_completed_0	4208.0	NaN	NaN	NaN	0.494297	0.500027	0.0	0.00000	0.0000	1.00000	1.000
profile_completed_1	4208.0	NaN	NaN	NaN	0.023051	0.150084	0.0	0.00000	0.0000	0.00000	1.000
profile_completed_2	4208.0	NaN	NaN	NaN	0.482652	0.499758	0.0	0.00000	0.0000	1.00000	1.000
last_activity_0	4208.0	NaN	NaN	NaN	0.496673	0.500048	0.0	0.00000	0.0000	1.00000	1.000
last_activity_1	4208.0	NaN	NaN	NaN	0.263783	0.440736	0.0	0.00000	0.0000	1.00000	1.000
last_activity_2	4208.0	NaN	NaN	NaN	0.239544	0.426856	0.0	0.00000	0.0000	0.00000	1.000
print_media_type1_0	4208.0	NaN	NaN	NaN	0.891635	0.310878	0.0	1.00000	1.0000	1.00000	1.000
print_media_type1_1	4208.0	NaN	NaN	NaN	0.108365	0.310878	0.0	0.00000	0.0000	0.00000	1.000
print_media_type2_0	4208.0	NaN	NaN	NaN	0.948669	0.220698	0.0	1.00000	1.0000	1.00000	1.000
print_media_type2_1	4208.0	NaN	NaN	NaN	0.051331	0.220698	0.0	0.00000	0.0000	0.00000	1.000
digital_media_0	4208.0	NaN	NaN	NaN	0.886882	0.316774	0.0	1.00000	1.0000	1.00000	1.000
digital_media_1	4208.0	NaN	NaN	NaN	0.113118	0.316774	0.0	0.00000	0.0000	0.00000	1.000
educational_channels_0	4208.0	NaN	NaN	NaN	0.848859	0.358229	0.0	1.00000	1.0000	1.00000	1.000
educational_channels_1	4208.0	NaN	NaN	NaN	0.151141	0.358229	0.0	0.00000	0.0000	0.00000	1.000
referral_0	4208.0	NaN	NaN	NaN	0.980513	0.138244	0.0	1.00000	1.0000	1.00000	1.000
referral_1	4208.0	NaN	NaN	NaN	0.019487	0.138244	0.0	0.00000	0.0000	0.00000	1.000

Variable	VIF
Age	3.94
Website Visits	2.43
Time Spent on Website	1.91
Page Views per Visit	2.96

	Pruning Method	Accuracy	Precision Status 0	Precision Status 1	Recall Status 0	Recall Status 1	F1-Score Status 0	F1-Score Status 1
0	Max Depth (Optimal)	0.84	0.86	0.77	0.92	0.63	0.89	0.69
1	Max Depth (5)	0.86	0.89	0.79	0.92	0.71	0.90	0.75
2	Max Depth (7)	0.84	0.87	0.74	0.90	0.68	0.89	0.71
3	Min Samples Leaf (5)	0.80	0.86	0.66	0.86	0.67	0.86	0.66
4	Min Samples Split (48)	0.83	0.87	0.73	0.90	0.67	0.88	0.70

	proportion
status
0	69.371817
1	30.628183

	Method	Accuracy	Precision Class 0	Precision Class 1	Recall Class 0	Recall Class 1	F1-Score Class 0	F1-Score Class 1
0	Class Weighting	0.8464	0.8807	0.7537	0.9063	0.7003	0.8933	0.7260
1	SMOTE	0.8440	0.8888	0.7335	0.8917	0.7275	0.8903	0.7305
2	Undersampling	0.8321	0.9233	0.6703	0.8326	0.8311	0.8756	0.7421
3	Hybrid (SMOTE + Undersampling)	0.7846	0.9239	0.5901	0.7589	0.8474	0.8333	0.6957

Metric	Value
Accuracy	0.86
Precision (Class 1)	0.79
Recall (Class 1)	0.71
F1-Score (Class 1)	0.75

Metric	Value
Accuracy	0.8440
Precision (Class 1)	0.7335
Recall (Class 1)	0.7275
F1-Score (Class 1)	0.7305

	Model	Dataset	Accuracy	Precision Class 0	Precision Class 1	Recall Class 0	Recall Class 1	F1-Score Class 0	F1-Score Class 1
0	Logistic Regression	Train	0.82	0.86	0.74	0.90	0.66	0.88	0.70
1	Logistic Regression	Test	0.82	0.86	0.72	0.89	0.66	0.88	0.68
2	PCA	Train	1.00	1.00	1.00	1.00	1.00	1.00	1.00
3	PCA	Test	0.77	0.85	0.60	0.83	0.64	0.84	0.62
4	Decision Tree	Train	1.00	1.00	1.00	1.00	1.00	1.00	1.00
5	Decision Tree	Test	0.80	0.86	0.65	0.85	0.67	0.86	0.66
6	Max Depth (Pruning)	Train	0.89	0.90	0.87	0.95	0.75	0.92	0.80
7	Max Depth (Pruning)	Test	0.84	0.86	0.77	0.92	0.63	0.89	0.69
8	Limited Depth (Depth=5)	Train	0.88	0.89	0.84	0.94	0.75	0.92	0.79
9	Limited Depth (Depth=5)	Test	0.86	0.89	0.79	0.92	0.71	0.90	0.75
10	Limited Depth (Depth=7)	Train	0.90	0.92	0.87	0.95	0.81	0.93	0.84
11	Limited Depth (Depth=7)	Test	0.84	0.87	0.74	0.90	0.68	0.89	0.71
12	Min Samples per Leaf	Train	0.93	0.94	0.89	0.95	0.86	0.95	0.88
13	Min Samples per Leaf	Test	0.80	0.86	0.66	0.86	0.67	0.86	0.66
14	Min Samples to Split	Train	0.89	0.91	0.85	0.94	0.79	0.92	0.82
15	Min Samples to Split	Test	0.83	0.87	0.73	0.90	0.67	0.88	0.70
16	Random Forest	Train	1.00	1.00	1.00	1.00	1.00	1.00	1.00
17	Random Forest	Test	0.84	0.88	0.73	0.90	0.70	0.89	0.72
18	Pruned Random Forest	Train	0.90	0.91	0.86	0.95	0.79	0.93	0.82
19	Pruned Random Forest	Test	0.85	0.88	0.77	0.92	0.70	0.90	0.73
20	Random Forest (Tuned)	Train	0.90	0.91	0.86	0.95	0.79	0.93	0.82
21	Random Forest (Tuned)	Test	0.85	0.88	0.77	0.92	0.70	0.90	0.73

	age	website_visits	time_spent_on_website	page_views_per_visit	current_occupation_0	current_occupation_1	current_occupation_2	first_interaction_0	first_interaction_1	profile_completed_0	...	digital_media_1 educational_channels_0	digital_media_1 referral_0	educational_channels_0 referral_0
0	48.0	1.0	266.0	5.812	1.0	0.0	0.0	0.0	1.0	1.0	...	0.0	0.0	1.0
1	55.0	3.0	984.0	2.970	0.0	0.0	1.0	1.0	0.0	0.0	...	1.0	1.0	1.0
2	33.0	2.0	2478.0	2.189	0.0	0.0	1.0	0.0	1.0	0.0	...	0.0	0.0	1.0
3	59.0	0.0	0.0	0.000	1.0	0.0	0.0	1.0	0.0	0.0	...	0.0	0.0	1.0
4	23.0	1.0	365.0	1.933	0.0	1.0	0.0	0.0	1.0	1.0	...	1.0	1.0	1.0

ExtraaLearn Project¶

Context¶

Objective¶

Data Description¶

Problem Definition¶

Importing Libraries, Connect to Google Drive and Load Data¶

Function Definitions¶

Data Overview¶

First Five Rows¶

Last Five Rows¶

Shape¶

Dataset Information¶

Data Cleansing¶

Exploratory Data Analysis (EDA)¶

Problem Statement Reminder¶

Summary Statistics¶

Univariate and Outlier Analysis¶

Unique Values¶

Missing Values Check¶

Distribution of Continuous Features¶

Feature: Age¶

Feature: Website Visits¶

Feature: Page Views per Visit¶

Feature: Time Spent on Website¶

Feature: Status¶

Distribution of Categorical Features¶

Feature: Current Occupation¶

Feature First Interaction¶

Feature: Profile Completed¶

Feature: Last Activity¶

Feature: Print Media Type 1¶

Feature: Print Media Type 2¶

Feature: Digital Media¶

Feature: Educational Channels¶

Feature: Referral¶

Question 1:¶

Question 2:¶

Question 3:¶

Question 4:¶

Question 5:¶

Bivariate Analysis¶

Correlation Heatmap¶

Feature: Status vs Age¶

Feature: Status vs Current Occupation¶

Feature: Status vs. First Interaction¶

Feature: Status vs. Profile Completed¶

Feature: Status vs. Website Visits¶

Feature: Status vs. Time Spent on Website¶

Feature: Status vs. Page Views per Visit¶

Feature: Status vs. Last Activity¶

Feature: Status vs. Print Media Type 1¶

Feature: Status vs. Print Media Type 2¶

Feature: Status vs. Digital Media¶

Feature: Status vs. Educational Channels¶

Feature: Status vs. Referral¶

Feature: Age vs Website Visits¶

Feature: Age vs Time Spent on Website¶

Feature: Age vs Page Views per Visit¶

Feature: Website Visits vs Time Spent on Website¶

Feature: Website Visits vs Page Views per Visit¶

Multicollinearity Checks¶

Data Preprocessing¶

Missing Values Check¶

Duplicated Data Check¶

Feature Engineering¶

Drop Index¶

One Hot Encoding¶

Outlier Detection and Treatment¶

EDA¶

First Five Rows¶

Last Five Rows¶

Shape¶

Observations:¶

Univariate Analysis¶

Statistical Information¶

Observations:¶

Univariate Analysis of Continuous Features¶

Observations:¶

Univariate Analysis of Categorical Features¶

Observations:¶