Project Foundations for Data Science: FoodHub Data Analysis¶

by Robert Swetland

Marks: 40

Context¶

The number of restaurants in New York is increasing day by day. Many students and busy professionals rely on them because of their hectic lifestyles, and online food delivery is a convenient option that brings them good food from their favorite restaurants. FoodHub, a food aggregator company, offers access to multiple restaurants through a single smartphone app.

The app allows the restaurants to receive a direct online order from a customer. The app assigns a delivery person from the company to pick up the order after it is confirmed by the restaurant. The delivery person then uses the map to reach the restaurant and waits for the food package. Once the food package is handed over to the delivery person, he/she confirms the pick-up in the app and travels to the customer's location to deliver the food. The delivery person confirms the drop-off in the app after delivering the food package to the customer. The customer can rate the order in the app. The food aggregator earns money by collecting a fixed margin of the delivery order from the restaurants.
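The revenue model described above can be sketched in code. The actual fixed margin FoodHub collects is not stated anywhere in the problem, so the 25% rate below is purely a hypothetical assumption for illustration:

```python
import pandas as pd

# Hypothetical commission rate; the actual fixed margin FoodHub charges is not given
COMMISSION_RATE = 0.25

# A few sample order costs (dollars)
orders = pd.Series([30.75, 12.08, 12.23], name="order_cost")

# FoodHub's revenue per order = order cost * commission rate
revenue = (orders * COMMISSION_RATE).round(2)
print(revenue.sum())  # total commission earned on these three orders
```
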

Objective¶

The food aggregator company has stored the data of the different orders made by the registered customers in their online portal. They want to analyze the data to get a fair idea about the demand of different restaurants which will help them in enhancing their customer experience. Suppose you are hired as a Data Scientist in this company and the Data Science team has shared some of the key questions that need to be answered. Perform the data analysis to find answers to these questions that will help the company to improve the business.

Data Description¶

The data contains attributes related to each food order placed through FoodHub. The detailed data dictionary is given below.

Data Dictionary¶

  • order_id: Unique ID of the order
  • customer_id: ID of the customer who ordered the food
  • restaurant_name: Name of the restaurant
  • cuisine_type: Cuisine ordered by the customer
  • cost_of_the_order: Cost of the order
  • day_of_the_week: Indicates whether the order is placed on a weekday or weekend (The weekday is from Monday to Friday and the weekend is Saturday and Sunday)
  • rating: Rating given by the customer out of 5
  • food_preparation_time: Time (in minutes) taken by the restaurant to prepare the food. This is calculated by taking the difference between the timestamps of the restaurant's order confirmation and the delivery person's pick-up confirmation.
  • delivery_time: Time (in minutes) taken by the delivery person to deliver the food package. This is calculated by taking the difference between the timestamps of the delivery person's pick-up confirmation and drop-off confirmation.
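Both time columns are differences between in-app confirmation timestamps. A minimal sketch of that derivation, using hypothetical timestamps (the dataset itself stores only the resulting minutes):

```python
import pandas as pd

# Hypothetical timestamps for a single order; the dataset stores only derived minutes
confirmed = pd.Timestamp("2024-01-05 18:00")    # restaurant confirms the order
picked_up = pd.Timestamp("2024-01-05 18:25")    # delivery person confirms pick-up
dropped_off = pd.Timestamp("2024-01-05 18:45")  # delivery person confirms drop-off

# food_preparation_time: confirmation -> pick-up
food_preparation_time = int((picked_up - confirmed).total_seconds() // 60)

# delivery_time: pick-up -> drop-off
delivery_time = int((dropped_off - picked_up).total_seconds() // 60)

print(food_preparation_time, delivery_time)  # 25 20
```
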

Let us start by importing the required libraries¶

In [68]:
# Import libraries for data manipulation
import numpy as np
import pandas as pd

# Import libraries for data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Import libraries for statistical analysis
from scipy import stats
from scipy.stats import pearsonr
from scipy.stats import linregress

# Perform math
import math

import warnings
warnings.filterwarnings('ignore')

# Suppress future warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Connect collab
from google.colab import drive
drive.mount('/content/drive')

# Load data from csv file
data = pd.read_csv('/content/drive/MyDrive/MIT - Data Sciences/Colab Notebooks/Week Two - Statistics for Data Science/Project Assessment - FoodHub/foodhub_order.csv')
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

Understanding the structure of the data¶

In [2]:
# returns the first 5 rows
data.head()
Out[2]:
order_id customer_id restaurant_name cuisine_type cost_of_the_order day_of_the_week rating food_preparation_time delivery_time
0 1477147 337525 Hangawi Korean 30.75 Weekend Not given 25 20
1 1477685 358141 Blue Ribbon Sushi Izakaya Japanese 12.08 Weekend Not given 25 23
2 1477070 66393 Cafe Habana Mexican 12.23 Weekday 5 23 28
3 1477334 106968 Blue Ribbon Fried Chicken American 29.20 Weekend 3 25 15
4 1478249 76942 Dirty Bird to Go American 11.59 Weekday 4 25 24
In [3]:
# returns the last 5 rows
data.tail()
Out[3]:
order_id customer_id restaurant_name cuisine_type cost_of_the_order day_of_the_week rating food_preparation_time delivery_time
1893 1476701 292602 Chipotle Mexican Grill $1.99 Delivery Mexican 22.31 Weekend 5 31 17
1894 1477421 397537 The Smile American 12.18 Weekend 5 31 19
1895 1477819 35309 Blue Ribbon Sushi Japanese 25.22 Weekday Not given 31 24
1896 1477513 64151 Jack's Wife Freda Mediterranean 12.18 Weekday 5 23 31
1897 1478056 120353 Blue Ribbon Sushi Japanese 19.45 Weekend Not given 28 24

Observations:¶

The DataFrame has the 9 columns listed in the Data Dictionary. Each row corresponds to a single order placed by a customer.

Question 1: How many rows and columns are present in the data?¶

In [4]:
# Determine the number of rows and columns by calling data.shape
data.shape
Out[4]:
(1898, 9)

Observations:¶

The DataFrame contains:

  • rows: 1898
  • columns: 9

Question 2: What are the datatypes of the different columns in the dataset? (The info() function can be used)¶

In [51]:
# Use info() to print a concise summary of the DataFrame
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1898 entries, 0 to 1897
Data columns (total 13 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   order_id               1898 non-null   int64  
 1   customer_id            1898 non-null   int64  
 2   restaurant_name        1898 non-null   object 
 3   cuisine_type           1898 non-null   object 
 4   cost_of_the_order      1898 non-null   float64
 5   day_of_the_week        1898 non-null   object 
 6   rating                 1162 non-null   float64
 7   food_preparation_time  1898 non-null   int64  
 8   delivery_time          1898 non-null   int64  
 9   cuisine_type_encoded   1898 non-null   int64  
 10  total_time             1898 non-null   int64  
 11  total_wait_time        1898 non-null   int64  
 12  commission             1898 non-null   float64
dtypes: float64(3), int64(7), object(3)
memory usage: 192.9+ KB

Observations:¶

  • Note: this printed summary reflects a later re-run of the cell, after feature-engineering columns (cuisine_type_encoded, total_time, total_wait_time, commission) had been added and rating had been converted to a numeric type (hence 1162 non-null). The raw dataset loaded from the CSV is described below.

  • Number of Entries: The DataFrame contains 1898 entries, ranging from index 0 to 1897.

  • Number of Columns: The raw data has 9 columns in total.

  • Column Names and Types:

    • order_id: int64
    • customer_id: int64
    • restaurant_name: object (string)
    • cuisine_type: object (string)
    • cost_of_the_order: float64
    • day_of_the_week: object (string)
    • rating: object (string; unrated orders are stored as the text 'Not given')
    • food_preparation_time: int64
    • delivery_time: int64
  • Non-Null Counts: Each raw column has 1898 non-null entries, so pandas reports no missing values (unrated orders are masked by the 'Not given' string rather than stored as NaN).

  • Memory Usage: The raw DataFrame uses approximately 133.6 KB of memory.

Data Types Distribution (raw data):

  • There are 4 columns with integer data types (int64).
  • There is 1 column with a float data type (float64).
  • There are 4 columns with object data types (object), which typically represent string data.

These observations indicate a well-formed DataFrame with complete data and a mix of numerical and categorical data types.

Question 3: Are there any missing values in the data? If yes, treat them using an appropriate method¶

Observations:¶

Each column has 1898 non-null entries, so pandas reports no missing values. Note, however, that unrated orders are recorded as the string 'Not given' in the rating column, which masks missingness rather than eliminating it.
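Because the rating column hides its missingness behind a string sentinel, one appropriate treatment (sketched here on a small synthetic frame, not the full dataset) is to convert 'Not given' to NaN and make the column numeric, so pandas can count and exclude the missing values:

```python
import numpy as np
import pandas as pd

# Small synthetic frame mirroring the rating column's 'Not given' sentinel
df = pd.DataFrame({"rating": ["Not given", "5", "3", "Not given", "4"]})

# Replace the sentinel with NaN, then convert the column to a numeric dtype
df["rating"] = pd.to_numeric(df["rating"].replace("Not given", np.nan))

print(df["rating"].isna().sum())  # 2 missing values are now visible
print(df["rating"].mean())        # mean over rated orders only -> 4.0
```
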

Question 4: Check the statistical summary of the data. What is the minimum, average, and maximum time it takes for food to be prepared once an order is placed?¶

In [6]:
# Check the statistical summary of the DataFrame
summary = data.describe()

# Minimum food preparation time
food_prep_min = data['food_preparation_time'].min()

# Average (mean) food preparation time
food_prep_avg = round(data['food_preparation_time'].mean(), 2)

# Maximum food preparation time
food_prep_max = data['food_preparation_time'].max()

# Format the summary using pandas Styler
summary_styled = summary.style.set_table_styles(
    [{'selector': 'thead th', 'props': [('background-color', '#f7f7f9'), ('color', '#333'), ('font-weight', 'bold')]}]
).set_properties(**{'text-align': 'center'})

# Display the styled summary
display(summary_styled)

# Create a DataFrame for the food preparation times
prep_times_df = pd.DataFrame({
    'Metric': ['Minimum', 'Average', 'Maximum'],
    'Food Preparation Time (minutes)': [food_prep_min, food_prep_avg, food_prep_max]
})

print("\n")

# Format the preparation times DataFrame using pandas Styler
prep_times_styled = prep_times_df.style.set_table_styles(
    [{'selector': 'thead th', 'props': [('background-color', '#f7f7f9'), ('color', '#333'), ('font-weight', 'bold')]}]
).set_properties(**{'text-align': 'center'})

# Display the styled preparation times DataFrame
display(prep_times_styled)
  order_id customer_id cost_of_the_order food_preparation_time delivery_time
count 1898.000000 1898.000000 1898.000000 1898.000000 1898.000000
mean 1477495.500000 171168.478398 16.498851 27.371970 24.161749
std 548.049724 113698.139743 7.483812 4.632481 4.972637
min 1476547.000000 1311.000000 4.470000 20.000000 15.000000
25% 1477021.250000 77787.750000 12.080000 23.000000 20.000000
50% 1477495.500000 128600.000000 14.140000 27.000000 25.000000
75% 1477969.750000 270525.000000 22.297500 31.000000 28.000000
max 1478444.000000 405334.000000 35.410000 35.000000 33.000000

  Metric Food Preparation Time (minutes)
0 Minimum 20.000000
1 Average 27.370000
2 Maximum 35.000000

Observations:¶

Based on the statistical summary of the food_preparation_time column in the DataFrame, we can make the following observations:

  • The minimum time it takes for food to be prepared once an order is placed is 20 minutes.
  • The average (mean) time for food preparation is 27.37 minutes.
  • The maximum time it takes for food to be prepared is 35 minutes.
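The same three statistics can also be obtained in a single call with `Series.agg`; a minimal sketch on a small synthetic series (not the real column):

```python
import pandas as pd

# Synthetic preparation times (minutes) standing in for food_preparation_time
prep = pd.Series([20, 25, 27, 31, 35], name="food_preparation_time")

# min, mean, and max in one aggregation call
stats = prep.agg(['min', 'mean', 'max'])
print(stats)
```
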

Question 5: How many orders are not rated?¶

In [7]:
# Count the number of non-rated orders

# Find the unique values in the rating column
unique = data['rating'].unique()
print(unique)

# Non-rated orders are stored as the string 'Not given'
non_rated_orders = (data['rating'] == 'Not given').sum()

print(f"Number of orders not rated: {non_rated_orders}")
['Not given' '5' '3' '4']
Number of orders not rated: 736

Observations:¶

Upon analyzing the rating column in the DataFrame, it was found that a total of 736 orders (about 38.8% of all orders) are recorded as 'Not given'.

Interestingly, there are no ratings lower than 3.

Exploratory Data Analysis (EDA)¶

Univariate Analysis¶

Question 6: Explore all the variables and provide observations on their distributions. (Generally, histograms, boxplots, countplots, etc. are used for univariate exploration)¶

In [8]:
# Examine the characteristics of each variable

# Analyze the distribution of customer_id

CID_Distribution = data['customer_id'].value_counts()
CID_max = CID_Distribution.max()    # Most frequent customer
CID_min = CID_Distribution.min()    # Least frequent customer(s)
CID_mean = CID_Distribution.mean()  # Average # of visits by customer

# count the number of customers that visited only once and calculate %
CID_once = (CID_Distribution == 1).sum()
CID_once_percent = round((CID_once / CID_Distribution.shape[0]) * 100, 2)

# Plotting the distribution
plt.figure(figsize=(18, 6))

# Histogram of customer visit frequency
plt.subplot(1, 2, 1)
plt.hist(CID_Distribution, bins=range(1, CID_Distribution.max() + 1), edgecolor='black', color='orange')
plt.axvline(CID_mean, color='red', linestyle='dashed', linewidth=2, label=f'avg # of visits: {CID_mean:.2f}')
plt.title('Customer Visit Frequency')
plt.xlabel('Number of Visits')
plt.ylabel('Number of Customers')
plt.legend()

# Boxplot of customer visit frequency
plt.subplot(1, 2, 2)
plt.boxplot(CID_Distribution, vert=False)
plt.title('Customer Visit Frequency')
plt.xlabel('Number of Visits')

plt.tight_layout()
plt.show()
In [9]:
# Analyze the distribution of restaurant_name

unique_restaurants = data['restaurant_name'].nunique()

# Analyze the distribution of orders per restaurant
restaurant_order_distribution = data['restaurant_name'].value_counts()
most_orders = restaurant_order_distribution.max()
least_orders = restaurant_order_distribution.min()
average_orders = restaurant_order_distribution.mean()

# Names of the most and least popular restaurants
most_popular_restaurant = restaurant_order_distribution.idxmax()
least_popular_restaurant = restaurant_order_distribution.idxmin()

# Count the number of restaurants with only 1 order
restaurants_with_one_order = (restaurant_order_distribution == 1).sum()

# Get the top 50 restaurants by number of orders (more than this makes the x-axis labels illegible)
top_50_restaurants = restaurant_order_distribution.head(50)

# Visualize the distribution of orders for the top 50 restaurants
plt.figure(figsize=(18, 6))
top_50_restaurants.plot(kind='bar')
plt.title('Top 50 Restaurants by Number of Orders')
plt.xlabel('Restaurant Name')
plt.ylabel('Number of Orders')

# Rotate x-axis labels for better readability
plt.xticks(rotation=90, fontsize=8)
plt.show()

# Set display options for wider columns
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 300)
pd.set_option('display.max_colwidth', None)
In [10]:
# Analyze the distribution of cuisine types
cuisine_type_distribution = data['cuisine_type'].value_counts()

# Calculate the most and least popular cuisine types
most_popular_cuisine = cuisine_type_distribution.idxmax()
least_popular_cuisine = cuisine_type_distribution.idxmin()

# Visualize the distribution of cuisine types
plt.figure(figsize=(18, 6))
cuisine_type_distribution.plot(kind='bar')
plt.title('Distribution of Cuisine Types')
plt.xlabel('Cuisine Type')
plt.ylabel('Number of Orders')
plt.xticks(rotation=90, fontsize=8)
plt.show()
In [11]:
# Analyze the distribution of order costs
order_cost_distribution = data['cost_of_the_order']

# Calculate the statistics
max_cost = order_cost_distribution.max()
min_cost = order_cost_distribution.min()
mean_cost = order_cost_distribution.mean()

# Create subplots
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(18, 6))

# Histogram of order costs with a vertical line for the mean cost
axes[0].hist(order_cost_distribution, bins=30, edgecolor='black')
axes[0].axvline(mean_cost, color='red', linestyle='dashed', linewidth=2, label=f'avg cost of the order: ${mean_cost:.2f}')
axes[0].set_title('Distribution of Order Costs')
axes[0].set_xlabel('Order Cost')
axes[0].set_ylabel('Number of Orders')
axes[0].legend()

# Box plot for the order costs
axes[1].boxplot(order_cost_distribution, vert=False)
axes[1].set_title('Box Plot of Order Costs')
axes[1].set_xlabel('Order Cost')

# Adjust layout
plt.tight_layout()
plt.show()
In [12]:
# Analyze the distribution of orders by the day of the week
day_of_week_distribution = data['day_of_the_week'].value_counts()

# Visualize the distribution of orders by the day of the week
plt.figure(figsize=(18, 6))
day_of_week_distribution.plot(kind='bar')
plt.title('Distribution of Orders by Day of the Week')
plt.xlabel('Day of the Week')
plt.ylabel('Number of Orders')
plt.xticks(rotation=0, fontsize=8)
plt.show()
In [13]:
# Analyze the distribution of ratings
rating_distribution = data['rating'].value_counts()

# Visualize the distribution of ratings
plt.figure(figsize=(18, 6))
rating_distribution.plot(kind='bar')
plt.title('Distribution of Customer Ratings')
plt.xlabel('Rating')
plt.ylabel('Number of Orders')
plt.xticks(rotation=0, fontsize=8)
plt.show()
In [14]:
# Analyze the distribution of food preparation times
prep_time_distribution = data['food_preparation_time']

# Calculate the statistics
max_prep_time = prep_time_distribution.max()
min_prep_time = prep_time_distribution.min()
mean_prep_time = prep_time_distribution.mean()

# Create subplots
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(18, 6))

# Histogram of food preparation times with a vertical line for the mean time
axes[0].hist(prep_time_distribution, bins=30, edgecolor='black')
axes[0].axvline(mean_prep_time, color='red', linestyle='dashed', linewidth=2, label=f'avg prep time: {mean_prep_time:.2f} mins')
axes[0].set_title('Distribution of Food Preparation Times')
axes[0].set_xlabel('Preparation Time (minutes)')
axes[0].set_ylabel('Number of Orders')
axes[0].legend()

# Box plot for the food preparation times
axes[1].boxplot(prep_time_distribution, vert=False)
axes[1].set_title('Box Plot of Food Preparation Times')
axes[1].set_xlabel('Preparation Time (minutes)')

# Adjust layout
plt.tight_layout()
plt.show()
In [15]:
# Analyze the distribution of delivery times
delivery_time_distribution = data['delivery_time']

# Calculate the statistics
max_delivery_time = delivery_time_distribution.max()
min_delivery_time = delivery_time_distribution.min()
mean_delivery_time = delivery_time_distribution.mean()

# Create subplots
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(18, 6))

# Histogram of delivery times with a vertical line for the mean time
axes[0].hist(delivery_time_distribution, bins=30, edgecolor='black')
axes[0].axvline(mean_delivery_time, color='red', linestyle='dashed', linewidth=2, label=f'avg delivery time: {mean_delivery_time:.2f} mins')
axes[0].set_title('Distribution of Delivery Times')
axes[0].set_xlabel('Delivery Time (minutes)')
axes[0].set_ylabel('Number of Orders')
axes[0].legend()

# Box plot for the delivery times
axes[1].boxplot(delivery_time_distribution, vert=False)
axes[1].set_title('Box Plot of Delivery Times')
axes[1].set_xlabel('Delivery Time (minutes)')

# Adjust layout
plt.tight_layout()
plt.show()

Question 7: Which are the top 5 restaurants in terms of the number of orders received?¶

In [16]:
# Get the top 5 restaurants by number of orders
top_5_restaurants = restaurant_order_distribution.head(5)

# Convert the series to a DataFrame for better formatting
top_5_restaurants_df = top_5_restaurants.reset_index()
top_5_restaurants_df.columns = ['Restaurant Name', 'Number of Orders']

# Format the table using pandas Styler
top_5_restaurants_styled = top_5_restaurants_df.style.set_caption("Top 5 Restaurants by Number of Orders<br><br>").set_table_styles(
    [{'selector': 'caption', 'props': [('caption-side', 'top'), ('font-size', '16px'), ('font-weight', 'bold'), ('text-align', 'center')]},
     {'selector': 'thead th', 'props': [('background-color', '#f7f7f9'), ('color', '#333'), ('font-weight', 'bold')]}]
).set_properties(subset=['Restaurant Name'], **{'text-align': 'left'}).set_properties(subset=['Number of Orders'], **{'text-align': 'center'}).hide(axis="index")

# Display the styled table
top_5_restaurants_styled
Out[16]:
Top 5 Restaurants by Number of Orders

Restaurant Name Number of Orders
Shake Shack 219
The Meatball Shop 132
Blue Ribbon Sushi 119
Blue Ribbon Fried Chicken 96
Parm 68

Observations:¶

Dominance of Shake Shack:

  • Shake Shack leads significantly with 219 orders, indicating its high popularity among customers.

Significant Order Volumes:

  • The Meatball Shop and Blue Ribbon Sushi are also quite popular, with 132 and 119 orders respectively, but they trail behind Shake Shack by a notable margin.

Diverse Cuisine Types:

  • The top 5 restaurants represent a variety of cuisines, including American fast food (Shake Shack), Italian (The Meatball Shop, Parm), and Japanese (Blue Ribbon Sushi).

  • This variety reflects a broad range of customer preferences.

Distribution Drop-off:

  • There is a noticeable drop in the number of orders from the third to the fifth restaurant, showing a significant difference in popularity among the top restaurants.

Question 8: Which is the most popular cuisine on weekends?¶

In [17]:
# Filter the data for weekends
weekend_data = data[data['day_of_the_week'] == 'Weekend']

# Analyze the distribution of cuisine types on weekends
weekend_cuisine_distribution = weekend_data['cuisine_type'].value_counts()

# Get the most popular cuisine on weekends
most_popular_cuisine_weekend = weekend_cuisine_distribution.idxmax()
most_popular_cuisine_count = weekend_cuisine_distribution.max()

# Get the second most popular cuisine on weekends (for comparison)
second_most_popular_cuisine_weekend = weekend_cuisine_distribution.index[1]
second_most_popular_cuisine_count = weekend_cuisine_distribution.iloc[1]

# Create a DataFrame for the most popular weekend cuisine
most_popular_cuisine_df = pd.DataFrame({
    'Cuisine Type': [most_popular_cuisine_weekend],
    'Number of Orders': [most_popular_cuisine_count]
})

# Format the table using pandas Styler
most_popular_cuisine_styled = most_popular_cuisine_df.style.set_caption("Most Popular Cuisine on Weekends<br><br>").set_table_styles(
    [{'selector': 'caption', 'props': [('caption-side', 'top'), ('font-size', '16px'), ('font-weight', 'bold'), ('text-align', 'center')]},
     {'selector': 'thead th', 'props': [('background-color', '#f7f7f9'), ('color', '#333'), ('font-weight', 'bold')]}]
).set_properties(subset=['Cuisine Type'], **{'text-align': 'left'}).set_properties(subset=['Number of Orders'], **{'text-align': 'center'}).hide(axis="index")

# Display the styled table
most_popular_cuisine_styled
Out[17]:
Most Popular Cuisine on Weekends

Cuisine Type Number of Orders
American 415

Observations:¶

Most Popular Cuisine:

The most popular cuisine on weekends is American, with 415 orders. This indicates a clear preference for American cuisine, which leads the second most popular cuisine, Japanese (335 orders), by a comfortable margin.

High Volume of Orders:

The number of orders for the most popular cuisine on weekends is significantly high, showing that weekends are a busy time for American cuisine.

Customer Preferences:

The popularity of American cuisine on weekends might be due to the types of meals associated with this cuisine, such as burgers, fries, and other fast food items, which are often considered comfort or leisure food.

Question 9: What percentage of the orders cost more than 20 dollars?¶

In [18]:
# Calculate the percentage of orders that cost more than 20 dollars
total_orders = data.shape[0]
orders_above_20 = data[data['cost_of_the_order'] > 20].shape[0]
percentage_above_20 = (orders_above_20 / total_orders) * 100

# Print the result
print(f"Percentage of orders that cost more than 20 dollars: {percentage_above_20:.2f}%")
Percentage of orders that cost more than 20 dollars: 29.24%

Observations:¶

Roughly 29% of orders (555 of 1898) cost more than $20, so higher-value orders make up a sizable minority of the business. This is useful context for pricing and promotion decisions, such as setting minimum-order or free-delivery thresholds.

Question 10: What is the mean order delivery time?¶

In [19]:
# Previously calculated mean_delivery_time
print("The mean order delivery time is %.2f minutes." % mean_delivery_time)
The mean order delivery time is 24.16 minutes.

Observations:¶

The analysis reveals that the mean delivery time for orders is approximately 24.16 minutes.

This information is useful for understanding operational efficiency, setting customer expectations, and identifying areas for potential improvement in the delivery process.

Question 11: The company has decided to give 20% discount vouchers to the top 3 most frequent customers. Find the IDs of these customers and the number of orders they placed¶

In [20]:
# Top three customers using the previously calculated Customer ID Distribution
top_3_customers = CID_Distribution.head(3).reset_index()
top_3_customers.columns = ['Customer ID', 'Number of Orders']

# Format the table using pandas Styler
top_3_customers_styled = top_3_customers.style.set_caption("Top 3 Customers by Number of Orders<br><br>").set_table_styles(
    [{'selector': 'caption', 'props': [('caption-side', 'top'), ('font-size', '16px'), ('font-weight', 'bold'), ('text-align', 'center')]},
     {'selector': 'thead th', 'props': [('background-color', '#f7f7f9'), ('color', '#333'), ('font-weight', 'bold')]}]
).set_properties(subset=['Customer ID'], **{'text-align': 'left'}).set_properties(subset=['Number of Orders'], **{'text-align': 'center'}).hide(axis="index")

# Display the styled table
top_3_customers_styled
Out[20]:
Top 3 Customers by Number of Orders

Customer ID Number of Orders
52832 13
47440 10
83287 9

Observations:¶

The top 3 most frequent customers are 52832 (13 orders), 47440 (10 orders), and 83287 (9 orders); these are the customers who qualify for the 20% discount vouchers. Even the most frequent customer placed only 13 orders, which is consistent with the earlier finding that most customers order only once or twice.

Multivariate Analysis¶

Question 12: Perform a multivariate analysis to explore relationships between the important variables in the dataset. (It is a good idea to explore relations between numerical variables as well as relations between numerical and categorical variables)¶

In [21]:
# Correlation matrix of numerical variables

# Select only numerical columns
numerical_data = data.select_dtypes(include=['float64', 'int64'])

# Calculate the correlation matrix
corr_matrix = numerical_data.corr()

# Plot the heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='vlag')
plt.title('Correlation Matrix of Numerical Variables')
plt.show()

Observations¶

Relationship Between Food Preparation Time and Delivery Time:

The correlation between food_preparation_time and delivery_time is close to zero, indicating that longer food preparation times are not systematically associated with longer delivery times; the two stages of fulfillment appear largely independent.

Cost of Order and Delivery Time:

The correlation between cost_of_the_order and delivery_time is likewise negligible (the Pearson score computed later in this notebook is about -0.03), so higher-cost orders do not take noticeably longer to deliver.

Cost of Order and Food Preparation Time:

The correlation between cost_of_the_order and food_preparation_time is also weak, so order cost is not a useful predictor of preparation time either.

Summary:

  • None of the numerical variables are strongly correlated with one another.
  • Delivery and preparation times appear to be driven by factors other than order cost.

Customer Behavior Analysis¶

  • Customer Order Frequency

    • Frequency of orders per customer
    • Average order value per customer
  • Repeat Customer Analysis

    • Percentage of repeat customers
    • Average rating and order cost for repeat customers vs one-time customers

Frequency of orders per customer¶

In [22]:
# Calculate the frequency of orders per customer
order_frequency = data['customer_id'].value_counts().reset_index()
order_frequency.columns = ['customer_id', 'order_count']

# Style the table using pandas Styler
styled_table = order_frequency.style.background_gradient(cmap='viridis').set_caption("Frequency of Orders per Customer")

# Display the styled table
styled_table

# Plotting the frequency of orders per customer
plt.figure(figsize=(10, 6))
plt.hist(order_frequency['order_count'], bins=30, edgecolor='black')
plt.title('Frequency of Orders per Customer')
plt.xlabel('Number of Orders')
plt.ylabel('Number of Customers')
plt.show()

# Pearson correlation between 'cost_of_the_order' and 'delivery_time'
correlation_score = data['cost_of_the_order'].corr(data['delivery_time'])
print(f"Pearson Correlation Score between 'cost_of_the_order' and 'delivery_time': {correlation_score:.3f}")
Pearson Correlation Score between 'cost_of_the_order' and 'delivery_time': -0.030

Observations¶

Frequency of Orders per Customer

  • Distribution of Orders:

    • The histogram indicates that the majority of customers place a relatively low number of orders, while a few customers place orders more frequently.
    • This suggests that a small number of highly engaged customers drive a significant portion of sales, which is a common trend in many businesses.

Pearson Correlation

  • Cost of the Order vs. Delivery Time:

    • The Pearson correlation score between cost_of_the_order and delivery_time is approximately -0.030.

    • This very weak negative correlation indicates that there is no significant linear relationship between the cost of an order and its delivery time in this dataset.

    • This implies that delivery times are relatively consistent regardless of the order cost, suggesting other factors might be influencing delivery time.
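scipy.stats.pearsonr, imported at the top of the notebook but not used here, additionally returns a p-value that indicates whether such a weak coefficient is distinguishable from noise. A minimal sketch on independent synthetic data (the variables below are stand-ins, not the real columns):

```python
import numpy as np
from scipy.stats import pearsonr

rng = np.random.default_rng(42)

# Synthetic stand-ins for order cost and delivery time, drawn independently
cost = rng.uniform(5, 35, size=500)
delivery = rng.uniform(15, 33, size=500)

# pearsonr returns both the correlation coefficient and a two-sided p-value
r, p = pearsonr(cost, delivery)
print(f"r = {r:.3f}, p = {p:.3f}")  # r near 0, large p: no evidence of linear association
```

A large p-value here means the near-zero coefficient is consistent with no linear relationship, which mirrors the -0.030 score observed on the actual data.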

Average order value per customer¶

In [23]:
# Calculate the average order value per customer
average_order_value = data.groupby('customer_id')['cost_of_the_order'].mean().reset_index()
average_order_value.columns = ['customer_id', 'average_order_value']

# Style the table using pandas Styler
styled_average_order_value = average_order_value.style.background_gradient(cmap='viridis').set_caption("Average Order Value per Customer")

# Display the styled table
styled_average_order_value

# Plotting the average order value per customer
plt.figure(figsize=(10, 6))
plt.hist(average_order_value['average_order_value'], bins=30, edgecolor='black')
plt.title('Average Order Value per Customer')
plt.xlabel('Average Order Value')
plt.ylabel('Number of Customers')
plt.show()

Observations¶

Average Order Value per Customer

  • Distribution of Average Order Values:

    • The histogram shows the distribution of average order values per customer. Most customers have a lower average order value, with fewer customers having higher average order values.

    • This trend indicates that a majority of customers tend to place orders of smaller amounts, while a smaller group of customers contributes to higher-value orders.

  • Customer Spending Patterns:

    • Identifying the customers with higher average order values can help in tailoring marketing strategies to retain and engage these high-value customers.

    • Conversely, understanding why most customers place smaller orders might reveal opportunities for upselling or bundling products to increase the average order value.

Percentage of repeat customers¶

In [24]:
# Calculate the percentage of repeat customers
total_customers = data['customer_id'].nunique()
repeat_customers = (data['customer_id'].value_counts() > 1).sum()
percentage_repeat_customers = (repeat_customers / total_customers) * 100

# Display the percentage of repeat customers
print(f"Percentage of Repeat Customers: {percentage_repeat_customers:.2f}%")

# Plotting the percentage of repeat customers
labels = ['Repeat Customers', 'One-time Customers']
sizes = [percentage_repeat_customers, 100 - percentage_repeat_customers]
colors = ['#ff9999','#66b3ff']
explode = (0.1, 0)  # explode the first slice

plt.figure(figsize=(8, 8))
plt.pie(sizes, explode=explode, labels=labels, colors=colors, autopct='%1.1f%%', shadow=True, startangle=140)
plt.title('Percentage of Repeat Customers')
plt.show()
Percentage of Repeat Customers: 34.67%

Observations¶

  • Repeat Customer Rate:

    • Approximately one-third of the customers are repeat customers. This indicates a good level of customer retention and loyalty.

Average rating and order cost for repeat customers vs one-time customers¶

In [25]:
# Identify repeat and one-time customers
customer_order_counts = data['customer_id'].value_counts()
repeat_customers_ids = customer_order_counts[customer_order_counts > 1].index
one_time_customers_ids = customer_order_counts[customer_order_counts == 1].index

# Separate data for repeat and one-time customers
repeat_customers_data = data[data['customer_id'].isin(repeat_customers_ids)]
one_time_customers_data = data[data['customer_id'].isin(one_time_customers_ids)]

# Calculate average rating and order cost for repeat customers
repeat_customers_avg_rating = repeat_customers_data[repeat_customers_data['rating'] != 'Not given']['rating'].astype(float).mean()
repeat_customers_avg_order_cost = repeat_customers_data['cost_of_the_order'].mean()

# Calculate average rating and order cost for one-time customers
one_time_customers_avg_rating = one_time_customers_data[one_time_customers_data['rating'] != 'Not given']['rating'].astype(float).mean()
one_time_customers_avg_order_cost = one_time_customers_data['cost_of_the_order'].mean()

# Display the results
print(f"Average Rating for Repeat Customers: {repeat_customers_avg_rating:.2f}")
print(f"Average Order Cost for Repeat Customers: ${repeat_customers_avg_order_cost:.2f}")
print(f"Average Rating for One-time Customers: {one_time_customers_avg_rating:.2f}")
print(f"Average Order Cost for One-time Customers: ${one_time_customers_avg_order_cost:.2f}")

# Plotting the results
categories = ['Repeat Customers', 'One-time Customers']
avg_ratings = [repeat_customers_avg_rating, one_time_customers_avg_rating]
avg_order_costs = [repeat_customers_avg_order_cost, one_time_customers_avg_order_cost]

fig, ax1 = plt.subplots(figsize=(10, 6))

color = 'tab:blue'
ax1.set_xlabel('Customer Type')
ax1.set_ylabel('Average Rating', color=color)
ax1.bar(categories, avg_ratings, color=color, alpha=0.6, label='Average Rating')
ax1.tick_params(axis='y', labelcolor=color)

ax2 = ax1.twinx()
color = 'tab:green'
ax2.set_ylabel('Average Order Cost ($)', color=color)
ax2.plot(categories, avg_order_costs, color=color, marker='o', linestyle='-', linewidth=2, markersize=8, label='Average Order Cost')
ax2.tick_params(axis='y', labelcolor=color)

fig.tight_layout()
plt.title('Average Rating and Order Cost for Repeat Customers vs One-time Customers')
fig.legend(loc='upper right', bbox_to_anchor=(0.9,0.9))
plt.show()
Average Rating for Repeat Customers: 4.34
Average Order Cost for Repeat Customers: $16.49
Average Rating for One-time Customers: 4.35
Average Order Cost for One-time Customers: $16.52

Observations¶

Average Rating:

  • The average rating for both repeat and one-time customers is very close, around 4.34 and 4.35 respectively, indicating similar satisfaction levels.

Average Order Cost:

  • The average order cost for repeat customers is $16.49 and for one-time customers is $16.52. This suggests that there isn't a significant difference in spending behavior between the two groups.
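
The claim of "no significant difference" could be checked formally with a two-sample test. A minimal sketch using Welch's t-test on synthetic stand-ins for the two groups (the sample sizes and spread here are illustrative assumptions, not the dataset's actual values):

```python
import numpy as np
from scipy.stats import ttest_ind

rng = np.random.default_rng(0)
# Hypothetical samples centered near the observed means ($16.49 vs $16.52)
repeat_costs = rng.normal(16.49, 7.0, 500)
one_time_costs = rng.normal(16.52, 7.0, 800)

# Welch's t-test (unequal variances); a large p-value means
# no evidence of a difference between the group means
t_stat, p_value = ttest_ind(repeat_costs, one_time_costs, equal_var=False)
print(f"t = {t_stat:.2f}, p = {p_value:.3f}")
```

Running the same test on `repeat_customers_data['cost_of_the_order']` and `one_time_customers_data['cost_of_the_order']` would quantify whether the $0.03 gap is distinguishable from noise.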

Cost Analysis¶

  • Order Cost by Cuisine Type

  • Avg Cost by Day of the Week and Cuisine Type

  • Order Frequency by Weekday/Weekend and Cuisine Type

  • Order Cost by Preparation Time, Delivery Time and Total Time

  • Order Frequency by Order Cost

Order Cost by Cuisine Type¶

In [52]:
# Box Plot of Order Cost by Cuisine Type

plt.figure(figsize=(12, 6))
sns.boxplot(x='cuisine_type', y='cost_of_the_order', data=data)
plt.title('Box Plot of Order Cost by Cuisine Type')
plt.xlabel('Cuisine Type')
plt.ylabel('Order Cost ($)')
plt.xticks(rotation=90)
plt.show()

# Encode the 'cuisine_type' column
data['cuisine_type_encoded'] = data['cuisine_type'].astype('category').cat.codes

# Calculate the Pearson correlation coefficient between encoded cuisine type and order cost
correlation_cost_cuisine = data['cuisine_type_encoded'].corr(data['cost_of_the_order'])

# Print the Pearson correlation coefficient
print(f"Pearson correlation coefficient between encoded cuisine type and order cost: {correlation_cost_cuisine:.2f}")
Pearson correlation coefficient between encoded cuisine type and order cost: 0.04

Observations¶

The very weak correlation between cuisine type and order cost indicates that, while there may be some variation in costs among different cuisines, the overall impact of cuisine type on order cost is minimal. Other factors are likely more significant in determining the cost of an order.
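
One caveat worth flagging: `.cat.codes` assigns integer codes in alphabetical order, so a Pearson coefficient computed on them depends on an arbitrary label ordering. A toy sketch shows that relabeling the same categories changes the coefficient:

```python
import pandas as pd

toy = pd.DataFrame({
    'cuisine': ['A', 'A', 'B', 'B', 'C', 'C'],
    'cost':    [10,  12,  30,  32,  20,  22],
})

codes_1 = toy['cuisine'].astype('category').cat.codes      # A=0, B=1, C=2
codes_2 = toy['cuisine'].map({'A': 0, 'C': 1, 'B': 2})     # swap B and C

r1 = codes_1.corr(toy['cost'])
r2 = codes_2.corr(toy['cost'])
print(r1, r2)  # different coefficients for the same underlying data
```

For a nominal variable like cuisine type, per-group means (as in the box plot above) or an ANOVA-style comparison describe the relationship more faithfully than Pearson correlation on encoded labels.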

Average Cost by Day of the Week and Cuisine Type¶

In [53]:
# Create a pivot table for average order cost by day of the week and cuisine type
pivot_table = data.pivot_table(values='cost_of_the_order', index='day_of_the_week', columns='cuisine_type', aggfunc='mean')

# Visualize the pivot table as a heatmap
plt.figure(figsize=(12, 6))
sns.heatmap(pivot_table, annot=True, cmap='Reds')
plt.title('Average Order Cost by Day of the Week and Cuisine Type')
plt.xlabel('Cuisine Type')
plt.ylabel('Day of the Week')
plt.show()

# Manually encode the 'cuisine_type' column
data['cuisine_type_encoded'] = data['cuisine_type'].astype('category').cat.codes

# Calculate the Pearson correlation coefficient between encoded cuisine type and order cost
correlation = data['cuisine_type_encoded'].corr(data['cost_of_the_order'])

# Print the result
print(f"Pearson correlation coefficient between encoded cuisine type and order cost: {correlation:.2f}")
Pearson correlation coefficient between encoded cuisine type and order cost: 0.04

Observations¶

Summary:

  • Pearson Correlation Coefficient: 0.04
  • Strength of Correlation: Negligible (0.00 to 0.10: Negligible correlation)

This means that there is virtually no meaningful relationship between cuisine type and order cost, whether on weekdays or weekends; the type of cuisine has minimal impact on the cost of the order on either.

Order Frequency by Weekday/Weekend and Cuisine Type¶

In [54]:
# Create a pivot table for order frequency by Weekday and Weekend and cuisine type
order_frequency = data.pivot_table(values='order_id', index='day_of_the_week', columns='cuisine_type', aggfunc='count')

# Visualize the heatmap with adjusted plot width to avoid using exponents
plt.figure(figsize=(14, 8))
sns.heatmap(order_frequency, annot=True, fmt='d', cmap='Reds')
plt.title('Heatmap of Order Frequency by Weekday/Weekend and Cuisine Type')
plt.xlabel('Cuisine Type')
plt.ylabel('Weekday/Weekend')
plt.show()

Observations¶

Higher Order Frequencies on Weekends:

  • The heatmap shows that the order frequencies are generally higher on Weekends compared to Weekdays across various cuisine types.

  • For example, American, Chinese, Italian and Japanese cuisines have appreciably higher order frequencies on Weekends.

Popular Cuisine Types:

  • American and Japanese cuisines have the highest order frequencies on both Weekdays and Weekends.

  • Chinese and Italian cuisines also show high order frequencies, especially on Weekends.

Consistent Popularity:

  • American, Italian and Japanese cuisines maintain relatively high order frequencies throughout both Weekdays and Weekends, indicating steady demand for these cuisines.

Less Popular Cuisines:

  • Cuisines like French, Southern, Thai, and Vietnamese have lower order frequencies across both Weekdays and Weekends.

  • These cuisines might have a niche customer base or less overall demand.

Weekday vs Weekend Trends:

  • There is a noticeable increase in order frequencies for many cuisines on Weekends compared to Weekdays. This indicates a higher demand for food delivery services during weekends.

Order Cost by Preparation Time, Delivery Time and Total Time¶

In [55]:
# Calculate total time (preparation time + delivery time)
data['total_time'] = data['food_preparation_time'] + data['delivery_time']

# Plotting Order Cost by Preparation Time
plt.figure(figsize=(10, 6))
plt.scatter(data['cost_of_the_order'], data['food_preparation_time'], alpha=0.6, edgecolors='w', linewidth=0.5)
plt.title('Order Cost by Preparation Time')
plt.xlabel('Order Cost ($)')
plt.ylabel('Preparation Time (minutes)')
plt.grid(True)
plt.show()

# Plotting Order Cost by Delivery Time
plt.figure(figsize=(10, 6))
plt.scatter(data['cost_of_the_order'], data['delivery_time'], alpha=0.6, edgecolors='w', linewidth=0.5)
plt.title('Order Cost by Delivery Time')
plt.xlabel('Order Cost ($)')
plt.ylabel('Delivery Time (minutes)')
plt.grid(True)
plt.show()

# Plotting Order Cost by Total Time
plt.figure(figsize=(10, 6))
plt.scatter(data['cost_of_the_order'], data['total_time'], alpha=0.6, edgecolors='w', linewidth=0.5)
plt.title('Order Cost by Total Time')
plt.xlabel('Order Cost ($)')
plt.ylabel('Total Time (minutes)')
plt.grid(True)
plt.show()

# Calculate Pearson correlation scores
pearson_cost_prep_time = data['cost_of_the_order'].corr(data['food_preparation_time'])
pearson_cost_delivery_time = data['cost_of_the_order'].corr(data['delivery_time'])
pearson_cost_total_time = data['cost_of_the_order'].corr(data['total_time'])

print(f"Pearson Correlation Score between 'cost_of_the_order' and 'food_preparation_time': {pearson_cost_prep_time:.3f}")
print(f"Pearson Correlation Score between 'cost_of_the_order' and 'delivery_time': {pearson_cost_delivery_time:.3f}")
print(f"Pearson Correlation Score between 'cost_of_the_order' and 'total_time': {pearson_cost_total_time:.3f}")
Pearson Correlation Score between 'cost_of_the_order' and 'food_preparation_time': 0.042
Pearson Correlation Score between 'cost_of_the_order' and 'delivery_time': -0.030
Pearson Correlation Score between 'cost_of_the_order' and 'total_time': 0.006

Observations¶

Pearson Correlation Scores

  • Order Cost and Preparation Time:

    • Pearson correlation score: 0.042

    • Interpretation: Very weak positive correlation, indicating almost no linear relationship between the cost of an order and the time it takes to prepare.

  • Order Cost and Delivery Time:

    • Pearson correlation score: -0.030

    • Interpretation: Very weak negative correlation, suggesting almost no linear relationship between the cost of an order and its delivery time.

  • Order Cost and Total Time:

    • Pearson correlation score: 0.006

    • Interpretation: Very weak positive correlation, indicating almost no linear relationship between the cost of an order and the total time (preparation + delivery).

Scatter Plots

  • Order Cost by Preparation Time:

    • The scatter plot shows a dispersed pattern with no clear trend, indicating that both low-cost and high-cost orders can have a wide range of preparation times.

  • Order Cost by Delivery Time:

    • The scatter plot again shows a dispersed pattern with no clear trend, suggesting that delivery times are not significantly influenced by the cost of the order.

  • Order Cost by Total Time:

    • The scatter plot shows a dispersed pattern with no clear trend, indicating that the total time (preparation + delivery) does not significantly correlate with the cost of the order.

Order Frequency by Order Cost¶

In [56]:
from scipy.stats import linregress

# Calculate the frequency of orders per customer
order_frequency = data['customer_id'].value_counts().reset_index()
order_frequency.columns = ['customer_id', 'order_count']

# Merge the order frequency with the original dataset to get order costs
merged_data = pd.merge(data, order_frequency, on='customer_id')

# Calculate the average order cost for different order frequencies
avg_cost_by_frequency = merged_data.groupby('order_count')['cost_of_the_order'].mean().reset_index()
avg_cost_by_frequency.columns = ['order_count', 'average_order_cost']

# Style the table using pandas Styler
styled_avg_cost_by_frequency = avg_cost_by_frequency.style.background_gradient(cmap='viridis').set_caption("Average Order Cost by Order Frequency")

# Display the styled table
styled_avg_cost_by_frequency

# Plotting the average order cost by order frequency with trendline
plt.figure(figsize=(10, 6))
plt.plot(avg_cost_by_frequency['order_count'], avg_cost_by_frequency['average_order_cost'], marker='o', linestyle='-', linewidth=2, markersize=8, label='Average Order Cost')

# Add a trendline
slope, intercept, r_value, p_value, std_err = linregress(avg_cost_by_frequency['order_count'], avg_cost_by_frequency['average_order_cost'])
plt.plot(avg_cost_by_frequency['order_count'], intercept + slope * avg_cost_by_frequency['order_count'], 'r', label=f'Trendline (slope={slope:.2f}, p-value={p_value:.2f})')

plt.title('Average Order Cost by Order Frequency')
plt.xlabel('Order Frequency')
plt.ylabel('Average Order Cost ($)')
plt.legend()
plt.grid(True)
plt.show()

# Display the results
#print(avg_cost_by_frequency)

# Pearson correlation for detailed analysis
correlation_score = avg_cost_by_frequency['order_count'].corr(avg_cost_by_frequency['average_order_cost'])
print(f"Pearson Correlation Score between 'order_count' and 'average_order_cost': {correlation_score:.3f}")
Pearson Correlation Score between 'order_count' and 'average_order_cost': 0.186

Observations¶

Pearson Correlation:

  • The Pearson correlation score between order_count and average_order_cost is 0.186. This indicates a weak positive correlation, suggesting a slight tendency for the average order cost to increase as order frequency increases.

Trendline:

  • The trendline in the plot, along with its slope and p-value, indicates whether there is a significant trend in the data.

Data Points:

  • Customers who place orders infrequently (1-4 times) have relatively stable average order costs, ranging from around $16 to $17.

  • Customers with higher order frequencies (5-13) show more variation in average order costs, with a notable spike for those placing 8 orders ($22.98).
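
As an aside, the `value_counts()`-plus-`merge` step above can be collapsed into a single `transform` call. A minimal sketch on toy data (the column names mirror the dataset, the values are made up):

```python
import pandas as pd

toy = pd.DataFrame({
    'customer_id':       [1, 1, 2, 3, 3, 3],
    'cost_of_the_order': [10.0, 14.0, 20.0, 9.0, 11.0, 13.0],
})

# Attach each customer's order count directly, without a separate merge
toy['order_count'] = toy.groupby('customer_id')['customer_id'].transform('size')

avg_cost_by_frequency = (toy.groupby('order_count')['cost_of_the_order']
                            .mean().reset_index())
print(avg_cost_by_frequency)
```

`transform('size')` broadcasts the per-group count back onto every row, so the result is already aligned with the original frame.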

Day of the Week Analysis¶

  • Preparation Time, Delivery Time, Total Time by Day of the week

  • Order Frequency by Day of the Week

Preparation Time, Delivery Time, Total Time by Day of the week¶

In [31]:
# Calculate total time (preparation time + delivery time)
data['total_time'] = data['food_preparation_time'] + data['delivery_time']

# Group by day of the week and calculate average times
avg_times_by_day = data.groupby('day_of_the_week').agg({
    'food_preparation_time': 'mean',
    'delivery_time': 'mean',
    'total_time': 'mean'
}).reset_index()

# Style the table using pandas Styler
styled_avg_times_by_day = avg_times_by_day.style.background_gradient(cmap='viridis').set_caption("Average Times by Day of the Week")

# Display the styled table
styled_avg_times_by_day

# Define the x-axis labels with weekday/weekend context
x_labels = ['Weekday', 'Weekend']

# Plotting the average times by day of the week side by side
fig, ax = plt.subplots(1, 3, figsize=(18, 6), sharey=True)

# Preparation Time
ax[0].bar(avg_times_by_day['day_of_the_week'], avg_times_by_day['food_preparation_time'], color='skyblue')
ax[0].set_title('Average Preparation Time')
#ax[0].set_xlabel('Day of the Week')
ax[0].set_ylabel('Time (minutes)')
ax[0].grid(True)
ax[0].set_xticks(range(len(avg_times_by_day['day_of_the_week'])))
ax[0].set_xticklabels(x_labels)

# Delivery Time
ax[1].bar(avg_times_by_day['day_of_the_week'], avg_times_by_day['delivery_time'], color='lightgreen')
ax[1].set_title('Average Delivery Time')
ax[1].set_xlabel('Day of the Week')
ax[1].grid(True)
ax[1].set_xticks(range(len(avg_times_by_day['day_of_the_week'])))
ax[1].set_xticklabels(x_labels)

# Total Time
ax[2].bar(avg_times_by_day['day_of_the_week'], avg_times_by_day['total_time'], color='salmon')
ax[2].set_title('Average Total Time')
#ax[2].set_xlabel('Day of the Week')
ax[2].grid(True)
ax[2].set_xticks(range(len(avg_times_by_day['day_of_the_week'])))
ax[2].set_xticklabels(x_labels)

plt.tight_layout()
plt.show()

Observations¶

Average Preparation Time:

  • Preparation times are consistent across both weekdays and weekends, with slight variations.

  • Average preparation time ranges from approximately 23 to 25 minutes.

Average Delivery Time:

  • Delivery times show some variation across both weekdays and weekends.

  • Average delivery time ranges from about 22 to 24 minutes.

Average Total Time:

  • Total time, being the sum of preparation and delivery times, follows a similar pattern.

  • Average total time varies from around 46 to 49 minutes.

Order Frequency by Day of the Week¶

In [32]:
# Calculate the order frequency by day of the week
order_frequency_by_day = data['day_of_the_week'].value_counts().reset_index()
order_frequency_by_day.columns = ['day_of_the_week', 'order_count']

# Style the table using pandas Styler
styled_order_frequency_by_day = order_frequency_by_day.style.background_gradient(cmap='viridis').set_caption("Order Frequency by Day of the Week")

# Display the styled table
styled_order_frequency_by_day

# Plotting the order frequency by day of the week
# Note: value_counts() sorts by count, so use its own category labels rather than
# hard-coded ['Weekday', 'Weekend'] tick labels, which would mislabel the bars
plt.figure(figsize=(10, 6))
plt.bar(order_frequency_by_day['day_of_the_week'], order_frequency_by_day['order_count'], color='lightblue')
plt.title('Order Frequency by Day of the Week')
plt.xlabel('Day of the Week (Weekday/Weekend)')
plt.ylabel('Order Count')
plt.grid(True)
plt.show()

Observations¶

Order Frequency on Weekdays vs. Weekends:

Weekends:

  • Orders placed on weekends have a noticeably higher frequency than orders placed on weekdays, consistent with the cuisine-type heatmap above.

Weekdays:

  • Weekday orders are less frequent in total, even though weekdays span five days versus the weekend's two.

Data Distribution:

  • Because the weekend total accumulates over only two days, the per-day demand on weekends is markedly higher than on weekdays; normalizing the counts to orders per day makes the comparison more direct.
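
The five-weekday vs. two-weekend-day imbalance can be handled by normalizing the totals to orders per day. A quick sketch with hypothetical totals (substitute the real `value_counts()` output for the actual dataset):

```python
import pandas as pd

# Hypothetical order totals for illustration only
order_counts = pd.Series({'Weekend': 1300, 'Weekday': 600})
days_in_period = pd.Series({'Weekend': 2, 'Weekday': 5})

# Per-day rate; Series division aligns on the index labels
orders_per_day = order_counts / days_in_period
print(orders_per_day)
```

With these illustrative numbers, a weekend day carries several times the order volume of a weekday, even though raw weekly totals look closer.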

Ratings Analysis¶

  • Ratings by Cuisine Type

  • Rating Distribution by Order Cost

  • Rating Distribution by Total Wait Time (Prep Time + Delivery Time)

  • Ratings by day of the week

  • Order Frequency vs Ratings

Ratings by Cuisine Type¶

In [33]:
# Plot of Customer Ratings by Cuisine Type

# Data cleaning: convert 'Not given' ratings to NaN so they are excluded from averages
# (assigning a dropna()'d Series back to the column relies on index alignment;
# converting in place is clearer and equivalent)
data['rating'] = pd.to_numeric(data['rating'].replace('Not given', pd.NA))

# Calculate the average ratings by cuisine type
average_ratings = data.groupby('cuisine_type')['rating'].mean().reset_index()

# Count the 'Not given' ratings by cuisine type
not_given_ratings = data['rating'].replace('Not given', pd.NA).isna().groupby(data['cuisine_type']).sum().reset_index(name='Not_given_count')

# Merge both dataframes
combined_data = pd.merge(average_ratings, not_given_ratings, on='cuisine_type')

# Plot the combined data
fig, ax1 = plt.subplots(figsize=(14, 8))

# Plot average ratings
ax1.set_xlabel('Cuisine Type')
ax1.set_ylabel('Average Rating', color='tab:blue')
ax1.bar(combined_data['cuisine_type'], combined_data['rating'], color='skyblue', label='Average Rating')
ax1.tick_params(axis='y', labelcolor='tab:blue')
ax1.set_xticks(range(len(combined_data)))
ax1.set_xticklabels(combined_data['cuisine_type'], rotation=45, ha='right')

# Create a secondary y-axis to plot 'Not given' counts
ax2 = ax1.twinx()
ax2.set_ylabel('Count of "Not given" Ratings', color='tab:red')
ax2.plot(combined_data['cuisine_type'], combined_data['Not_given_count'], color='salmon', marker='o', label='Not Given Count')
ax2.tick_params(axis='y', labelcolor='tab:red')

# Title and layout
plt.title('Average Customer Ratings and Count of "Not given" Ratings by Cuisine Type')
fig.tight_layout()
plt.show()

# Display the combined data table summary using pandas Styler
combined_data_styled = combined_data.style.set_caption("Summary of Average Ratings and Count of 'Not given' Ratings by Cuisine Type")
combined_data_styled

# Encode the 'cuisine_type' column using pandas' factorize
data['cuisine_type_encoded'] = pd.factorize(data['cuisine_type'])[0]

# Ensure no NaNs or infinite values in the columns
cleaned_data = data.dropna(subset=['cuisine_type_encoded', 'rating'])

# Calculate the Pearson correlation coefficient using scipy
pearson_corr_cuisine_rating, _ = pearsonr(cleaned_data['cuisine_type_encoded'], cleaned_data['rating'])

# Print the Pearson correlation coefficient rounded to two decimal places
print(f"The Pearson correlation coefficient between cuisine type and rating is {pearson_corr_cuisine_rating:.2f}")

# Calculate the average ratings by cuisine type
average_ratings = cleaned_data.groupby('cuisine_type')['rating'].mean().reset_index()

# Count the 'Not given' ratings by cuisine type
not_given_ratings = data['rating'].replace('Not given', pd.NA).isna().groupby(data['cuisine_type']).sum().reset_index(name='Not_given_count')

# Merge both dataframes
combined_data = pd.merge(average_ratings, not_given_ratings, on='cuisine_type')

# Sort the combined data by average rating in descending order
combined_data_sorted = combined_data.sort_values(by='rating', ascending=False)

# Display the sorted combined data table summary using pandas Styler
combined_data_sorted_styled = combined_data_sorted.style.set_caption("Summary of Average Ratings and Count of 'Not given' Ratings by Cuisine Type (Sorted by Rating)")
combined_data_sorted_styled
The Pearson correlation coefficient between cuisine type and rating is -0.01
Out[33]:
Summary of Average Ratings and Count of 'Not given' Ratings by Cuisine Type (Sorted by Rating)
  cuisine_type rating Not_given_count
11 Spanish 4.833333 6
12 Thai 4.666667 10
3 Indian 4.540000 23
8 Mexican 4.416667 29
5 Japanese 4.373626 197
4 Italian 4.360465 126
1 Chinese 4.338346 82
10 Southern 4.307692 4
2 French 4.300000 8
0 American 4.298913 216
9 Middle Eastern 4.235294 15
7 Mediterranean 4.218750 14
6 Korean 4.111111 4
13 Vietnamese 4.000000 2

Observations¶

  • Highest Average Rating: Spanish (4.83)
  • Lowest Average Rating: Vietnamese (4.00)
  • Most "Not given" Ratings: American (216)
  • Least "Not given" Ratings: Vietnamese (2)

The Pearson correlation coefficient between the encoded cuisine type and rating is -0.01, indicating a very weak negative correlation, suggesting that the type of cuisine has virtually no linear relationship with the ratings given by customers.

Rating Distribution by Order Cost¶

In [61]:
# Rating Distribution by Order Cost

# Original unreadable plot
#plt.figure(figsize=(12, 6))
#sns.boxplot(x='cost_of_the_order', y='rating', data=data)
#plt.title('Box Plot of Customer Ratings by Order Cost')
#plt.xlabel('Order Cost ($)')
#plt.ylabel('Customer Rating')
#plt.xticks(rotation=90)
#plt.show()

# To make this readable we will first need to bin the data and then create the plot

# Bin the order costs into categories
bin_labels = ['< $10', '$10 - $20', '$20 - $30', '$30 - $40', '$40 - $50', '> $50']
clean_data = data.copy()
clean_data['cost_bin'] = pd.cut(clean_data['cost_of_the_order'], bins=[0, 10, 20, 30, 40, 50, float('inf')], labels=bin_labels)

# Calculate the mean and median for each bin
mean_ratings = clean_data.groupby('cost_bin')['rating'].mean()
median_ratings = clean_data.groupby('cost_bin')['rating'].median()

# Plotting a box plot with binned order costs
plt.figure(figsize=(10, 6))
sns.boxplot(x='cost_bin', y='rating', data=clean_data)

# Add mean and median points
plt.plot(mean_ratings.index, mean_ratings.values, color='red', marker='o', linestyle='-', label='Mean')
plt.plot(median_ratings.index, median_ratings.values, color='blue', marker='x', linestyle='-', label='Median')

# Customize the plot
plt.title('Box Plot of Customer Ratings by Binned Order Cost')
plt.xlabel('Order Cost Bins')
plt.ylabel('Customer Rating')
plt.legend()
plt.show()

Observations¶

Median Ratings:

  • The median customer ratings for various order costs are generally high, often around 4 to 5 stars.

  • This suggests a consistent level of customer satisfaction across different order costs.

Spread of Ratings:

  • For most order costs, the ratings have a relatively narrow interquartile range (IQR), indicating that the majority of ratings are close to the median.

  • Some order costs show wider IQRs, indicating more variability in customer satisfaction.

Outliers:

  • There are noticeable outliers at both ends of the rating scale for several order costs, indicating that some customers have had significantly better or worse experiences compared to the majority.

  • These outliers are more frequent at lower order costs, suggesting variability in satisfaction for cheaper orders.

High Cost Orders:

  • For higher order costs (e.g., above $30), the ratings tend to have less variability and are consistently high. This could indicate that customers who spend more tend to have better or more consistent experiences.

Low Cost Orders:

  • For lower order costs, the spread of ratings is wider with more outliers, indicating more varied customer experiences. This could be due to a variety of factors, such as expectations not being met or variability in service and product quality.
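
One subtlety of the binning above: `pd.cut` uses right-closed intervals by default, so a $10.00 order lands in the '< $10' bin and a $20.00 order in '$10 - $20'. A minimal check of the edge behavior:

```python
import pandas as pd

bin_labels = ['< $10', '$10 - $20', '$20 - $30', '$30 - $40', '$40 - $50', '> $50']
costs = pd.Series([9.99, 10.00, 10.01, 20.00, 20.01])

# Default right=True gives intervals (0, 10], (10, 20], ...
bins = pd.cut(costs, bins=[0, 10, 20, 30, 40, 50, float('inf')], labels=bin_labels)
print(list(bins))
```

If the labels are meant to be read as left-inclusive (e.g. '$10 - $20' covering $10.00), passing `right=False` to `pd.cut` would match them exactly.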

Rating Distribution by Total Wait Time (Prep Time + Delivery Time)¶

In [62]:
# Rating Distribution by Total Wait Time (Prep Time + Delivery Time)

# Data cleaning: ensure 'Not given' ratings are NaN (a no-op if the column
# was already converted to numeric in the earlier cell)
data['rating'] = pd.to_numeric(data['rating'].replace('Not given', pd.NA))

# Calculate the total wait time
data['total_wait_time'] = data['food_preparation_time'] + data['delivery_time']

# Ensure no NaNs or infinite values in the columns
cleaned_data = data.dropna(subset=['total_wait_time', 'rating'])

# Scatter Plot (kept for reference) - this wasn't very intuitive to visualize in this way
#plt.figure(figsize=(12, 6))
#sns.scatterplot(x='total_wait_time', y='rating', data=cleaned_data)
#plt.title('Rating Distribution by Total Wait Time (Prep Time + Delivery Time)')
#plt.xlabel('Total Wait Time (minutes)')
#plt.ylabel('Rating')
#plt.show()

# Hexbin Plot
plt.figure(figsize=(12, 6))
hb = plt.hexbin(cleaned_data['total_wait_time'], cleaned_data['rating'], gridsize=30, cmap='viridis', mincnt=1)
plt.colorbar(hb, label='Count')
plt.title('Rating Distribution by Total Wait Time (Prep Time + Delivery Time)')
plt.xlabel('Total Wait Time (minutes)')
plt.ylabel('Rating')
plt.show()

# Correlation Analysis
correlation, _ = pearsonr(cleaned_data['total_wait_time'], cleaned_data['rating'])
print(f"Pearson correlation coefficient between total wait time and rating: {correlation:.2f}")
Pearson correlation coefficient between total wait time and rating: -0.01

Observations¶

Pearson Correlation Coefficient:

The Pearson correlation coefficient between total wait time and rating is -0.01, indicating a very weak negative correlation. This suggests that there is virtually no linear relationship between total wait time and customer ratings.

Hexbin Plot:

The hexbin plot visually confirms the lack of a clear pattern or trend between total wait time and ratings. Ratings are spread across different total wait times, showing no strong correlation.

Average Rating by Day of the Week (excludes 'Not given')¶

In [36]:
# Calculate average rating by day of the week
# First, exclude rows where rating is 'Not given' (copy to avoid SettingWithCopyWarning)
filtered_data = data[data['rating'] != 'Not given'].copy()
filtered_data['rating'] = filtered_data['rating'].astype(float)

avg_ratings_by_day = filtered_data.groupby('day_of_the_week')['rating'].mean().reset_index()
avg_ratings_by_day.columns = ['day_of_the_week', 'average_rating']

# Style the table using pandas Styler
styled_avg_ratings_by_day = avg_ratings_by_day.style.background_gradient(cmap='viridis').set_caption("Average Ratings by Day of the Week")

# Display the styled table
styled_avg_ratings_by_day

# Define the x-axis labels with weekday/weekend context
x_labels = ['Weekday', 'Weekend']

# Plotting the average ratings by day of the week
plt.figure(figsize=(10, 6))
plt.bar(avg_ratings_by_day['day_of_the_week'], avg_ratings_by_day['average_rating'], color='lightcoral')
plt.title('Average Ratings by Day of the Week')
plt.xlabel('Day of the Week')
plt.ylabel('Average Rating')
plt.xticks(ticks=range(len(avg_ratings_by_day['day_of_the_week'])), labels=x_labels)
plt.ylim(0, 5)
plt.grid(True)
plt.show()

Observations¶

Data and Visualization:

  • The dataset has been filtered to exclude rows where the rating is 'Not given'.

Average Ratings on Weekdays vs. Weekends:

  • Weekdays: The average rating on weekdays is slightly lower than on weekends.

  • Weekends: The average rating on weekends is slightly higher compared to weekdays.

Data Distribution:

  • The average rating on weekdays hovers around 4.3 to 4.4.

  • The average rating on weekends hovers around 4.4 to 4.5.

  • With an average of roughly 4.35 on weekdays versus 4.45 on weekends, the data reflects a slight improvement in customer satisfaction on weekends.

Order Frequency vs Ratings¶

In [37]:
# Calculate order frequency for each customer
order_frequency = data['customer_id'].value_counts().reset_index()
order_frequency.columns = ['customer_id', 'order_count']

# Merge the order frequency with the original dataset to get ratings
merged_data = pd.merge(data, order_frequency, on='customer_id')

# Filter out rows where rating is 'Not given' (copy to avoid SettingWithCopyWarning)
filtered_merged_data = merged_data[merged_data['rating'] != 'Not given'].copy()
filtered_merged_data['rating'] = filtered_merged_data['rating'].astype(float)

# Calculate the average rating for different order frequencies
avg_rating_by_frequency = filtered_merged_data.groupby('order_count')['rating'].mean().reset_index()
avg_rating_by_frequency.columns = ['order_count', 'average_rating']

# Style the table using pandas Styler
styled_avg_rating_by_frequency = avg_rating_by_frequency.style.background_gradient(cmap='viridis').set_caption("Average Rating by Order Frequency")

# Display the styled table
styled_avg_rating_by_frequency

# Plotting the average rating by order frequency
plt.figure(figsize=(10, 6))
plt.plot(avg_rating_by_frequency['order_count'], avg_rating_by_frequency['average_rating'], marker='o', linestyle='-', color='blue')
plt.title('Average Rating by Order Frequency')
plt.xlabel('Order Frequency')
plt.ylabel('Average Rating')
plt.ylim(0, 5)
plt.grid(True)
plt.show()

Observations¶

General Trend:

  • The average ratings for customers with different order frequencies are generally high, ranging between 4.28 and 4.43.

  • There is no significant downward trend in ratings with increasing order frequency, indicating consistent customer satisfaction across different order frequencies.

Specific Patterns:

  • There are slight variations in ratings at different order frequencies, but these variations are not drastic.

  • Customers who placed eight orders have the highest average rating (4.43), while those with nine orders have the lowest average rating (4.28).

Time Analysis¶

  • Delivery Time vs Food Prep Time
  • Delivery Time by Cuisine Type
  • Preparation Time by Cuisine Type
  • Total Time by Cuisine Type
  • Average Delivery Time by Day of the Week and Cuisine Type
  • Average Preparation Time by Day of the Week and Cuisine Type
  • Average Total Time by Day of the Week and Cuisine Type
  • Order Frequency vs Preparation, Delivery and Total Time
  • Order Frequency by Day of the Week

Delivery Time vs Food Prep Time¶

In [38]:
# Delivery time vs prep time

# Calculate the Pearson correlation coefficient
correlation = data['food_preparation_time'].corr(data['delivery_time'])

#Facet grid
g = sns.FacetGrid(data, col="cuisine_type", col_wrap=4, height=4)
g.map(sns.scatterplot, "food_preparation_time", "delivery_time", alpha=0.5)
g.set_titles("{col_name}")
g.set_axis_labels("Food Preparation Time (minutes)", "Delivery Time (minutes)")
plt.show()

# Print the result
print(f"Pearson correlation coefficient between food preparation time and delivery time: {correlation:.2f}")
Pearson correlation coefficient between food preparation time and delivery time: 0.01

Observations:¶

The analysis reveals a very weak correlation between food preparation time and delivery time, suggesting that factors other than preparation time are more significant in determining delivery time.

The facet grid further illustrates the lack of a strong relationship across different cuisine types, highlighting variability and potential influences from other operational aspects.

Delivery Time by Cuisine Type¶

In [39]:
# Box Plot of Delivery Time by Cuisine Type
plt.figure(figsize=(12, 6))
sns.boxplot(x='cuisine_type', y='delivery_time', data=data)
plt.title('Box Plot of Delivery Time by Cuisine Type')
plt.xlabel('Cuisine Type')
plt.ylabel('Delivery Time (minutes)')
plt.xticks(rotation=90)
plt.show()

Observations¶

Variation in Delivery Time:

  • There is noticeable variation in delivery times across different cuisine types.

Cuisine with Longest Median Delivery Time:

  • Some cuisines, such as Mediterranean, show longer median delivery times.

Cuisine with Shortest Median Delivery Time:

  • Other cuisines, like Japanese and Italian, tend to have shorter median delivery times.

Presence of Outliers:

Outliers are present in the delivery times for most cuisine types, indicating occasional significantly longer or shorter delivery times than the median.

Spread of Delivery Time:

The interquartile ranges (IQRs) differ among cuisines, with some cuisines showing a wide range of delivery times (e.g., American), while others have a more consistent delivery time (e.g., Mexican).
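The spread differences read off the box plot can be quantified directly by computing Q1, Q3, and the IQR per cuisine. A sketch, again with a synthetic stand-in for `data`:

```python
import numpy as np
import pandas as pd

# Hedged sketch: IQR of delivery_time per cuisine_type via grouped quantiles.
# Synthetic stand-in frame; column names match the notebook's dataset.
rng = np.random.default_rng(1)
df = pd.DataFrame({
    'cuisine_type': rng.choice(['American', 'Mexican', 'Japanese'], size=300),
    'delivery_time': rng.integers(15, 34, size=300),
})

q = df.groupby('cuisine_type')['delivery_time'].quantile([0.25, 0.75]).unstack()
q.columns = ['q1', 'q3']
q['iqr'] = q['q3'] - q['q1']
print(q.sort_values('iqr', ascending=False))  # widest-spread cuisines first
```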

Preparation Time by Cuisine Type¶

In [40]:
# Box Plot of Food Preparation Time by Cuisine Type
plt.figure(figsize=(12, 6))
sns.boxplot(x='cuisine_type', y='food_preparation_time', data=data)
plt.title('Box Plot of Food Preparation Time by Cuisine Type')
plt.xlabel('Cuisine Type')
plt.ylabel('Food Preparation Time (minutes)')
plt.xticks(rotation=90)
plt.show()

Total Time by Cuisine Type¶

In [41]:
# Calculate total time (preparation time + delivery time)
data['total_time'] = data['food_preparation_time'] + data['delivery_time']

# Group by cuisine type and calculate average total time
avg_total_time_by_cuisine = data.groupby('cuisine_type')['total_time'].mean().reset_index()
avg_total_time_by_cuisine.columns = ['cuisine_type', 'average_total_time']

# Style the table using pandas Styler
styled_avg_total_time_by_cuisine = avg_total_time_by_cuisine.style.background_gradient(cmap='viridis').set_caption("Average Total Time by Cuisine Type")

# Display the styled table
styled_avg_total_time_by_cuisine

# Plotting the average total time by cuisine type
plt.figure(figsize=(12, 6))
sns.barplot(x='cuisine_type', y='average_total_time', data=avg_total_time_by_cuisine, palette='viridis')
plt.title('Average Total Time by Cuisine Type')
plt.xlabel('Cuisine Type')
plt.ylabel('Average Total Time (minutes)')
plt.xticks(rotation=90)
plt.show()

Observations¶

Longest Average Total Time:

  • Italian Cuisine: The average total time for Italian cuisine is the highest at approximately 52.64 minutes. This indicates that Italian dishes might require longer preparation and delivery times.

Shortest Average Total Time:

  • Fast Food Cuisine: The average total time for fast food is the lowest at approximately 30.04 minutes. This suggests that fast food items are quicker to prepare and deliver.

Other Notable Cuisines:

  • Japanese Cuisine: The average total time for Japanese cuisine is around 44.27 minutes, indicating moderate preparation and delivery times.

  • Indian Cuisine: The average total time for Indian cuisine is approximately 49.16 minutes, indicating a higher total time similar to Italian cuisine.

Variability:

  • There is a noticeable variability in the total times across different cuisine types. Some cuisines consistently require more time, while others are faster.


Average Delivery Time by Day of the Week and Cuisine Type¶

In [42]:
# Pivot Table of Average Delivery Time by Weekday/Weekend and Cuisine Type
# Create a pivot table for average delivery time by Weekday/Weekend and cuisine type
pivot_table_delivery_time = data.pivot_table(values='delivery_time', index='day_of_the_week', columns='cuisine_type', aggfunc='mean')

# Visualize the pivot table as a heatmap
plt.figure(figsize=(14, 8))
sns.heatmap(pivot_table_delivery_time, annot=True, cmap='Reds')
plt.title('Average Delivery Time by Weekday/Weekend and Cuisine Type')
plt.xlabel('Cuisine Type')
plt.ylabel('Weekday/Weekend')
plt.show()

pivot_table_delivery_time
Out[42]:
cuisine_type American Chinese French Indian Italian Japanese Korean Mediterranean Mexican Middle Eastern Southern Spanish Thai Vietnamese
day_of_the_week
Weekday 28.248521 28.826923 27.200000 27.625000 28.802198 28.133333 26.0 28.785714 28.250000 28.470588 29.500000 28.0 26.500000 27.666667
Weekend 22.542169 22.269939 24.615385 22.346939 22.705314 22.519403 20.0 21.312500 22.641509 21.750000 20.727273 23.0 22.266667 25.000000

Observations¶

Weekday vs. Weekend Delivery Times:

  • Weekdays: Delivery times are generally higher across most cuisine types compared to weekends.
  • Weekends: Delivery times tend to be lower, indicating faster deliveries on weekends.

Cuisine Types with Longest Delivery Times on Weekdays:

  • Southern: Average delivery time is approximately 29.50 minutes.

  • Chinese: Average delivery time is approximately 28.83 minutes.

  • Italian: Average delivery time is approximately 28.80 minutes.

  • Mediterranean: Average delivery time is approximately 28.79 minutes.

Cuisine Types with Shortest Delivery Times on Weekdays:

  • Korean: Average delivery time is approximately 26.00 minutes.

  • Thai: Average delivery time is approximately 26.50 minutes.

Cuisine Types with Longest Delivery Times on Weekends:

  • Vietnamese: Average delivery time is approximately 25.00 minutes.

  • French: Average delivery time is approximately 24.62 minutes.

Cuisine Types with Shortest Delivery Times on Weekends:

  • Korean: Average delivery time is approximately 20.00 minutes.

  • Southern: Average delivery time is approximately 20.73 minutes.

  • Mediterranean: Average delivery time is approximately 21.31 minutes.

Comparison Between Weekdays and Weekends:

  • Most cuisine types have a significant reduction in delivery times on weekends compared to weekdays.

  • The difference in delivery times between weekdays and weekends is more pronounced for certain cuisines like American, Chinese, and Italian.
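The per-cuisine weekday/weekend gap can be pulled straight from the pivot table as a row difference. A sketch using a hand-copied subset of the pivot values above (remaining columns omitted for brevity):

```python
import pandas as pd

# Hedged sketch: the weekday-to-weekend slowdown per cuisine is just a row
# difference on a pivot shaped like pivot_table_delivery_time above.
pivot = pd.DataFrame(
    {'American': [28.25, 22.54], 'Chinese': [28.83, 22.27], 'Korean': [26.0, 20.0]},
    index=['Weekday', 'Weekend'],
)

gap = (pivot.loc['Weekday'] - pivot.loc['Weekend']).sort_values(ascending=False)
print(gap)  # cuisines with the largest weekday slowdown first
```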

Average Preparation Time by Day of the Week and Cuisine Type¶

In [43]:

# Create a pivot table for average preparation time by Weekday/Weekend and cuisine type
pivot_table_preparation_time = data.pivot_table(values='food_preparation_time', index='day_of_the_week', columns='cuisine_type', aggfunc='mean')

# Visualize the pivot table as a heatmap
plt.figure(figsize=(14, 8))
sns.heatmap(pivot_table_preparation_time, annot=True, cmap='Blues')
plt.title('Average Preparation Time by Weekday/Weekend and Cuisine Type')
plt.xlabel('Cuisine Type')
plt.ylabel('Weekday/Weekend')
plt.show()

pivot_table_preparation_time
Out[43]:
cuisine_type American Chinese French Indian Italian Japanese Korean Mediterranean Mexican Middle Eastern Southern Spanish Thai Vietnamese
day_of_the_week
Weekday 27.408284 28.076923 27.200000 26.333333 27.417582 27.281481 23.000000 25.785714 25.625000 27.235294 26.166667 33.000000 25.250000 23.666667
Weekend 27.453012 27.331288 26.769231 27.489796 27.512077 27.602985 25.909091 27.531250 27.226415 26.375000 28.363636 26.363636 27.866667 27.250000

Observations¶

Weekday vs. Weekend Preparation Times:

  • Weekends: Unlike delivery times, preparation times are slightly higher on weekends for most cuisine types, likely reflecting heavier kitchen load from the larger weekend order volume.

  • Weekdays: A few cuisines (Spanish, Chinese, French, and Middle Eastern) run longer on weekdays, with Spanish showing the largest gap (33.00 vs. 26.36 minutes).

Cuisine Types with Longest Preparation Times on Weekdays:

  • Spanish: Average preparation time is approximately 33.00 minutes.

  • Chinese: Average preparation time is approximately 28.08 minutes.

  • Italian: Average preparation time is approximately 27.42 minutes.

Cuisine Types with Shortest Preparation Times on Weekdays:

  • Korean: Average preparation time is approximately 23.00 minutes.

  • Vietnamese: Average preparation time is approximately 23.67 minutes.

  • Thai: Average preparation time is approximately 25.25 minutes.

Cuisine Types with Longest Preparation Times on Weekends:

  • Southern: Average preparation time is approximately 28.36 minutes.

  • Thai: Average preparation time is approximately 27.87 minutes.

  • Japanese: Average preparation time is approximately 27.60 minutes.

Cuisine Types with Shortest Preparation Times on Weekends:

  • Korean: Average preparation time is approximately 25.91 minutes.

  • Spanish: Average preparation time is approximately 26.36 minutes.

  • Middle Eastern: Average preparation time is approximately 26.38 minutes.

Comparison Between Weekdays and Weekends:

  • The difference in preparation times between weekdays and weekends is more pronounced for certain cuisines like Spanish and Southern.

  • Some cuisines have minimal differences in preparation times between weekdays and weekends, indicating consistent preparation times.

Average Total Time by Day of the Week and Cuisine Type¶

In [44]:
# Calculate total time (preparation time + delivery time)
data['total_time'] = data['food_preparation_time'] + data['delivery_time']

# Create a pivot table for average total time by Weekday/Weekend and cuisine type
pivot_table_total_time = data.pivot_table(values='total_time', index='day_of_the_week', columns='cuisine_type', aggfunc='mean')

# Visualize the pivot table as a heatmap
plt.figure(figsize=(14, 8))
sns.heatmap(pivot_table_total_time, annot=True, cmap='Greens')
plt.title('Average Total Time by Weekday/Weekend and Cuisine Type')
plt.xlabel('Cuisine Type')
plt.ylabel('Weekday/Weekend')
plt.show()

pivot_table_total_time
Out[44]:
cuisine_type American Chinese French Indian Italian Japanese Korean Mediterranean Mexican Middle Eastern Southern Spanish Thai Vietnamese
day_of_the_week
Weekday 55.656805 56.903846 54.400000 53.958333 56.219780 55.414815 49.000000 54.571429 53.875000 55.705882 55.666667 61.000000 51.750000 51.333333
Weekend 49.995181 49.601227 51.384615 49.836735 50.217391 50.122388 45.909091 48.843750 49.867925 48.125000 49.090909 49.363636 50.133333 52.250000

Observations¶

Weekday vs. Weekend Total Times:

  • Weekdays: Total times are generally higher across most cuisine types compared to weekends.

  • Weekends: Total times are generally lower, indicating quicker overall preparation and delivery on weekends.

Cuisine Types with Longest Total Times on Weekdays:

  • Spanish: Average total time is approximately 61.00 minutes.
  • Chinese: Average total time is approximately 56.90 minutes.
  • Italian: Average total time is approximately 56.22 minutes.
  • Middle Eastern: Average total time is approximately 55.71 minutes.
  • Southern: Average total time is approximately 55.67 minutes.

Cuisine Types with Shortest Total Times on Weekdays:

  • Korean: Average total time is approximately 49.00 minutes.
  • Vietnamese: Average total time is approximately 51.33 minutes.
  • Thai: Average total time is approximately 51.75 minutes.

Cuisine Types with Longest Total Times on Weekends:

  • Vietnamese: Average total time is approximately 52.25 minutes.
  • French: Average total time is approximately 51.38 minutes.
  • Italian: Average total time is approximately 50.22 minutes.

Cuisine Types with Shortest Total Times on Weekends:

  • Korean: Average total time is approximately 45.91 minutes.
  • Middle Eastern: Average total time is approximately 48.13 minutes.
  • Mediterranean: Average total time is approximately 48.84 minutes.
  • Southern: Average total time is approximately 49.09 minutes.

Comparison Between Weekdays and Weekends:

  • The difference in total times between weekdays and weekends is more pronounced for certain cuisines like Spanish, Chinese, and Italian.

  • Some cuisines have minimal differences in total times between weekdays and weekends, indicating consistent preparation and delivery times.

Order Frequency vs Preparation, Delivery and Total Time¶

In [45]:
# Calculate total time (preparation time + delivery time)
data['total_time'] = data['food_preparation_time'] + data['delivery_time']

# Calculate order frequency for each customer
order_frequency = data['customer_id'].value_counts().reset_index()
order_frequency.columns = ['customer_id', 'order_count']

# Merge the order frequency with the original dataset
merged_data = pd.merge(data, order_frequency, on='customer_id')

# Calculate the average preparation, delivery, and total time for different order frequencies
avg_times_by_frequency = merged_data.groupby('order_count').agg({
    'food_preparation_time': 'mean',
    'delivery_time': 'mean',
    'total_time': 'mean'
}).reset_index()
avg_times_by_frequency.columns = ['order_count', 'average_preparation_time', 'average_delivery_time', 'average_total_time']

# Plotting the average preparation, delivery, and total time by order frequency side by side
fig, ax = plt.subplots(1, 3, figsize=(18, 6), sharey=True)

# Preparation Time
sns.lineplot(ax=ax[0], x='order_count', y='average_preparation_time', data=avg_times_by_frequency, marker='o', color='skyblue', label='Preparation Time')
ax[0].set_title('Average Preparation Time by Order Frequency')
#ax[0].set_xlabel('Order Frequency')
ax[0].set_ylabel('Average Time (minutes)')
ax[0].grid(True)
ax[0].legend()

# Delivery Time
sns.lineplot(ax=ax[1], x='order_count', y='average_delivery_time', data=avg_times_by_frequency, marker='o', color='lightgreen', label='Delivery Time')
ax[1].set_title('Average Delivery Time by Order Frequency')
ax[1].set_xlabel('Order Frequency')
ax[1].grid(True)
ax[1].legend()

# Total Time
sns.lineplot(ax=ax[2], x='order_count', y='average_total_time', data=avg_times_by_frequency, marker='o', color='salmon', label='Total Time')
ax[2].set_title('Average Total Time by Order Frequency')
#ax[2].set_xlabel('Order Frequency')
ax[2].grid(True)
ax[2].legend()

plt.tight_layout()
plt.show()

Observations¶

Average Preparation Time:

  • 1 Order: Average preparation time is approximately 27.58 minutes.
  • 2 Orders: Slightly decreases to approximately 27.25 minutes.
  • 3 Orders: Decreases further to approximately 26.83 minutes.
  • 4 Orders: Increases to approximately 27.47 minutes.
  • 8 Orders: Significantly drops to approximately 25.00 minutes.
  • 9 Orders: Peaks at approximately 29.33 minutes.

Average Delivery Time:

  • 1 Order: Average delivery time is approximately 24.36 minutes.
  • 2 Orders: Slightly decreases to approximately 23.70 minutes.
  • 3 Orders: Slightly increases to approximately 24.32 minutes.
  • 5 Orders: Peaks slightly at approximately 24.47 minutes.
  • 9 Orders: Drops to approximately 22.11 minutes.

Average Total Time:

  • 1 Order: Average total time is approximately 51.93 minutes.
  • 2 Orders: Decreases slightly to approximately 50.96 minutes.
  • 3 Orders: Slightly increases to approximately 51.14 minutes.
  • 8 Orders: Drops significantly to approximately 48.25 minutes.
  • 10 Orders: Holds steady at approximately 51.20 minutes.
  • 13 Orders: Increases to approximately 52.46 minutes.

Insights

  • General Trend:

    • Preparation, delivery, and total times show some fluctuation with the number of orders placed by customers.
    • Customers with a higher frequency of orders tend to have slightly more efficient preparation times around eight orders, but this is not consistent.
  • Efficiency Peaks:

    • A notable drop in preparation and total times occurs at 8 orders, suggesting possible improvements in efficiency for these frequent customers.
  • Inconsistencies:

    • The preparation time increases again for customers with 9 orders, suggesting possible inconsistencies or operational challenges for this group.

  • Consistent Delivery Times:

    • Delivery times remain relatively stable with minor fluctuations across different order frequencies.

Order Frequency by Day of the Week¶

In [46]:
# Calculate the order frequency by day of the week
order_frequency_by_day = data['day_of_the_week'].value_counts().reset_index()
order_frequency_by_day.columns = ['day_of_the_week', 'order_count']

# Style the table using pandas Styler
styled_order_frequency_by_day = order_frequency_by_day.style.background_gradient(cmap='viridis').set_caption("Order Frequency by Day of the Week")

# Display the styled table
styled_order_frequency_by_day

# Plotting the order frequency by day of the week
plt.figure(figsize=(10, 6))
sns.barplot(x='day_of_the_week', y='order_count', data=order_frequency_by_day, palette='viridis')
plt.title('Order Frequency by Day of the Week')
plt.xlabel('Day of the Week')
plt.ylabel('Order Count')
#plt.xticks(rotation=90)
plt.grid(True)
plt.show()

Observations¶

  • Order Frequency on Weekdays vs. Weekends:

    • Weekends: There is a significantly higher order frequency on weekends, with 1351 orders.

    • Weekdays: The order frequency on weekdays is lower, with 547 orders.

  • Higher Order Volumes on Weekends:

    • The data clearly shows that customers place more orders on weekends compared to weekdays. This could be due to various factors such as more free time, social gatherings, and special occasions.
  • Lower Order Volumes on Weekdays:

    • The lower order frequency on weekdays suggests that customers may be busier with work and other commitments, leading to fewer orders.
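One caveat on the raw counts: the weekend bucket spans 2 of 7 days while the weekday bucket spans 5, so normalizing by days gives a fairer per-day demand comparison. A quick check using the counts from the output above:

```python
# Hedged sketch: divide raw order counts (from the notebook output) by the
# number of days in each bucket to get a per-day demand rate.
orders = {'Weekend': 1351, 'Weekday': 547}
days_in_bucket = {'Weekend': 2, 'Weekday': 5}

per_day = {k: orders[k] / days_in_bucket[k] for k in orders}
print(per_day)  # → {'Weekend': 675.5, 'Weekday': 109.4}
```

The per-day gap is even starker than the raw counts suggest, reinforcing the weekend-demand conclusion.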

Outlier Analysis¶

In [73]:
# Convert 'rating' to numeric, replacing 'Not given' with NaN
data['rating'] = pd.to_numeric(data['rating'].replace('Not given', pd.NA), errors='coerce')

# Calculate the total wait time
data['total_wait_time'] = data['food_preparation_time'] + data['delivery_time']

# Identify numeric columns
numeric_columns = data.select_dtypes(include=[np.number]).columns
numeric_columns = numeric_columns.drop('customer_id')   # Exclude 'customer_id' from the analysis
numeric_columns = numeric_columns.drop('order_id')      # Exclude 'order_id' from the analysis

# Determine the number of rows and columns for the subplot grid
num_cols = 5
num_rows = math.ceil(len(numeric_columns) / num_cols)

# Set up the subplots
fig, axes = plt.subplots(nrows=num_rows, ncols=num_cols, figsize=(15, num_rows * 5))
colors = sns.color_palette("husl", len(numeric_columns))  # Generate a color palette

# Flatten the axes array for easy iteration
axes = axes.flatten()

# Step 2 & 3: Visualize Outliers and Identify Outliers using IQR method for each numeric column
outlier_counts = {}

for i, col in enumerate(numeric_columns):
    sns.boxplot(x=data[col], ax=axes[i], color=colors[i])
    axes[i].set_title(f'Box Plot of {col}')
    axes[i].set_xlabel(col)

    # Calculate IQR
    Q1 = data[col].quantile(0.25)
    Q3 = data[col].quantile(0.75)
    IQR = Q3 - Q1

    # Define outliers
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Identify outliers
    outliers = data[(data[col] < lower_bound) | (data[col] > upper_bound)]
    outlier_counts[col] = len(outliers)

    # Print outliers count
    print(f"Number of outliers detected in {col}: {len(outliers)}")

# Remove any empty subplots
for j in range(i + 1, len(axes)):
    fig.delaxes(axes[j])

# Adjust layout
plt.tight_layout()
plt.show()

# Print summary of outliers in each column
print("\nSummary of Outlier Counts:")
for col, count in outlier_counts.items():
    print(f"{col}: {count} outliers")

# Optional: Remove outliers (across all numeric columns)
# Note: rows containing NaN (e.g., orders whose rating was 'Not given') fail
# both bound checks below and are removed too, which is why the record count drops.
for col in numeric_columns:
    Q1 = data[col].quantile(0.25)
    Q3 = data[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Remove outliers
    data = data[(data[col] >= lower_bound) & (data[col] <= upper_bound)]

# Summary of the cleaned data
print(f"\nData after removing outliers: {len(data)} records remaining.")
Number of outliers detected in cost_of_the_order: 0
Number of outliers detected in rating: 0
Number of outliers detected in food_preparation_time: 0
Number of outliers detected in delivery_time: 0
Number of outliers detected in total_wait_time: 0
Summary of Outlier Counts:
cost_of_the_order: 0 outliers
rating: 0 outliers
food_preparation_time: 0 outliers
delivery_time: 0 outliers
total_wait_time: 0 outliers

Data after removing outliers: 1162 records remaining.
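The record count falls even though zero outliers were detected: the IQR filter also drops every row whose rating is NaN (the unrated orders), because comparisons against NaN evaluate to False. A minimal demonstration:

```python
import numpy as np
import pandas as pd

# Demonstration: comparisons against NaN evaluate to False, so a mask of the
# form (col >= lower) & (col <= upper) silently drops rows with missing values.
s = pd.Series([3.0, np.nan, 5.0])
mask = (s >= 1) & (s <= 5)
print(mask.tolist())  # → [True, False, True]; the NaN row is excluded
```

If unrated orders should be kept, the mask could be OR-ed with `s.isna()` before filtering.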

Question 13: The company wants to provide a promotional offer in the advertisement of the restaurants. The condition to get the offer is that the restaurants must have a rating count of more than 50 and the average rating should be greater than 4. Find the restaurants fulfilling the criteria to get the promotional offer¶

In [47]:
# Filter out rows where rating is 'Not given' (copy to avoid a SettingWithCopyWarning)
filtered_data = data[data['rating'] != 'Not given'].copy()
filtered_data['rating'] = filtered_data['rating'].astype(float)

# Calculate the rating count and average rating for each restaurant
restaurant_ratings = filtered_data.groupby('restaurant_name')['rating'].agg(['count', 'mean']).reset_index()
restaurant_ratings.columns = ['restaurant_name', 'rating_count', 'average_rating']

# Filter the restaurants that meet the criteria: rating count > 50 and average rating > 4
eligible_restaurants = restaurant_ratings[(restaurant_ratings['rating_count'] > 50) & (restaurant_ratings['average_rating'] > 4)]

# Style the table using pandas Styler with all column headers centered and data left-justified, except for rating_count data which is centered
styled_eligible_restaurants = (eligible_restaurants.style
                               .set_properties(subset=['restaurant_name'], **{'text-align': 'left'})
                               .set_properties(subset=['rating_count'], **{'text-align': 'center'})
                               .set_table_styles([{
                                   'selector': 'th',
                                   'props': [('text-align', 'center')]
                               }])
                               .set_caption("Restaurants Eligible for Promotional Offer"))

# Display the styled table
styled_eligible_restaurants
Out[47]:
Restaurants Eligible for Promotional Offer
  restaurant_name rating_count average_rating
20 Blue Ribbon Fried Chicken 64 4.328125
21 Blue Ribbon Sushi 73 4.219178
136 Shake Shack 133 4.278195
153 The Meatball Shop 84 4.511905

Question 14: The company charges the restaurant 25% on the orders having cost greater than 20 dollars and 15% on the orders having cost greater than 5 dollars. Find the net revenue generated by the company across all orders¶

In [48]:
# Define the function to calculate the commission based on order cost
def calculate_commission(cost):
    if cost > 20:
        return cost * 0.25
    elif cost > 5:
        return cost * 0.15
    else:
        return 0

# Apply the function to calculate the commission for each order
data['commission'] = data['cost_of_the_order'].apply(calculate_commission)

# Calculate the total revenue generated by the company
total_revenue = data['commission'].sum()

# Print the total revenue generated by the company with a formatted string, rounded to 2 decimal places
print(f"The total net revenue generated by the company across all orders, based on the given commission structure, is ${total_revenue:.2f}")
The total net revenue generated by the company across all orders, based on the given commission structure, is $6166.30
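The same tiered commission can be computed without a row-wise `apply` by using `np.select`, which is typically faster on large frames and gives identical results. A sketch with hypothetical order costs:

```python
import numpy as np
import pandas as pd

# Hedged sketch: vectorized tiered commission with np.select.
# The three costs below are hypothetical, one per tier.
costs = pd.Series([4.0, 12.0, 25.0])

commission = pd.Series(
    np.select(
        [costs > 20, costs > 5],       # conditions checked in order
        [costs * 0.25, costs * 0.15],  # matching commission rates
        default=0.0,                   # orders of $5 or less earn nothing
    )
)
print(commission.round(2).tolist())  # → [0.0, 1.8, 6.25]
```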

Observations:¶

Revenue from Low-Cost Orders:

  • Orders with a cost less than or equal to $5 do not contribute to the revenue as they fall outside the commission structure.
  • The company should focus on increasing the average order value to maximize revenue.

  • Impact of Commission Structure:

    • The tiered commission structure ensures that higher-value orders contribute more to the revenue.
    • This structure incentivizes promoting higher-value items and potentially increasing the average order value.

Recommendations

  • Promote Higher-Value Items:

    • Encourage customers to place higher-value orders through promotions or bundling items, which would increase the number of orders qualifying for the 25% commission rate.
  • Increase Average Order Value:

    • Implement strategies to increase the average order value, such as offering discounts on add-ons or creating special offers for orders above a certain amount.
  • Monitor and Adjust Commission Structure:

    • Regularly review the commission structure to ensure it aligns with the company's revenue goals and market conditions.
    • Consider adjustments based on order patterns and customer feedback to optimize revenue.
  • Customer Engagement:

    • Engage customers with targeted marketing campaigns highlighting high-value items and special promotions to drive higher-order volumes.

Question 15: The company wants to analyze the total time required to deliver the food. What percentage of orders take more than 60 minutes to get delivered from the time the order is placed? (The food has to be prepared and then delivered)¶

In [49]:
# Calculate total time (preparation time + delivery time)
data['total_time'] = data['food_preparation_time'] + data['delivery_time']

# Calculate the percentage of orders that take more than 60 minutes to get delivered
total_orders = len(data)
orders_over_60_minutes = len(data[data['total_time'] > 60])
percentage_over_60_minutes = (orders_over_60_minutes / total_orders) * 100

# Print the percentage of orders that take more than 60 minutes
print(f"The percentage of orders that take more than 60 minutes to get delivered is {percentage_over_60_minutes:.2f}%")
The percentage of orders that take more than 60 minutes to get delivered is 10.54%

Observations:¶

Percentage of Orders Over 60 Minutes:

  • The analysis shows that 10.54% of the orders take more than 60 minutes to get delivered from the time the order is placed.

Implications of Long Delivery Times:

  • Orders taking more than 60 minutes could indicate inefficiencies in the preparation or delivery process.
  • Long delivery times may affect customer satisfaction and lead to negative reviews or reduced repeat business.
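To gauge how sensitive the 10.54% figure is to the chosen cutoff, the late-order share can be evaluated at several thresholds. A sketch with synthetic total times standing in for `data['total_time']`:

```python
import numpy as np
import pandas as pd

# Hedged sketch: evaluate the share of "late" orders at several cutoffs.
# total_time is synthetic, standing in for the notebook's data['total_time'].
rng = np.random.default_rng(2)
total_time = pd.Series(rng.integers(35, 70, size=1000))

for threshold in (50, 55, 60, 65):
    pct = (total_time > threshold).mean() * 100
    print(f"> {threshold} min: {pct:.2f}% of orders")
```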

Question 16: The company wants to analyze the delivery time of the orders on weekdays and weekends. How does the mean delivery time vary during weekdays and weekends?¶

In [50]:
# Calculate the mean delivery time for weekdays and weekends
mean_delivery_time_by_day = data.groupby('day_of_the_week')['delivery_time'].mean().reset_index()

# Separate weekdays and weekends
weekdays = mean_delivery_time_by_day[mean_delivery_time_by_day['day_of_the_week'] == 'Weekday']['delivery_time'].values[0]
weekends = mean_delivery_time_by_day[mean_delivery_time_by_day['day_of_the_week'] == 'Weekend']['delivery_time'].values[0]

# Print the mean delivery times
print(f"The mean delivery time on weekdays is {weekdays:.2f} minutes.")
print(f"The mean delivery time on weekends is {weekends:.2f} minutes.")
The mean delivery time on weekdays is 28.34 minutes.
The mean delivery time on weekends is 22.47 minutes.

Observations:¶

Mean Delivery Time on Weekdays:

  • The mean delivery time on weekdays is approximately 28.34 minutes.

Mean Delivery Time on Weekends:

  • The mean delivery time on weekends is approximately 22.47 minutes.
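A quick significance check supports the size of the gap: a Welch t-statistic (difference in means over its standard error, allowing unequal variances) computed on samples mimicking the observed group sizes and means. Purely illustrative; the synthetic spread of 4 minutes is an assumption:

```python
import numpy as np

# Hedged sketch: Welch t-statistic on synthetic samples mimicking the
# notebook's groups (547 weekday orders ~28.3 min, 1351 weekend ~22.5 min).
rng = np.random.default_rng(3)
weekday = rng.normal(28.3, 4.0, 547)
weekend = rng.normal(22.5, 4.0, 1351)

# Welch's t: mean difference over its standard error (unequal variances)
se = np.sqrt(weekday.var(ddof=1) / weekday.size + weekend.var(ddof=1) / weekend.size)
t_stat = (weekday.mean() - weekend.mean()) / se
print(f"Welch t = {t_stat:.2f}")  # values far above ~2 indicate a real gap
```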

Conclusion and Recommendations¶

Delivery times on weekdays are appreciably longer than on weekends, despite the generally higher order frequency on weekends.

Improve Weekday Efficiency:

  • Investigate the factors contributing to longer delivery times on weekdays. Consider optimizing delivery routes, increasing staffing levels, or streamlining operations to reduce delivery times.

Maintain Weekend Efficiency:

  • Continue leveraging the efficiencies observed on weekends. Analyze the processes that work well during weekends and see if they can be applied to weekdays.

Question 17: What are your conclusions from the analysis? What recommendations would you like to share to help improve the business? (You can use cuisine type and feedback ratings to drive your business recommendations)¶

Conclusions:¶

Long Delivery Times:

  • 10.54% of orders take more than 60 minutes to get delivered, indicating potential inefficiencies in the preparation or delivery process.

Delivery Times on Weekdays vs. Weekends:

  • Mean delivery time on weekdays is approximately 28.34 minutes. Mean delivery time on weekends is shorter, at approximately 22.47 minutes.

Highly Rated Restaurants:

  • Certain restaurants like Shake Shack and The Meatball Shop have high average ratings and a substantial number of ratings, making them prime candidates for promotional offers.

Order Frequency by Day of the Week:

  • Order frequency is significantly higher on weekends compared to weekdays, suggesting a shift in customer behavior and demand.

Recommendations:¶

Optimize Delivery Efficiency on Weekdays:

  • Route Optimization: Use advanced routing algorithms to find the most efficient delivery routes during peak weekday hours.
  • Staffing Adjustments: Increase staffing levels during peak times on weekdays to handle higher traffic and ensure timely deliveries.
  • Preparation Processes: Streamline kitchen operations to reduce preparation times, possibly by pre-prepping ingredients for popular dishes.

Leverage High Customer Ratings:

  • Promotional Campaigns: Highlight highly rated restaurants like Shake Shack and The Meatball Shop in promotional campaigns to attract more customers.
  • Customer Feedback: Regularly solicit and analyze customer feedback to identify strengths and areas for improvement. Use this feedback to maintain high service standards and improve underperforming areas.

Focus on Weekend Efficiency:

  • Best Practices: Analyze the factors contributing to shorter delivery times on weekends and apply these best practices to weekdays.
  • Weekend Promotions: Continue leveraging the high order volumes on weekends with special offers and promotions to maximize revenue.

Enhance Customer Experience:

  • Transparent Communication: Clearly communicate estimated delivery times to customers and provide real-time updates on order status.
  • Compensation for Delays: Offer compensation or discounts for orders that are significantly delayed to maintain customer trust and satisfaction.

Improve Cuisine-Specific Offerings:

  • Menu Optimization: Analyze the preparation and delivery times for different cuisine types. Focus on optimizing the processes for cuisines with longer average times (e.g., Spanish, Chinese).
  • Targeted Promotions: Offer targeted promotions for cuisines that have high ratings but longer delivery times to balance the demand and improve overall delivery efficiency.

Incentivize Higher-Value Orders:

  • Bundled Offers: Create bundled meal deals that encourage customers to place higher-value orders, which contribute more to the company’s revenue.
  • Loyalty Programs: Implement loyalty programs that reward customers for placing higher-value orders and frequent orders, driving repeat business.

In [76]:
# Convert notebook to html

import os
os.getcwd()
!jupyter nbconvert --to html "/content/drive/MyDrive/MIT - Data Sciences/Colab Notebooks/Week Two - Statistics for Data Science/Project Assessment - FoodHub/FDS_Project_LearnerNotebook_FullCode.ipynb"
[NbConvertApp] Converting notebook /content/drive/MyDrive/MIT - Data Sciences/Colab Notebooks/Week Two - Statistics for Data Science/Project Assessment - FoodHub/FDS_Project_LearnerNotebook_FullCode.ipynb to html
[NbConvertApp] Writing 3452465 bytes to /content/drive/MyDrive/MIT - Data Sciences/Colab Notebooks/Week Two - Statistics for Data Science/Project Assessment - FoodHub/FDS_Project_LearnerNotebook_FullCode.html