# import libraries for data manipulation
import numpy as np
import pandas as pd

# import libraries for data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# read the data
df = pd.read_csv('foodhub_order.csv')
# returns the first 5 rows
df.head()

#printing shape of the dataframe
print(df.shape)

(1898, 9)

# Using info function to print summary of dataframe
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1898 entries, 0 to 1897
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   order_id               1898 non-null   int64  
 1   customer_id            1898 non-null   int64  
 2   restaurant_name        1898 non-null   object 
 3   cuisine_type           1898 non-null   object 
 4   cost_of_the_order      1898 non-null   float64
 5   day_of_the_week        1898 non-null   object 
 6   rating                 1898 non-null   object 
 7   food_preparation_time  1898 non-null   int64  
 8   delivery_time          1898 non-null   int64  
dtypes: float64(1), int64(4), object(4)
memory usage: 133.6+ KB
None

#printing summary table of null values
print(df.isnull().sum())

order_id                 0
customer_id              0
restaurant_name          0
cuisine_type             0
cost_of_the_order        0
day_of_the_week          0
rating                   0
food_preparation_time    0
delivery_time            0
dtype: int64

#describing only the food_preparation_time column
print(df['food_preparation_time'].describe())

count    1898.000000
mean       27.371970
std         4.632481
min        20.000000
25%        23.000000
50%        27.000000
75%        31.000000
max        35.000000
Name: food_preparation_time, dtype: float64

#summing the number of rows in the dataframe where rating is not given
total_count = ((df['rating']=='Not given').sum())

#calculating as a percentage
percent_count = round(100*((df['rating']=='Not given').mean()),2)

#printing answers in context
print(f"total orders with no rating given: {total_count}")
print(f"percentage of no rating orders: {percent_count}%")

total orders with no rating given: 736
percentage of no rating orders: 38.78%

#calculating total time from order placed to food delivered, for univariate analysis
df['total_time'] = df['food_preparation_time']+df['delivery_time']


#separating dataframe columns into numerical and object
#numerical columns (excluding order id, customer id) 
num_cols = ['cost_of_the_order','food_preparation_time',
            'delivery_time','total_time']

#describing numerical columns
print(df[num_cols].describe())

#object columns (excluding restaurant name)
obj_cols = ['cuisine_type','day_of_the_week','rating']

#looping numerical columns to show histogram distribution
print('\033[1mUnivariate analysis of numerical columns:\033[0m\n')
for x in num_cols:
    #setting number of bins to 10 after trial and error
    df.hist(x,bins=5)
    plt.show()

#looping object (categorical) columns to show countplot distribution
print('\033[1m\nUnivariate analysis of object (categorical) columns:\033[0m\n')
for x in obj_cols:
    #showing countplot of categories
    sns.countplot(df[x])
    plt.show()
    
    #printing table of values
    print(df[x].value_counts())

       cost_of_the_order  food_preparation_time  delivery_time   total_time
count        1898.000000            1898.000000    1898.000000  1898.000000
mean           16.498851              27.371970      24.161749    51.533720
std             7.483812               4.632481       4.972637     6.833603
min             4.470000              20.000000      15.000000    35.000000
25%            12.080000              23.000000      20.000000    47.000000
50%            14.140000              27.000000      25.000000    52.000000
75%            22.297500              31.000000      28.000000    56.000000
max            35.410000              35.000000      33.000000    68.000000
Univariate analysis of numerical columns:


Univariate analysis of object (categorical) columns:

cuisine_type
American          584
Japanese          470
Italian           298
Chinese           215
Mexican            77
Indian             73
Middle Eastern     49
Mediterranean      46
Thai               19
French             18
Southern           17
Korean             13
Spanish            12
Vietnamese          7
Name: count, dtype: int64

day_of_the_week
Weekend    1351
Weekday     547
Name: count, dtype: int64

rating
Not given    736
5            588
4            386
3            188
Name: count, dtype: int64

#Getting the value counts for the top 5 restaurants, using value_counts and head functions
restaurant_counts = df['restaurant_name'].value_counts().head()

#printing the restaurant counts 
print(restaurant_counts)

restaurant_name
Shake Shack                  219
The Meatball Shop            132
Blue Ribbon Sushi            119
Blue Ribbon Fried Chicken     96
Parm                          68
Name: count, dtype: int64

#Filtering the dataset for weekend entries, getting the cuisine type column, counting values
cuisine_wknd_counts = df[df['day_of_the_week'] == 'Weekend']['cuisine_type'].value_counts()

#getting most popular cuisine type on weekends using .head(1)
cuisine_wknd_counts.head(1)

cuisine_type
American    415
Name: count, dtype: int64

#Calculating percentage of orders greater than 20 dollars 
percent_gt_20dollars = round(100*(df['cost_of_the_order'] > 20).mean(),2)

#Printing answer in context
print(f"Percentage total of order greater than 20 dollars: {percent_gt_20dollars}%")

Percentage total of order greater than 20 dollars: 29.24%

#Calculating mean delivery time
mean_delivery_time = round(df['delivery_time'].mean(),2)

#Printing answer in context
print(f"mean delivery time: {mean_delivery_time} minutes")

mean delivery time: 24.16 minutes

#Getting top 3 customers using value counts, taking top 3 using .head(3)
customer_counts = df['customer_id'].value_counts().head(3)

#printing answer
print(customer_counts)

customer_id
52832    13
47440    10
83287     9
Name: count, dtype: int64

#Showing pairplot of numerical variables in dataframe
sns.pairplot(df[num_cols])
plt.show()

# showing correlation matrix
corr_matrix = df[num_cols].corr().round(2)
print(corr_matrix)

#describing numerical columns of whole dataset for reference
print(df[num_cols].describe())

#using for loops to show boxplots of numerical variables by categorical variables 
for cat in obj_cols: #categorical variable
    for num_var in num_cols: #numerical variable
        
        #printing variables to keep track of plots
        print(f'\n\033[1mCategory Column: {cat}\nNumerical Column: {num_var}\033[0m')
        
        #creating boxplot of cat and num variables 
        sns.boxplot(x=cat,y=num_var,data=df)
        #setting xtick rotation to 45 degrees for legibility
        plt.xticks(rotation=45, ha='right')
        plt.show()

        #printing summary statistics of numerical variables by categorical variables
        stats = df.groupby(cat)[num_var].describe().round(2)
        print(f'\n\033[1mStatistics table of {num_var} against {cat}:\033[0m\n')
        print(stats)

#using crosstab to show relationship between categorical variables
i = 0
while i < len(obj_cols):
    j = 0  # reset inner loop for outer loop iterations
    while j < len(obj_cols):
        if i != j:  # avoid same column comparison
            contingency_table = pd.crosstab(df[obj_cols[i]], df[obj_cols[j]])
            
            # plotting heatmap, setting heatmap formatting to decimal (avoid scientific notation)
            sns.heatmap(contingency_table, annot=True, cmap='Oranges',fmt='d') 
            plt.title(f'Contingency Table: {obj_cols[i]} vs {obj_cols[j]}')
            plt.show()
        j += 1
    i += 1

                       cost_of_the_order  food_preparation_time  \
cost_of_the_order                   1.00                   0.04   
food_preparation_time               0.04                   1.00   
delivery_time                      -0.03                   0.01   
total_time                          0.01                   0.69   

                       delivery_time  total_time  
cost_of_the_order              -0.03        0.01  
food_preparation_time           0.01        0.69  
delivery_time                   1.00        0.74  
total_time                      0.74        1.00  
       cost_of_the_order  food_preparation_time  delivery_time   total_time
count        1898.000000            1898.000000    1898.000000  1898.000000
mean           16.498851              27.371970      24.161749    51.533720
std             7.483812               4.632481       4.972637     6.833603
min             4.470000              20.000000      15.000000    35.000000
25%            12.080000              23.000000      20.000000    47.000000
50%            14.140000              27.000000      25.000000    52.000000
75%            22.297500              31.000000      28.000000    56.000000
max            35.410000              35.000000      33.000000    68.000000

Category Column: cuisine_type
Numerical Column: cost_of_the_order

Statistics table of cost_of_the_order against cuisine_type:

                count   mean   std    min    25%    50%    75%    max
cuisine_type                                                         
American        584.0  16.32  7.46   4.71  12.08  14.12  22.26  33.18
Chinese         215.0  16.31  7.63   4.75  12.10  14.12  21.88  34.19
French           18.0  19.79  6.51  11.98  13.05  20.47  23.66  29.25
Indian           73.0  16.92  7.90   5.34  11.64  16.15  24.20  33.03
Italian         298.0  16.42  7.27   5.05  12.08  14.48  22.26  33.03
Japanese        470.0  16.30  7.42   4.47  12.08  14.07  21.83  33.37
Korean           13.0  14.00  7.56   5.77  11.84  12.18  14.02  30.75
Mediterranean    46.0  15.47  7.60   5.67  12.09  14.00  18.78  35.41
Mexican          77.0  16.93  7.56   4.85  12.08  15.57  22.36  33.32
Middle Eastern   49.0  18.82  8.09   5.77  12.95  16.30  25.27  32.93
Southern         17.0  19.30  7.97   7.38  12.47  16.11  29.15  31.43
Spanish          12.0  18.99  6.66  12.13  12.92  16.52  24.49  29.10
Thai             19.0  19.21  7.58   6.69  12.71  19.35  24.20  32.93
Vietnamese        7.0  12.88  5.32   6.01  10.02  12.13  14.87  22.26

Category Column: cuisine_type
Numerical Column: food_preparation_time

Statistics table of food_preparation_time against cuisine_type:

                count   mean   std   min    25%   50%   75%   max
cuisine_type                                                     
American        584.0  27.44  4.66  20.0  23.00  27.0  31.0  35.0
Chinese         215.0  27.51  4.76  20.0  23.50  27.0  32.0  35.0
French           18.0  26.89  5.04  21.0  22.50  25.5  31.0  35.0
Indian           73.0  27.11  4.62  20.0  23.00  27.0  31.0  35.0
Italian         298.0  27.48  4.52  20.0  23.00  28.0  31.0  35.0
Japanese        470.0  27.51  4.60  20.0  24.00  27.5  32.0  35.0
Korean           13.0  25.46  3.97  20.0  23.00  25.0  26.0  33.0
Mediterranean    46.0  27.00  4.91  20.0  23.00  27.0  31.0  35.0
Mexican          77.0  26.73  4.60  20.0  23.00  26.0  31.0  35.0
Middle Eastern   49.0  26.67  4.01  20.0  24.00  27.0  30.0  34.0
Southern         17.0  27.59  5.52  20.0  24.00  26.0  33.0  35.0
Spanish          12.0  26.92  4.50  20.0  23.75  27.5  28.5  35.0
Thai             19.0  27.32  5.50  21.0  22.00  28.0  32.5  35.0
Vietnamese        7.0  25.71  5.15  20.0  21.50  25.0  29.5  33.0

Category Column: cuisine_type
Numerical Column: delivery_time

Statistics table of delivery_time against cuisine_type:

                count   mean   std   min    25%   50%    75%   max
cuisine_type                                                      
American        584.0  24.19  4.83  15.0  20.00  25.0  28.00  33.0
Chinese         215.0  23.86  4.97  15.0  20.00  24.0  28.00  33.0
French           18.0  25.33  3.74  17.0  23.25  27.0  28.75  29.0
Indian           73.0  24.08  5.05  15.0  20.00  25.0  29.00  32.0
Italian         298.0  24.57  5.18  15.0  20.00  26.0  28.75  33.0
Japanese        470.0  24.13  4.99  15.0  20.00  25.0  28.00  33.0
Korean           13.0  20.92  4.03  16.0  17.00  20.0  26.00  26.0
Mediterranean    46.0  23.59  5.42  15.0  19.00  24.0  28.75  33.0
Mexican          77.0  24.39  4.56  16.0  21.00  25.0  28.00  33.0
Middle Eastern   49.0  24.08  5.66  15.0  19.00  25.0  29.00  33.0
Southern         17.0  23.82  5.96  15.0  21.00  25.0  28.00  33.0
Spanish          12.0  23.42  4.81  17.0  19.75  23.0  28.25  30.0
Thai             19.0  23.16  4.99  15.0  19.50  24.0  26.50  32.0
Vietnamese        7.0  26.14  3.89  19.0  25.00  26.0  28.50  31.0

Category Column: cuisine_type
Numerical Column: total_time

Statistics table of total_time against cuisine_type:

                count   mean   std   min    25%   50%    75%   max
cuisine_type                                                      
American        584.0  51.63  6.62  35.0  47.00  51.0  56.00  68.0
Chinese         215.0  51.37  7.36  35.0  46.00  51.0  56.50  68.0
French           18.0  52.22  6.03  43.0  49.00  50.5  56.75  64.0
Indian           73.0  51.19  6.52  36.0  47.00  51.0  56.00  65.0
Italian         298.0  52.05  6.76  35.0  47.25  53.0  56.00  67.0
Japanese        470.0  51.64  6.99  36.0  47.00  52.0  56.00  68.0
Korean           13.0  46.38  4.52  38.0  44.00  45.0  51.00  52.0
Mediterranean    46.0  50.59  7.21  36.0  46.25  50.0  55.00  64.0
Mexican          77.0  51.12  5.89  36.0  48.00  51.0  55.00  65.0
Middle Eastern   49.0  50.76  7.22  36.0  47.00  53.0  56.00  67.0
Southern         17.0  51.41  8.60  37.0  46.00  53.0  56.00  68.0
Spanish          12.0  50.33  8.06  38.0  45.25  49.0  57.25  64.0
Thai             19.0  50.47  7.15  37.0  47.00  49.0  57.00  61.0
Vietnamese        7.0  51.86  4.74  46.0  48.50  51.0  55.00  59.0

Category Column: day_of_the_week
Numerical Column: cost_of_the_order

Statistics table of cost_of_the_order against day_of_the_week:

                  count   mean   std   min    25%    50%    75%    max
day_of_the_week                                                       
Weekday           547.0  16.31  7.48  4.75  12.08  14.07  22.28  35.41
Weekend          1351.0  16.57  7.49  4.47  12.08  14.41  22.28  33.37

Category Column: day_of_the_week
Numerical Column: food_preparation_time

Statistics table of food_preparation_time against day_of_the_week:

                  count   mean   std   min   25%   50%   75%   max
day_of_the_week                                                   
Weekday           547.0  27.21  4.71  20.0  23.0  27.0  31.0  35.0
Weekend          1351.0  27.44  4.60  20.0  23.0  27.0  31.0  35.0

Category Column: day_of_the_week
Numerical Column: delivery_time

Statistics table of delivery_time against day_of_the_week:

                  count   mean   std   min   25%   50%   75%   max
day_of_the_week                                                   
Weekday           547.0  28.34  2.89  24.0  26.0  28.0  31.0  33.0
Weekend          1351.0  22.47  4.63  15.0  18.5  22.0  27.0  30.0

Category Column: day_of_the_week
Numerical Column: total_time

Statistics table of total_time against day_of_the_week:

                  count   mean   std   min   25%   50%   75%   max
day_of_the_week                                                   
Weekday           547.0  55.55  5.57  44.0  51.0  55.0  59.5  68.0
Weekend          1351.0  49.91  6.62  35.0  45.0  50.0  55.0  65.0

Category Column: rating
Numerical Column: cost_of_the_order

Statistics table of cost_of_the_order against rating:

           count   mean   std   min    25%    50%    75%    max
rating                                                         
3          188.0  16.22  7.84  4.75  12.04  14.12  21.83  33.37
4          386.0  16.71  7.63  4.75  12.13  14.16  22.79  35.41
5          588.0  16.97  7.45  4.47  12.13  15.20  22.80  33.03
Not given  736.0  16.09  7.33  4.66  11.83  14.07  21.50  34.19

Category Column: rating
Numerical Column: food_preparation_time

#filtering dataframe to rows with ratings, as df_rated
df_rated = df[df['rating'] != 'Not given']

#setting rating column in df_rated to int type 
df_rated.loc[:, 'rating'] = df_rated['rating'].astype(int)

#pivoting restaurant name by rating count, average rating
df_rated_pivot = df_rated.pivot_table(index='restaurant_name', values='rating', 
                                      aggfunc=['count', 'mean'])
#setting column names in pivoted df
df_rated_pivot.columns = ['rating_count', 'average_rating']

#resetting index
df_rated_pivot.reset_index(inplace=True)

#filtering promo-eligible restaurants only
df_promo = df_rated_pivot[(df_rated_pivot['rating_count']>50) & (df_rated_pivot['average_rating']>4)]

#printing final list of promo-eligible restaurants
print(df_promo)

               restaurant_name  rating_count average_rating
16   Blue Ribbon Fried Chicken            64       4.328125
17           Blue Ribbon Sushi            73       4.219178
117                Shake Shack           133       4.278195
132          The Meatball Shop            84       4.511905

#calculating 25% revenue on orders greater than 20 dollars
orders_gt_20dollars_revenue = 0.25*df[df['cost_of_the_order']>20]['cost_of_the_order'].sum()

#calculating 15% revenue on orders greater than 5 dollars, and less than 20 dollars
orders_gt_5dollars_revenue = 0.15*df[(df['cost_of_the_order']>5) & (df['cost_of_the_order']<20)]['cost_of_the_order'].sum()

#summing and printing total revenue
total_revenue = round(orders_gt_5dollars_revenue+orders_gt_20dollars_revenue,2)
print(f"Total revenue from orders greater than 5 dollars and orders greater than 20 dollars: ${total_revenue}")

Total revenue from orders greater than 5 dollars and orders greater than 20 dollars: $6166.3

#calculating total time from order placed to food delivered
#previously calculated for univariate analysis
df['total_time'] = df['food_preparation_time']+df['delivery_time']

#calculating percent of deliveries greater than 60 minutes
deliveries_gt_60mins = percent_gt_20dollars = round(100*((df['total_time']>60).sum())/len(df['total_time']),2)

#printing result
print(f"percentage of orders greater than 60 mins from order placed: {deliveries_gt_60mins}%")

percentage of orders greater than 60 mins from order placed: 10.54%

#Calculating the mean delivery time on weekdays, rounded to 2 decimal points
Wkday_mean_delivery = round(df[df['day_of_the_week']=='Weekday']['delivery_time'].mean(),2)

#Calculating the mean delivery time on weekends, rounded to 2 decimal points
Wknd_mean_delivery = round(df[df['day_of_the_week']=='Weekend']['delivery_time'].mean(),2)

#Printing mean weekday and weekend delivery times in context
print(f"Mean delivery time on weekdays: {Wkday_mean_delivery} minutes")
print(f"Mean delivery time on weekends: {Wknd_mean_delivery} minutes")

#Calculating and printing difference in mean delivery times
print(f"Mean delivery time difference: {round(Wkday_mean_delivery-Wknd_mean_delivery,2)} minutes")

Mean delivery time on weekdays: 28.34 minutes
Mean delivery time on weekends: 22.47 minutes
Mean delivery time difference: 5.87 minutes

	order_id	customer_id	restaurant_name	cuisine_type	cost_of_the_order	day_of_the_week	rating	food_preparation_time	delivery_time
0	1477147	337525	Hangawi	Korean	30.75	Weekend	Not given	25	20
1	1477685	358141	Blue Ribbon Sushi Izakaya	Japanese	12.08	Weekend	Not given	25	23
2	1477070	66393	Cafe Habana	Mexican	12.23	Weekday	5	23	28
3	1477334	106968	Blue Ribbon Fried Chicken	American	29.20	Weekend	3	25	15
4	1478249	76942	Dirty Bird to Go	American	11.59	Weekday	4	25	24

FoodHub Data Analysis¶

Context¶

Objective¶

Data Description¶

Data Dictionary¶

Let us start by importing the required libraries¶

Understanding the structure of the data¶

Observations:¶

How many rows and columns are present in the data?¶

Observations:¶

What are the datatypes of the different columns in the dataset?¶

Observations:¶

Are there any missing values in the data?¶

Observations:¶

Check the statistical summary of the data¶

Observations:¶

How many orders are not rated?¶

Observations:¶

Exploratory Data Analysis (EDA)¶

Univariate Analysis¶

Explore all the variables and provide observations on their distributions¶

Observations:¶

Which are the top 5 restaurants in terms of the number of orders received?¶

Observations:¶

Which is the most popular cuisine on weekends?¶

Observations:¶

What percentage of the orders cost more than 20 dollars?¶

Observations:¶

What is the mean order delivery time?¶

Observations:¶

The company has decided to give 20% discount vouchers to the top 3 most frequent customers. Find the IDs of these customers and the number of orders they placed.¶

Observations:¶

Multivariate Analysis¶

Perform a multivariate analysis to explore relationships between the important variables in the dataset¶

Observations:¶

Numerical-Numerical Variable Relationships:¶

Numerical-Categorical Variable Relationships:¶

Cuisine Type:¶

Day of the Week:¶

Rating:¶

Categorical-Categorical Variable Relationships:¶

Cuisine Type vs Day of the Week:¶

Cuisine Type vs Rating:¶

Rating vs Day of the Week:¶

Observations:¶

The company charges the restaurant 25% on the orders having cost greater than 20 dollars and 15% on the orders having cost greater than 5 dollars. Find the net revenue generated by the company across all orders¶

Observations:¶

The company wants to analyze the total time required to deliver the food. What percentage of orders take more than 60 minutes to get delivered from the time the order is placed?¶

Observations:¶

The company wants to analyze the delivery time of the orders on weekdays and weekends. How does the mean delivery time vary during weekdays and weekends?¶

Observations:¶

Conclusions and Recommendations from the Analysis¶

Conclusions:¶

Recommendations:¶