# Installing the libraries with the specified version.
!pip install numpy==1.25.2 pandas==1.5.3 matplotlib==3.7.1 seaborn==0.13.1 -q --user

# import libraries for data manipulation
import numpy as np
import pandas as pd

# import libraries for data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# uncomment and run the following lines for Google Colab
# from google.colab import drive
# drive.mount('/content/drive')

# Load the FoodHub orders dataset; the path is relative to the notebook's working directory.
data = pd.read_csv('foodhub_order.csv')

# Preview the first rows to confirm the file parsed into the expected columns.
data.head()

# (number of rows, number of columns) of the loaded frame.
data.shape

(1898, 9)

# Column names, non-null counts and dtypes. Note that `rating` is read as object (string), not numeric.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1898 entries, 0 to 1897
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   order_id               1898 non-null   int64  
 1   customer_id            1898 non-null   int64  
 2   restaurant_name        1898 non-null   object 
 3   cuisine_type           1898 non-null   object 
 4   cost_of_the_order      1898 non-null   float64
 5   day_of_the_week        1898 non-null   object 
 6   rating                 1898 non-null   object 
 7   food_preparation_time  1898 non-null   int64  
 8   delivery_time          1898 non-null   int64  
dtypes: float64(1), int64(4), object(4)
memory usage: 133.6+ KB

# Number of missing (NaN) values in each column.
data.isnull().sum()

order_id                 0
customer_id              0
restaurant_name          0
cuisine_type             0
cost_of_the_order        0
day_of_the_week          0
rating                   0
food_preparation_time    0
delivery_time            0
dtype: int64

# Number of rows that are exact duplicates of an earlier row.
data.duplicated().sum()

0

# Summary statistics of the numeric columns, transposed so that each column becomes a row.
data.describe().T

# Frequency of each rating label; unrated orders are stored as the string 'Not given'.
print(data['rating'].value_counts())

Not given    736
5            588
4            386
3            188
Name: rating, dtype: int64

# Distribution of cost_of_the_order: a KDE for the overall shape,
# then a boxplot for the quartiles and any outliers.
sns.displot(data=data,x='cost_of_the_order',kind='kde')
plt.show()
sns.boxplot(data=data,x='cost_of_the_order')
plt.show()

# Distribution of food_preparation_time: a KDE for the overall shape,
# then a boxplot for the quartiles and any outliers.
sns.displot(data=data,x='food_preparation_time',kind='kde')
plt.show()
sns.boxplot(data=data,x='food_preparation_time')
plt.show()

# Distribution of delivery_time: a KDE for the overall shape,
# then a boxplot for the quartiles and any outliers.
sns.displot(data=data,x='delivery_time',kind='kde')
plt.show()
sns.boxplot(data=data,x='delivery_time')
plt.show()

# Number of orders per cuisine_type; tick labels are rotated so they do not overlap.
sns.countplot(data=data,x='cuisine_type')
plt.xticks(rotation=90)
plt.show()

# Exact order counts per cuisine, most frequent first.
print(data['cuisine_type'].value_counts())

American          584
Japanese          470
Italian           298
Chinese           215
Mexican            77
Indian             73
Middle Eastern     49
Mediterranean      46
Thai               19
French             18
Southern           17
Korean             13
Spanish            12
Vietnamese          7
Name: cuisine_type, dtype: int64

# Number of orders placed on weekdays versus weekends.
sns.countplot(data=data,x='day_of_the_week')
plt.xticks(rotation=90)
plt.show()

# Exact order counts for each day_of_the_week category.
print(data['day_of_the_week'].value_counts())

Weekend    1351
Weekday     547
Name: day_of_the_week, dtype: int64

# Number of orders per restaurant. A wider figure is used because there is one
# bar per distinct restaurant name, and the labels are rotated to stay legible.
# NOTE(review): some names appear to contain characters the default font cannot
# render (see the glyph warnings this cell emits) — confirm the CSV encoding.
plt.figure(figsize=(15,5))
sns.countplot(data=data,x='restaurant_name')
plt.xticks(rotation=90)
plt.show()

C:\Users\bruce\anaconda3\Lib\site-packages\IPython\core\pylabtools.py:152: UserWarning: Glyph 140 (\x8c) missing from current font.
  fig.canvas.print_figure(bytes_io, **kw)
C:\Users\bruce\anaconda3\Lib\site-packages\IPython\core\pylabtools.py:152: UserWarning: Glyph 142 (\x8e) missing from current font.
  fig.canvas.print_figure(bytes_io, **kw)

# Exact order counts per restaurant, most frequent first.
print(data['restaurant_name'].value_counts())

Shake Shack                  219
The Meatball Shop            132
Blue Ribbon Sushi            119
Blue Ribbon Fried Chicken     96
Parm                          68
                            ... 
Sushi Choshi                   1
Dos Caminos Soho               1
La Follia                      1
Philippe Chow                  1
'wichcraft                     1
Name: restaurant_name, Length: 178, dtype: int64

# Number of orders per rating label, including the 'Not given' (unrated) category.
sns.countplot(data=data,x='rating')
plt.xticks(rotation=90)
plt.show()

# Exact order counts per rating label, to read alongside the count plot.
print(data['rating'].value_counts())

Not given    736
5            588
4            386
3            188
Name: rating, dtype: int64

# The five restaurants with the most orders (value_counts sorts in descending order).
data['restaurant_name'].value_counts().head(n=5)

Shake Shack                  219
The Meatball Shop            132
Blue Ribbon Sushi            119
Blue Ribbon Fried Chicken     96
Parm                          68
Name: restaurant_name, dtype: int64

# Order counts per cuisine, restricted to orders placed on weekends.
sns.countplot(data=data.loc[data['day_of_the_week']=="Weekend"],x='cuisine_type')
plt.xticks(rotation=90)
plt.show()

# Cuisine of every weekend order, used to rank cuisines by weekend popularity.
is_weekend_order = data['day_of_the_week'] == "Weekend"
weekend_cuisines = data.loc[is_weekend_order, 'cuisine_type']

# Per-cuisine counts; dropna=False would surface missing cuisines as their own row.
weekend_cuisine_counts = weekend_cuisines.value_counts(dropna=False)
print(weekend_cuisine_counts)

# Total number of weekend orders with a non-null cuisine (echoed by the notebook).
weekend_cuisines.count()

American          415
Japanese          335
Italian           207
Chinese           163
Mexican            53
Indian             49
Mediterranean      32
Middle Eastern     32
Thai               15
French             13
Korean             11
Southern           11
Spanish            11
Vietnamese          4
Name: cuisine_type, dtype: int64

1351

# Count the orders that cost more than 20 dollars by summing the boolean mask.
order_count_greater_than_20 = int((data['cost_of_the_order'] > 20).sum())
# Denominator: number of orders that have a (non-null) cost.
orders_with_cost = data['cost_of_the_order'].count()
# Share of orders above 20, formatted as a whole-number percentage string.
f"{order_count_greater_than_20 / orders_with_cost:.0%}"

'29%'

# Mean delivery time across all orders.
data['delivery_time'].mean()

24.161749209694417

# The three customer IDs with the most orders, together with their order counts.
data['customer_id'].value_counts().head(3)

52832    13
47440    10
83287     9
Name: customer_id, dtype: int64

# Correlation heatmap of the numeric features. The two ID columns are dropped
# first because they are identifiers rather than measurements.
data_without_id_columns = data.drop(columns=["order_id", "customer_id"])
# numeric_only=True makes the exclusion of the string columns explicit instead
# of relying on the deprecated implicit default (which raised a FutureWarning).
sns.heatmap(
    data_without_id_columns.corr(numeric_only=True),
    annot=True,
    cmap='Spectral',
    vmin=-1,
    vmax=1,
)
plt.show()

C:\Users\bruce\AppData\Local\Temp\ipykernel_23616\4285889247.py:3: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.
  sns.heatmap(data_without_id_columns.corr(),annot=True,cmap='Spectral',vmin=-1,vmax=1)

# Pairwise plots of the columns with KDEs on the diagonal. The trailing
# semicolon suppresses the notebook's echo of the returned grid object.
# NOTE(review): `data` still contains order_id and customer_id here, so they
# presumably show up in the grid — consider dropping them as the heatmap cell does.
sns.pairplot(data, diag_kind="kde");

# Scatter of preparation time against delivery time, to look for a relationship between the two.
plt.figure(figsize=(10,5))
sns.scatterplot(data=data,x='food_preparation_time',y='delivery_time')
plt.show()

# Distribution of order cost within each cuisine type.
sns.boxplot(data=data,x='cuisine_type',y='cost_of_the_order')
plt.xticks(rotation=90)
plt.show()

# Dispersion of order cost for every cuisine type: one violin plot per cuisine,
# laid out as a grid of facets with four columns.
sns.catplot(x='cost_of_the_order',
            col='cuisine_type', 
            data=data,
            col_wrap=4,
            kind="violin")
plt.show()

# Distribution of food preparation time within each cuisine type.
sns.boxplot(data=data,x='cuisine_type',y='food_preparation_time')
plt.xticks(rotation=90)
plt.show()

# Distribution of delivery time within each cuisine type.
sns.boxplot(data=data,x='cuisine_type',y='delivery_time')
plt.xticks(rotation=90)
plt.show()

# Orders per cuisine type, split into weekday and weekend bars.
sns.countplot(data=data,x='cuisine_type',hue='day_of_the_week')
plt.xticks(rotation=90)
plt.show()

# Individual order costs per cuisine type, coloured by weekday/weekend.
sns.scatterplot(data=data, x="cuisine_type", y="cost_of_the_order", hue="day_of_the_week")
plt.xticks(rotation=90)
plt.show()

# Strip plot of order cost per cuisine type, coloured by weekday/weekend.
# The trailing semicolon on the last line suppresses the notebook's echo of
# the value returned by plt.xticks.
plt.figure(figsize=(25, 7)) # Wide figure so the cuisine categories are not cramped
sns.stripplot(data = data, x = "cuisine_type", y = "cost_of_the_order", hue="day_of_the_week"
            )
plt.xticks(rotation=90);

# Keep only the orders that received a rating ('Not given' marks unrated ones).
# reset_index() renumbers the rows and keeps the old labels in an 'index' column.
has_rating = data['rating'] != 'Not given'
rated_data = data[has_rating].reset_index()

# The remaining rating labels are digit strings, so cast them to integers
# in a separate column and leave the original 'rating' column untouched.
rated_data['order_rating'] = rated_data['rating'].astype(int)
rated_data.head()

# Number of rated orders per restaurant, as a two-column frame
# (restaurant_name, rating_counts). The rows are not sorted by count.
restaurant_rating_count = (
    rated_data.groupby('restaurant_name')['order_rating']
    .count()
    .reset_index(name='rating_counts')
)

# Keep only the restaurants that have more than 50 ratings.
has_over_50_ratings = restaurant_rating_count['rating_counts'] > 50
restaurant_rating_50 = restaurant_rating_count.loc[has_over_50_ratings]
restaurant_rating_50

# Mean rating per restaurant, as a two-column frame (restaurant_name, avg_rating).
restaurant_average_rating = (
    rated_data.groupby('restaurant_name')['order_rating']
    .mean()
    .reset_index(name='avg_rating')
)

# Keep only the restaurants whose average rating is strictly greater than 4.
averages_above_4 = restaurant_average_rating['avg_rating'] > 4
rating_greater_4 = restaurant_average_rating.loc[averages_above_4]
rating_greater_4

# Restaurants that satisfy both criteria: more than 50 ratings AND an average
# rating above 4. The inner join keeps only names present in both frames.
promo_restaurants = pd.merge(
    restaurant_rating_50,
    rating_greater_4,
    how='inner',
    on='restaurant_name',
)
promo_restaurants

def calculate_revenue(cost_of_the_order):
    """Return the commission the company earns on a single order.

    Orders costing more than 20 dollars are charged 25% of their cost,
    orders costing more than 5 (and at most 20) are charged 15%, and
    every other order yields 0.
    """
    # Tiers are listed from the highest threshold down; the first one the
    # cost strictly exceeds determines the rate.
    for threshold, rate in ((20, .25), (5, .15)):
        if cost_of_the_order > threshold:
            return cost_of_the_order * rate
    return 0

# Spot-check calculate_revenue at the tier boundaries: each line prints the
# hand-computed expectation followed by the function's result; the two
# numbers on a line should match.
for order_cost, expected_rate in ((20, .15), (22, .25), (5, 0)):
    print(order_cost * expected_rate, calculate_revenue(order_cost))

3.0 3.0
5.5 5.5
0 0

# Commission earned on every order; the function is passed to map() directly.
data['revenue'] = data['cost_of_the_order'].map(calculate_revenue)

# Reference values for the first two rows (costs 30.75 and 12.08): the
# hand-computed commission next to the function's result, for comparison
# with the 'revenue' column displayed below.
print(30.75 * .25, calculate_revenue(30.75))
print(12.08 * .15, calculate_revenue(12.08))
data

7.6875 7.6875
1.8119999999999998 1.8119999999999998

# Total revenue (sum of the per-order commission) across all orders.
data['revenue'].sum()

6166.303

# Average revenue per order, rounded to two decimals.
data['revenue'].mean().round(2)

3.25

# Total time from order placement to delivery: preparation time plus delivery
# time. The columns are added directly (vectorized) rather than row by row via
# apply(axis=1), which gives the same values without a Python-level loop.
data['total_time'] = data['food_preparation_time'] + data['delivery_time']
# Sanity check: show the two inputs next to their sum.
data.loc[:, ['food_preparation_time', 'delivery_time', 'total_time']]

# Count the orders whose total time exceeds 60 by summing the boolean mask.
orders_greaterthan_60 = int((data['total_time'] > 60).sum())
# Fraction of all orders that take longer than 60, rounded to two decimals.
np.round(orders_greaterthan_60 / len(data), 2)

0.11

# Correlation heatmap of the numeric features, drawn again now that further
# columns have been added to `data`. The two ID columns are dropped first
# because they are identifiers rather than measurements.
plt.figure(figsize=(10,5))
data_without_id_columns = data.drop(columns=["order_id", "customer_id"])
# numeric_only=True makes the exclusion of the string columns explicit instead
# of relying on the deprecated implicit default (which raised a FutureWarning).
sns.heatmap(
    data_without_id_columns.corr(numeric_only=True),
    annot=True,
    cmap='Spectral',
    vmin=-1,
    vmax=1,
)
plt.show()

C:\Users\bruce\AppData\Local\Temp\ipykernel_23616\2183855587.py:3: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.
  sns.heatmap(data_without_id_columns.corr(),annot=True,cmap='Spectral',vmin=-1,vmax=1)

# Overall mean delivery time, as a baseline for the weekday/weekend comparison.
data.loc[:,'delivery_time'].mean()

24.161749209694417

# Mean delivery time for each day_of_the_week category (weekday vs weekend).
data.groupby(['day_of_the_week'])[['delivery_time']].mean()

# Distribution of delivery time for weekday versus weekend orders.
sns.boxplot(data=data,x='day_of_the_week',y='delivery_time')
plt.xticks(rotation=90)
plt.show()

	count	mean	std	min	25%	50%	75%	max
order_id	1898.0	1.477496e+06	548.049724	1476547.00	1477021.25	1477495.50	1.477970e+06	1478444.00
customer_id	1898.0	1.711685e+05	113698.139743	1311.00	77787.75	128600.00	2.705250e+05	405334.00
cost_of_the_order	1898.0	1.649885e+01	7.483812	4.47	12.08	14.14	2.229750e+01	35.41
food_preparation_time	1898.0	2.737197e+01	4.632481	20.00	23.00	27.00	3.100000e+01	35.00
delivery_time	1898.0	2.416175e+01	4.972637	15.00	20.00	25.00	2.800000e+01	33.00

	restaurant_name	avg_rating
0	'wichcraft	5.000000
1	12 Chairs	4.500000
3	67 Burger	5.000000
4	Amma	4.500000
6	Anjappar Chettinad	5.000000
...	...	...
149	Yama 49	5.000000
150	Yama Japanese Restaurant	4.500000
153	da Umberto	5.000000
154	ilili Restaurant	4.153846
155	indikitch	4.500000

	order_id	customer_id	restaurant_name	cuisine_type	cost_of_the_order	day_of_the_week	rating	food_preparation_time	delivery_time
0	1477147	337525	Hangawi	Korean	30.75	Weekend	Not given	25	20
1	1477685	358141	Blue Ribbon Sushi Izakaya	Japanese	12.08	Weekend	Not given	25	23
2	1477070	66393	Cafe Habana	Mexican	12.23	Weekday	5	23	28
3	1477334	106968	Blue Ribbon Fried Chicken	American	29.20	Weekend	3	25	15
4	1478249	76942	Dirty Bird to Go	American	11.59	Weekday	4	25	24

	restaurant_name	rating_counts
16	Blue Ribbon Fried Chicken	64
17	Blue Ribbon Sushi	73
117	Shake Shack	133
132	The Meatball Shop	84

	restaurant_name	rating_counts	avg_rating
0	Blue Ribbon Fried Chicken	64	4.328125
1	Blue Ribbon Sushi	73	4.219178
2	Shake Shack	133	4.278195
3	The Meatball Shop	84	4.511905

	food_preparation_time	delivery_time	total_time
0	25	20	45
1	25	23	48
2	23	28	51
3	25	15	40
4	25	24	49
...	...	...	...
1893	31	17	48
1894	31	19	50
1895	31	24	55
1896	23	31	54
1897	28	24	52

Project Python Foundations: FoodHub Data Analysis¶

Context¶

Objective¶

Data Description¶

Data Dictionary¶

Let us start by importing the required libraries¶

Understanding the structure of the data¶

Question 1: How many rows and columns are present in the data? [0.5 mark]¶

Observations:¶

Question 2: What are the datatypes of the different columns in the dataset? (The info() function can be used) [0.5 mark]¶

Observations:¶

Question 3: Are there any missing values in the data? If yes, treat them using an appropriate method. [1 mark]¶

Observations:¶

Check for duplicate data¶

Question 4: Check the statistical summary of the data. What is the minimum, average, and maximum time it takes for food to be prepared once an order is placed? [2 marks]¶

Observations:¶

Question 5: How many orders are not rated? [1 mark]¶

Observation¶

Exploratory Data Analysis (EDA)¶

Univariate Analysis¶

Question 6: Explore all the variables and provide observations on their distributions. (Generally, histograms, boxplots, countplots, etc. are used for univariate exploration.) [9 marks]¶

Observation of restaurants¶

Question 7: Which are the top 5 restaurants in terms of the number of orders received? [1 mark]¶

Observations:¶

Question 8: Which is the most popular cuisine on weekends? [1 mark]¶

Observations:¶

Question 9: What percentage of the orders cost more than 20 dollars? [2 marks]¶

Observations:¶

Question 10: What is the mean order delivery time? [1 mark]¶

Observations:¶

Question 11: The company has decided to give 20% discount vouchers to the top 3 most frequent customers. Find the IDs of these customers and the number of orders they placed. [1 mark]¶

Observations:¶

Multivariate Analysis¶

Question 12: Perform a multivariate analysis to explore relationships between the important variables in the dataset. (It is a good idea to explore relations between numerical variables as well as relations between numerical and categorical variables) [10 marks]¶

Cuisine¶

Observations:¶

Question 14: The company charges the restaurant 25% on the orders having cost greater than 20 dollars and 15% on the orders having cost greater than 5 dollars. Find the net revenue generated by the company across all orders. [3 marks]¶

Observations:¶

Question 15: The company wants to analyze the total time required to deliver the food. What percentage of orders take more than 60 minutes to get delivered from the time the order is placed? (The food has to be prepared and then delivered.) [2 marks]¶

Observations:¶

Question 16: The company wants to analyze the delivery time of the orders on weekdays and weekends. How does the mean delivery time vary during weekdays and weekends? [2 marks]¶

Observations:¶

Conclusion and Recommendations¶

Question 17: What are your conclusions from the analysis? What recommendations would you like to share to help improve the business? (You can use cuisine type and feedback ratings to drive your business recommendations.) [6 marks]¶

Conclusions:¶

Recommendations:¶