Recipe Site Traffic Classification¶

Problem Definition¶

  • The product team has requested a classification model that can correctly recommend recipes likely to produce high traffic on the website, to replace their current selection process. A dataset of recipes is provided, containing the recipe category, nutritional metrics (calories, carbohydrate, sugar and protein), the serving size, and the target variable high_traffic (indicating that the recipe generated high traffic on the website). Based on this, their request is for a classification model that recommends popular recipes correctly at least 80% of the time.

  • To begin, the problem can be refined by noting that the product team is primarily interested in the model's ability to predict high traffic recipes correctly, with little interest in its performance on low traffic recipes. This is therefore a request for a classification model with high precision: of the recipes the model recommends, as many as possible should genuinely be high traffic. A short illustrative sketch of the precision calculation follows.
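  • For illustration, precision is the share of recommended (positive) recipes that are truly high traffic, i.e. TP / (TP + FP). The sketch below uses a small set of hypothetical labels and sklearn's precision_score to show the calculation:

#hypothetical example of the precision metric: 1 = high traffic, 0 = low traffic
from sklearn.metrics import precision_score

y_true = [1, 1, 0, 1, 0, 0, 1, 0]   #actual traffic labels (hypothetical)
y_pred = [1, 1, 1, 1, 0, 0, 0, 0]   #model recommendations (hypothetical)

#precision = TP / (TP + FP) = 3 / (3 + 1)
print(precision_score(y_true, y_pred))   #0.75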

Section 1: Import Libraries & Data¶

1.1. Importing Necessary Libraries¶

  • Firstly, the required libraries are imported for data analysis, manipulation and visualisation: numpy (np), pandas (pd), matplotlib.pyplot (plt), seaborn (sns), and missingno (msno). Missingno is an effective library for visualising the distribution of missing values across the dataset, making it useful for the data cleaning process.
In [1]:
#import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
  • Functions for machine learning preprocessing, model instantiation and training, and model evaluation are also imported.
  • Since the product team's requirement is framed around correctly recommending high traffic recipes, precision will be the primary evaluation metric throughout.
In [2]:
#preprocessing
from sklearn.preprocessing import LabelEncoder

#model preparation
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV

#models
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

#metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score

1.2. Importing Data¶

  • Following this, the data is imported from the provided csv file using the read_csv() function in pandas. The head of the data is printed to confirm it has been loaded into the pandas DataFrame df.
  • Immediately, it's evident there are missing values in the nutritional columns (calories, carbohydrate, sugar, protein), which will need to be filled with appropriate values.
In [3]:
#file path
file_path = r'recipe_site_traffic_2212_synthesised.csv'

#read CSV file into a DataFrame
df = pd.read_csv(file_path)

#display dataframe
df.head()
Out[3]:
recipe calories carbohydrate sugar protein category servings high_traffic
0 1 NaN NaN NaN NaN Potato 6 High
1 2 36.1896 39.3312 0.6732 0.9384 Breakfast 4 High
2 3 932.5656 43.5336 3.1518 2.9376 Beverages 1 NaN
3 4 98.9706 31.1712 39.4026 0.0204 One Dish Meal 4 High
4 5 27.5910 1.8870 0.8160 0.5406 One Dish Meal 4 NaN

Section 2: Data Validation¶

2.1. Data Description¶

  • With the data loaded into the workspace, the next step is to describe the dataset and understand its general shape and information.
  • The describe() and info() methods are called on the data to show statistical information and dataframe structure, while the columns and shape attributes are also printed to show the column names and the number of rows and columns:
In [4]:
#show dataframe information
print(df.describe())
print(df.info())
print(df.columns)
print(df.shape)
           recipe     calories  carbohydrate       sugar     protein
count  947.000000   895.000000    895.000000  895.000000  895.000000
mean   474.000000   444.657979     35.771069    9.227478   24.632282
std    273.519652   462.081417     44.828013   14.972759   37.097133
min      1.000000     0.142800      0.030600    0.010200    0.000000
25%    237.500000   112.638600      8.542500    1.723800    3.258900
50%    474.000000   294.321000     21.909600    4.641000   11.016000
75%    710.500000   609.603000     45.864300    9.996000   30.804000
max    947.000000  3705.823200    541.028400  151.725000  370.627200
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 947 entries, 0 to 946
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   recipe        947 non-null    int64  
 1   calories      895 non-null    float64
 2   carbohydrate  895 non-null    float64
 3   sugar         895 non-null    float64
 4   protein       895 non-null    float64
 5   category      947 non-null    object 
 6   servings      947 non-null    object 
 7   high_traffic  574 non-null    object 
dtypes: float64(4), int64(1), object(3)
memory usage: 59.3+ KB
None
Index(['recipe', 'calories', 'carbohydrate', 'sugar', 'protein', 'category',
       'servings', 'high_traffic'],
      dtype='object')
(947, 8)

2.2. Data Description - Observations¶

  • Immediately there are some notable observations from the dataframe information; for example, the calories column has the widest range of values among the nutritional columns, with a min of 0.14 and a max of 3705.82.
  • Each of the nutritional columns has 52 missing values; it will be important to investigate how these missing data are spread. Additionally, the high_traffic column has 373 missing values - this will also need to be investigated.
  • Lastly, the information on the servings column indicates it's an object rather than a float or an int - this suggests it may have string data in some rows. This will need to be standardised for later model training.

2.3. Printing Unique values¶

  • To further investigate the values in the dataset, a custom function named print_uniques() is defined to print all unique values and their counts from each column of the dataframe.
  • The function uses a generator expression to iterate through the columns.
  • For each column, it prints the column name, the unique values, and the number of unique values.
In [5]:
#defining custom function to print all unique values and their counts from each column
def print_uniques(df):
    #for large datasets - using generator for speed
    uniques_generator = ((x, df[x].unique(), df[x].nunique()) for x in df.columns)
    
    print('\nUnique Values:')
    for x, unique_values, num_unique in uniques_generator:
        print(f"{x}: \n {unique_values} \n ({num_unique} unique values)")
  • With print_uniques() defined, the function is run on the category, servings, and high_traffic columns. The output can be seen below:
In [6]:
#printing uniques of selected columns
print_uniques(df[['category', 'servings','high_traffic']])
Unique Values:
category: 
 ['Potato' 'Breakfast' 'Beverages' 'One Dish Meal' 'Chicken Breast'
 'Lunch/Snacks' 'Chicken' 'Vegetable' 'Meat' 'Dessert' 'Pork'] 
 (11 unique values)
servings: 
 ['6' '4' '1' '2' '4 as a snack' '6 as a snack'] 
 (6 unique values)
high_traffic: 
 ['High' nan] 
 (1 unique values)

2.4. Unique Values - Observations¶

  • It's clear from the output above that the servings column has entries of '4 as a snack' or '6 as a snack', which will need to be cleaned for analysis.
  • Additionally, the high_traffic column only has one unique value, 'High'; presumably the missing values are all instances where the traffic was low, so these will be filled with 'Low'.

Section 3: Data Cleaning¶

3.1. Visualising Missing Data¶

  • With an assessment of the data structure completed, msno can now be used to visualise the missing data.
  • The missing values in the high_traffic column appear to be structurally missing data, therefore it's reasonable to fill them using the fillna() function, specifying 'Low' for any missing values.
  • For the missing values in the nutritional column, the dataframe is first sorted by calories and then visualised using the msno matrix() function.
In [7]:
#fill missing values in high_traffic column with 'Low'
df['high_traffic'] = df['high_traffic'].fillna('Low')

#sort df by 'calories' column
df_sorted = df.sort_values('calories')

#create matrix of missing values
msno.matrix(df_sorted)
plt.show()
[Figure: missingno matrix of the dataframe sorted by calories]
  • Further investigation checks whether the missing values occur only in the nutritional columns, using the following code:
In [8]:
#using msno to visualise only missing values
msno.bar(df[df.isna().any(axis=1)])
plt.show()
[Figure: missingno bar chart of rows containing missing values]

3.2. Missing Data - Observations¶

  • The missing data appear together across all nutritional columns; i.e. when the calories value is missing, the carbohydrate, sugar, and protein values are also missing. This can be verified with the quick check sketched after this list.
  • Given the small number of rows in the dataset (947), dropping these rows (over 5% of the data) would not be a good strategy; a more intelligent approach is needed for these missing data.
  • The initial data cleaning tasks will be addressed first (changing the servings data type, calculating the per-serving nutritional values, etc.), and then the missing data will be filled appropriately.
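  • As a quick verification of this all-or-nothing pattern (a sketch, assuming df has been loaded and the high_traffic column filled as above):

#check that whenever one nutritional value is missing, all four are missing
nutri_cols = ['calories', 'carbohydrate', 'sugar', 'protein']
any_missing = df[nutri_cols].isna().any(axis=1)
all_missing = df[nutri_cols].isna().all(axis=1)
print(any_missing.sum(), all_missing.sum())   #both should be 52 if the pattern holds
print((any_missing == all_missing).all())     #True if missingness is all-or-nothing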

3.3. Defining a function to clean servings column¶

  • The first cleaning step addresses the servings column. A small function takes the first character of each servings value and converts it to an integer, which works here because every serving count is a single digit; this strips the 'as a snack' text from the affected rows. The result is stored in a new column, serving_count, and its unique values are printed to verify the correction.
In [9]:
def keep_first_character(value):
    return int(str(value)[0])

#apply function to the servings column to create serving_count
df['serving_count'] = df['servings'].apply(keep_first_character)
print(df['serving_count'].unique()) #print unique values to verify correction
[6 4 1 2]

3.4. Defining a function to calculate the per serving nutritional values¶

  • Next, a function is defined to calculate per-serving values for each of the nutritional columns ('calories', 'protein', 'sugar', and 'carbohydrate'). For each column in a supplied list, the function creates a new column equal to the nutritional value divided by the serving_count column created earlier.
In [10]:
#define function to calculate per serving values
def calculate_per_serving(df, columns):
    # looping through columns
    for column in columns:
        df[f'{column}_per_serving'] = df[column] / df['serving_count']

#defining nutritional columns
nutritional_columns = ['calories', 'protein', 'sugar', 'carbohydrate']

#calculate per serving values
calculate_per_serving(df, nutritional_columns)

3.5. Filling missing values with the median per category¶

  • With the serving data cleaned and the per-serving nutritional columns created, it's now possible to apply reasonable values to the missing data in the dataset.
  • It's established that only the nutritional columns contain missing values, and the other columns (i.e. servings, category) are complete for these rows. Therefore the median nutritional value per category can be applied to the missing data, as can be seen below (the mean was applied initially, however through trial and error the median was found to give better metrics in the final model). A more compact groupby-based alternative is sketched after the output below.
  • First this is applied to the per-serving nutritional columns, then scaled up for the total nutritional columns by multiplying the serving size by the per-serving value.
In [11]:
#apply median per_serving value to the missing values
for category in df['category'].unique():
    for column in nutritional_columns:
        fill_value = df.loc[df['category'] == category, f'{column}_per_serving'].median()
        print(f"Median {column} for {category}: {(fill_value).round(2)}") #values rounded for display
        df.loc[(df['category'] == category) & (df[f'{column}_per_serving'].isnull()), f'{column}_per_serving'] = fill_value

#multiply per serving column by serving count to fill missing values
for column in nutritional_columns:
    df[column] = df[f'{column}_per_serving'] * df['serving_count']

#print dataframe
print(df.describe())
print(df.info())
Median calories for Potato: 146.9
Median protein for Potato: 9.33
Median sugar for Potato: 1.68
Median carbohydrate for Potato: 5.94
Median calories for Breakfast: 97.09
Median protein for Breakfast: 1.65
Median sugar for Breakfast: 0.86
Median carbohydrate for Breakfast: 9.83
Median calories for Beverages: 66.14
Median protein for Beverages: 4.32
Median sugar for Beverages: 1.8
Median carbohydrate for Beverages: 9.81
Median calories for One Dish Meal: 42.06
Median protein for One Dish Meal: 0.15
Median sugar for One Dish Meal: 2.64
Median carbohydrate for One Dish Meal: 4.12
Median calories for Chicken Breast: 129.1
Median protein for Chicken Breast: 8.87
Median sugar for Chicken Breast: 1.54
Median carbohydrate for Chicken Breast: 11.85
Median calories for Lunch/Snacks: 110.36
Median protein for Lunch/Snacks: 10.89
Median sugar for Lunch/Snacks: 1.03
Median carbohydrate for Lunch/Snacks: 6.44
Median calories for Chicken: 124.82
Median protein for Chicken: 4.56
Median sugar for Chicken: 1.0
Median carbohydrate for Chicken: 8.43
Median calories for Vegetable: 149.81
Median protein for Vegetable: 8.56
Median sugar for Vegetable: 1.34
Median carbohydrate for Vegetable: 6.5
Median calories for Meat: 48.25
Median protein for Meat: 1.58
Median sugar for Meat: 1.15
Median carbohydrate for Meat: 4.48
Median calories for Dessert: 136.8
Median protein for Dessert: 7.71
Median sugar for Dessert: 0.94
Median carbohydrate for Dessert: 4.75
Median calories for Pork: 90.34
Median protein for Pork: 1.38
Median sugar for Pork: 7.06
Median carbohydrate for Pork: 12.44
           recipe     calories  carbohydrate       sugar     protein  \
count  947.000000   947.000000    947.000000  947.000000  947.000000   
mean   474.000000   445.713269     35.386538    9.103899   24.670247   
std    273.519652   453.169276     43.742029   14.664859   36.360540   
min      1.000000     0.142800      0.030600    0.010200    0.000000   
25%    237.500000   116.004600      9.027000    1.749300    3.350700   
50%    474.000000   301.532400     22.766400    4.641000   11.413800   
75%    710.500000   608.399400     45.512400    9.960300   31.706700   
max    947.000000  3705.823200    541.028400  151.725000  370.627200   

       serving_count  calories_per_serving  protein_per_serving  \
count     947.000000            947.000000           947.000000   
mean        3.477297            190.822369            10.412789   
std         1.732741            288.323651            18.803718   
min         1.000000              0.071400             0.000000   
25%         2.000000             36.468825             1.134750   
50%         4.000000             98.124000             3.580200   
75%         4.000000            217.484400            10.457550   
max         6.000000           2378.966400           186.282600   

       sugar_per_serving  carbohydrate_per_serving  
count         947.000000                947.000000  
mean            3.795421                 14.789974  
std             8.687339                 24.752000  
min             0.001700                  0.007650  
25%             0.609875                  2.736150  
50%             1.399950                  6.981900  
75%             3.623550                 15.927300  
max           151.725000                390.721200  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 947 entries, 0 to 946
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   recipe                    947 non-null    int64  
 1   calories                  947 non-null    float64
 2   carbohydrate              947 non-null    float64
 3   sugar                     947 non-null    float64
 4   protein                   947 non-null    float64
 5   category                  947 non-null    object 
 6   servings                  947 non-null    object 
 7   high_traffic              947 non-null    object 
 8   serving_count             947 non-null    int64  
 9   calories_per_serving      947 non-null    float64
 10  protein_per_serving       947 non-null    float64
 11  sugar_per_serving         947 non-null    float64
 12  carbohydrate_per_serving  947 non-null    float64
dtypes: float64(8), int64(2), object(3)
memory usage: 96.3+ KB
None
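  • For reference, the same per-category median fill can be written more compactly with pandas' groupby().transform(); a sketch equivalent to the loop above (assuming the per-serving columns still contain their missing values):

#fill each per-serving column with its per-category median, then recompute the totals
for column in nutritional_columns:
    per_serving = f'{column}_per_serving'
    df[per_serving] = df[per_serving].fillna(df.groupby('category')[per_serving].transform('median'))
    df[column] = df[per_serving] * df['serving_count']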
  • With the data sufficiently cleaned, univariate and multivariate analysis can now be conducted to understand its distributions.

Section 4: Analysis & Visualisation¶

4.1. Category Analysis¶

  • The first analysis to be conducted is to determine the split of high traffic across different categories - this is done by creating a countplot per category, and plotting the results using matplotlib. The x-axis labels are rotated 45 degrees for better legibility.
In [12]:
#plotting countplots for category
plt.figure(figsize=(10, 6))
sns.countplot(x='category', data=df, order=df['category'].value_counts().index)
plt.title('Category Counts')
plt.xticks(rotation=45)
plt.show()
[Figure: countplot of recipe counts per category]
  • The countplot of categories can be further broken down between high traffic and low traffic recipes, to show the variation in popular recipes between different categories.
  • Additionally, a Seaborn heatmap is plotted using crosstab, for an annotated comparison of high and low traffic recipes within each category.
In [13]:
#plotting countplots for category vs high_traffic
plt.figure(figsize=(10, 6))
sns.countplot(x='category', hue='high_traffic', data=df, palette=['green', 'red'], order=df['category'].value_counts().index)
plt.title('Category vs High Traffic')
plt.xticks(rotation=45)
plt.show()

#plotting crosstab of category and high_traffic
cat_traffic = pd.crosstab(df['category'], df['high_traffic'])
plt.figure(figsize=(10, 6))
sns.heatmap(cat_traffic, annot=True, cmap='Oranges', cbar=False)
plt.title('Category vs High Traffic')
plt.show()
[Figure: countplot of category vs high_traffic]
[Figure: heatmap of the category vs high_traffic crosstab]

4.2. Category Analysis - Observations¶

  • The Breakfast category has the most recipes in the dataset at 106 recipes, while the One Dish Meal category has the lowest at 71.
  • The Potato category appears to be the most popular among visitors to the site, with the majority of the recipes in that category seeing high traffic. Similarly it's clear that the Vegetable and Pork categories are also very popular, with these recipes showing high traffic to the site.
  • By contrast, the Beverages category has the highest number of low traffic recipes, followed by Breakfast and Chicken.

4.3. Servings Analysis¶

  • A similar analysis is performed on the servings column, to view distributions in serving sizes across the dataset. The code and outputs for this analysis can be seen below.
In [14]:
#plotting countplots for serving size
plt.figure(figsize=(10, 6))
sns.countplot(x='servings', data=df, order=df['servings'].value_counts().index)
plt.title('Counts of Servings')
plt.xticks(rotation=45)
plt.show()
[Figure: countplot of serving size counts]
In [15]:
#plotting countplots for servings vs high_traffic
plt.figure(figsize=(10, 6))
sns.countplot(x='servings', hue='high_traffic', data=df, order=df['servings'].value_counts().index, palette=['green', 'red'])
plt.title('Servings vs High Traffic')
plt.show()

#plotting crosstab of category and high_traffic
serve_traffic = pd.crosstab(df['servings'], df['high_traffic'])
plt.figure(figsize=(10, 6))
sns.heatmap(serve_traffic, annot=True, cmap='Oranges',fmt='g', cbar=False)
plt.title('Servings vs High Traffic')
plt.show()
[Figure: countplot of servings vs high_traffic]
[Figure: heatmap of the servings vs high_traffic crosstab]

4.4. Servings Analysis - Observations¶

  • It's evident from the analysis above that recipes with 4 servings are the most common in the dataset, at 388 recipes, while 1-serving recipes are the least common of the standard serving sizes, at 175 recipes.
  • The '4 as a snack' and '6 as a snack' serving sizes account for only 3 recipes across the entire dataset.

4.5. Nutritional Values Analysis¶

  • Following the analysis of category and serving size, the nutritional values are compared against traffic levels using boxplots.
  • A loop iterates through each nutritional column ('calories', 'protein', 'sugar', 'carbohydrate') and generates a Seaborn boxplot of that column against site traffic level.
  • The output of this analysis can be seen below.
In [16]:
#plotting boxplots for nutritional columns vs high_traffic
for column in nutritional_columns:
    plt.figure(figsize=(10, 6))
    sns.boxplot(x='high_traffic', y=column, data=df)
    plt.title(f'{column.capitalize()} vs High Traffic')
    plt.show()
[Figures: boxplots of calories, protein, sugar, and carbohydrate vs high_traffic]

4.6. Nutritional Values - Observations¶

  • There don't appear to be significant differences between the nutritional value distributions of high traffic and low traffic recipes in the boxplots above.
  • This suggests that site traffic isn't strongly driven by the nutritional values of the recipes, as the distributions are roughly the same for high and low traffic recipes.

Section 5: Machine Learning Preprocessing¶

5.1. Generating Histograms for Classification Preprocessing¶

  • Prior to the machine learning preprocessing, it's useful to plot histograms of the data to show how the initial values are distributed. This is done using the DataFrame hist() method:
In [17]:
#plotting histograms of df columns
df[nutritional_columns].hist(figsize=(10, 10))
Out[17]:
array([[<Axes: title={'center': 'calories'}>,
        <Axes: title={'center': 'protein'}>],
       [<Axes: title={'center': 'sugar'}>,
        <Axes: title={'center': 'carbohydrate'}>]], dtype=object)
[Figure: histograms of the calories, protein, sugar, and carbohydrate columns]

5.2. Histograms - Observations¶

  • The histograms show that the nutritional values are all heavily right-skewed, which also extends to the per-serving nutritional columns created earlier (the skew is quantified in the sketch after this list).
  • Many machine learning models benefit from less skewed inputs, so these columns will be transformed before any model training is conducted.
  • With an assessment of the underlying data structure established, a copy of the dataframe can be created to begin the preprocessing tasks.
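  • The degree of skew can also be quantified directly with pandas' skew() method (a quick sketch; values well above 1 indicate heavy right skew):

#skewness of the nutritional columns
print(df[['calories', 'protein', 'sugar', 'carbohydrate']].skew())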

5.3. Copying df for ML preprocessing¶

  • Beginning the preprocessing for the classification models, the per-serving columns are dropped from the dataset. They were initially used as features, but trial and error showed that the original total columns delivered better metrics in the final classification model.
  • A copy of the dataframe is then created for the preprocessing steps.
In [18]:
#dropping unnecessary columns
drop_cols = ['calories_per_serving', 'protein_per_serving', 'sugar_per_serving', 'carbohydrate_per_serving']
df = df.drop(drop_cols,axis=1)

#creating copy of df for ml
df_ml = df.copy()

5.4. Scaling Nutritional Columns¶

  • As noted earlier, the numerical nutritional columns are all right-skewed and need to be transformed before being fed into the classification models.
  • The next step applies a log transformation to the nutrition columns. Within a loop, each column is transformed using NumPy's log1p() function, which computes the natural logarithm of one plus the value and therefore handles zeros safely. The summary statistics above show a protein minimum that rounds to 0.00, so log1p() is the safer choice over a plain log(); whether exact zeros are present can be checked with the quick sketch below.
  • Additionally, each transformed column is renamed to reflect the applied transformation using the rename() function. The code and the head of the resulting values can be seen below.
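  • A quick check for zeros before transforming (a sketch, assuming df as cleaned above):

#count non-positive values per nutritional column (a plain log() would return -inf for zeros)
print((df[['calories', 'protein', 'sugar', 'carbohydrate']] <= 0).sum())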
In [19]:
#applying log transformation to nutrition columns
nutrition_columns = ['calories','protein','sugar','carbohydrate']
for column in nutrition_columns:
    df_ml[column] = np.log1p(df_ml[column])
    #rename column
    df_ml.rename(columns={column: f'log1p_{column}'}, inplace=True)

#print head of dataframe
print(df_ml.head())
   recipe  log1p_calories  log1p_carbohydrate  log1p_sugar  log1p_protein  \
0       1        6.782631            3.601386     2.402620       4.043016   
1       2        3.616029            3.697125     0.514738       0.661863   
2       3        6.839011            3.796244     1.423542       1.370571   
3       4        4.604876            3.471072     3.698894       0.020195   
4       5        3.353092            1.060218     0.596636       0.432172   

        category servings high_traffic  serving_count  
0         Potato        6         High              6  
1      Breakfast        4         High              4  
2      Beverages        1          Low              1  
3  One Dish Meal        4         High              4  
4  One Dish Meal        4          Low              4  
  • Once this transformation is complete, histograms are once again called to check the new distribution of the transformed data.
In [20]:
#plotting histograms of df columns
df_ml[['log1p_calories','log1p_carbohydrate','log1p_sugar','log1p_protein']].hist(figsize=(10, 10))
Out[20]:
array([[<Axes: title={'center': 'log1p_calories'}>,
        <Axes: title={'center': 'log1p_carbohydrate'}>],
       [<Axes: title={'center': 'log1p_sugar'}>,
        <Axes: title={'center': 'log1p_protein'}>]], dtype=object)
[Figure: histograms of the log1p-transformed nutritional columns]
  • The calories and carbohydrate columns appear to have benefitted considerably from the transformation, now showing as approximately normally distributed under the log1p transformation. Some benefit can also be seen in the sugar and protein columns, which are now closer to normal distribution.
  • With appropriate scaling applied to the numerical features, the categorical features must now be encoded before loading to the chosen classification models.

5.5. Label Encoding¶

  • Following the transformations applied to the numerical columns, a list of categorical columns to encode is defined, containing 'category' and 'servings'. These columns contain qualitative information that needs to be converted into numerical representations for certain classification algorithms to process effectively.
  • It's important to note that the 'high_traffic' column (i.e. the target) is encoded separately, as it has only 2 distinct values and it was necessary to ensure that the 'High' values are treated as the positive class for precision metrics.
  • The servings column is treated as a category below, as trial and error has shown that keeping servings as a value in the dataset has a positive impact on some of the classification model metrics.
  • A LabelEncoder object is instantiated; label encoding is a method used to convert categorical data into numerical labels, assigning a unique integer to each distinct category within a column. This transformation allows the classification model to interpret categorical data.
  • As the code loops, the LabelEncoder is applied to each categorical column in the list. For each iteration, the LabelEncoder's fit_transform() method encodes the categorical values of the respective column, replacing them with their corresponding integer labels; the learned mapping can be inspected as sketched after the code cell below.
In [21]:
#replace high_traffic column with 1 and 0
df_ml['high_traffic'] = df_ml['high_traffic'].replace({'High': 1, 'Low': 0})

#instantiate LabelEncoder
label_encoder = LabelEncoder()

#list of categorical columns to encode
cat_columns = ['category', 'servings']

#iterate through each column in the list
for column in cat_columns:
    #apply LabelEncoder to encode the column
    df_ml[column] = label_encoder.fit_transform(df_ml[column])
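  • To inspect the mapping produced for a column, the encoder's classes_ attribute can be checked (a sketch; note that the same encoder instance is refit on each column in the loop, so after the loop it reflects the last column encoded, 'servings'):

#show the integer assigned to each original servings label by the most recent fit
print(dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))))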
  • Finally, before moving on to model training, some unnecessary columns are dropped from the dataset; namely the recipe ID column and the serving_count column generated earlier.
In [22]:
#drop columns
drop_cols = ['recipe','serving_count']
df_ml = df_ml.drop(drop_cols,axis=1)

Section 6: Final Model Selection¶

  • With the data sufficiently preprocessed, it's now possible to begin model training and evaluation.
  • Two models were selected for training, to compare precision results against each other and the random choice baseline - a LogisticRegression classifier and a RandomForest classifier.
  • Logistic Regression is chosen because it works well on small datasets and trains quickly.
  • Random Forest is a versatile algorithm that is relatively robust to overfitting, making it a good alternative model for this problem.

6.1. Applying LogisticRegression as first classification model¶

  • The first model to be tested on the dataset is a LogisticRegression classifier.
  • The model training process begins by creating a copy of the dataframe df_ml for classification purposes, named df_model. The target variable is extracted from df_model, and stored as y. Similarly the target variable is removed from df_model, and the resulting DataFrame is stored in X.
  • The data is split into training and testing sets using train_test_split(). The split is stratified based on the 'category' column of df_model, with a test size of 20% and a fixed random state of 42. The train-test sets are stratified in this way to give sets that are proportional to the categories in the original dataset, as this was found to slightly boost model metrics.
  • A Logistic Regression model is instantiated and a parameter grid is defined, covering hyperparameters such as regularisation strength, penalty, solver, and maximum iterations, among others. This gives the cross validation process a greater chance of finding the best hyperparameters for the model.
  • Randomized Search Cross Validation (RandomizedSearchCV) is instantiated to explore the hyperparameter space efficiently; randomized search is subsequently executed to find the best hyperparameters, maximising precision using 5 fold cross validation.
  • Following this, the best precision score and corresponding best hyperparameters found during the search are printed.
In [23]:
#creating copy of the dataframe for classification
df_model = df_ml.copy()

#extract target variable
y = df_model['high_traffic']

#drop target variable from the dataframe
X = df_model.drop(['high_traffic'], axis=1)

#split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=df_model['category'], test_size=0.2, random_state=42)

#instantiate Logistic Regression model
logistic_regression = LogisticRegression()

#define parameter grid for Logistic Regression
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'max_iter': [100, 200, 300, 400, 500],
    'fit_intercept': [True, False],
    'class_weight': [None, 'balanced'],
    'warm_start': [True, False],
    'multi_class': ['auto', 'ovr', 'multinomial'],
    'random_state': [None, 42]
}

#printing model eval callout
print("Evaluating Logistic Regression:")

#instantiate randomised search CV
random_search_lr = RandomizedSearchCV(logistic_regression, param_distributions=param_grid, n_iter=20, 
                                      scoring='precision', n_jobs=-1, cv=5, random_state=42)

#fit randomised search CV
random_search_lr.fit(X_train, y_train)

#best model
best_model_lr = random_search_lr.best_estimator_

#best precision score
best_precision_lr = random_search_lr.best_score_

#best hyperparameters
best_params_lr = random_search_lr.best_params_

print("\nBest Precision:", best_precision_lr)
print("Best Parameters:", best_params_lr)
Evaluating Logistic Regression:

Best Precision: 0.652482324937568
Best Parameters: {'warm_start': True, 'solver': 'sag', 'random_state': None, 'penalty': 'l2', 'multi_class': 'ovr', 'max_iter': 400, 'fit_intercept': False, 'class_weight': 'balanced', 'C': 0.001}
c:\Users\jlenehan\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py:425: FitFailedWarning: 
50 fits failed out of a total of 100.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\jlenehan\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\jlenehan\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\jlenehan\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1169, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\jlenehan\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\linear_model\_logistic.py", line 66, in _check_solver
    raise ValueError(
ValueError: Only 'saga' solver supports elasticnet penalty, got solver=liblinear.

--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\jlenehan\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\jlenehan\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\jlenehan\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1219, in fit
    multi_class = _check_multi_class(self.multi_class, solver, len(self.classes_))
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\jlenehan\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\linear_model\_logistic.py", line 92, in _check_multi_class
    raise ValueError("Solver %s does not support a multinomial backend." % solver)
ValueError: Solver liblinear does not support a multinomial backend.

--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\jlenehan\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\jlenehan\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\jlenehan\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1169, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\jlenehan\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\linear_model\_logistic.py", line 56, in _check_solver
    raise ValueError(
ValueError: Solver newton-cg supports only 'l2' or 'none' penalties, got elasticnet penalty.

--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\jlenehan\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\jlenehan\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\jlenehan\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1169, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\jlenehan\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\linear_model\_logistic.py", line 56, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got elasticnet penalty.

--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\jlenehan\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\jlenehan\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\jlenehan\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1169, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\jlenehan\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\linear_model\_logistic.py", line 56, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\jlenehan\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\jlenehan\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\jlenehan\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1179, in fit
    raise ValueError("l1_ratio must be specified when penalty is elasticnet.")
ValueError: l1_ratio must be specified when penalty is elasticnet.

  warnings.warn(some_fits_failed_message, FitFailedWarning)
c:\Users\jlenehan\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_search.py:979: UserWarning: One or more of the test scores are non-finite: [       nan        nan 0.6502291  0.63720833        nan        nan
 0.63703375 0.64145379 0.62731438        nan 0.65248232        nan
 0.65131258 0.65033181        nan        nan 0.6438239         nan
 0.                nan]
  warnings.warn(
  • From the printed results, the Logistic Regression model achieves a best cross-validated precision of 65.25% with the parameters listed above; this falls short of the 80% precision goal requested by the product team.
  • Note the FitFailedWarning above: some sampled parameter combinations are invalid (e.g. solvers that do not support the selected penalty), so those fits are skipped and scored as NaN; the best score is taken from the combinations that fitted successfully.
  • This result can now be compared against the alternate model; a Random Forest classifier is trained on the same data next.

6.2. Applying RandomForest as second classification model¶

  • An alternate model is created to compare precision metrics, this time using Random Forest classification.
  • As before, the model training process begins by creating a copy of the dataframe df_ml and stored as df_model. The target variable, 'high_traffic', is extracted from df_model and removed from the features in X. The data is then split into training and testing sets, using the same stratification methodology as with the Logistic Regression model.
  • A Random Forest classifier is instantiated and a parameter grid for Random Forest is defined, with hyperparameters such as the number of estimators, maximum depth of trees, minimum samples for splitting, and minimum samples for leaf nodes, among others. RandomizedSearchCV is then instantiated to run through the hyperparameters in the parameter grid.
  • As with the Logistic Regression model, a randomised search is conducted to find the best hyperparameters to maximise precision using 5 fold cross validation. Subsequently the best precision score and corresponding best hyperparameters found during the search are printed.
In [24]:
#creating copy of the dataframe for classification
df_model = df_ml.copy()

#extract target variable
y = df_model['high_traffic']

#drop target variable from the dataframe
X = df_model.drop(['high_traffic'], axis=1)

#split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=df_model['category'], test_size=0.2, random_state=42)

#instantiate random forest classifier
rf_classifier = RandomForestClassifier()

#define parameter grid for random forest
param_grid = {
    'n_estimators': [50, 100, 200, 300, 400],
    'max_depth': [3, 4, 5, 6, 7],
    'min_samples_split': [2, 3, 4, 5, 6],
    'min_samples_leaf': [1, 2, 3, 4, 5],
    'max_features': ['sqrt', 'log2', None],
    'bootstrap': [True, False],
    'criterion': ['gini', 'entropy']
}

#printing model eval callout
print("Evaluating Random Forest:")

#instantiate randomised search CV
random_search_rf = RandomizedSearchCV(rf_classifier, param_distributions=param_grid, n_iter=20, 
                                      scoring='precision', n_jobs=-1, cv=5, random_state=42)

#fit randomised search CV
random_search_rf.fit(X_train, y_train)

#best model
best_model_rf = random_search_rf.best_estimator_

#best precision score
best_precision_rf = random_search_rf.best_score_

#best hyperparameters
best_params_rf = random_search_rf.best_params_

#printing model metrics
print("\nBest Precision:", best_precision_rf)
print("Best Parameters:", best_params_rf)
Evaluating Random Forest:

Best Precision: 0.7532597853625702
Best Parameters: {'n_estimators': 300, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': None, 'max_depth': 7, 'criterion': 'gini', 'bootstrap': False}
  • As can be seen, the Random Forest model outperforms the Logistic Regression model, generating a best cross-validated precision of 75.33%; however, this is still below the 80% precision goal requested by the product team.

6.3. Comparing model precision to random choice as key performance indicator¶

  • Now that both models are trained and optimised with randomised search cross validation, it's possible to compare their performance against the expected results of a "random choice" recommendation.
  • The random choice probability of recommending a high traffic recipe is defined as the proportion of high traffic recipes in the dataset - if a recipe were selected at random for display on the website, this would be the likelihood that it generated high traffic.
  • Next, the precision metric of each model is compared against the random choice probability, to put these results in the appropriate business context.
In [25]:
#calculating chance of randomly picking high traffic recipe
random_choice = df[df['high_traffic'] == 'High'].count()['high_traffic'] / len(df['high_traffic'])
print(f"Possibility of choosing high traffic recipe at random: {(100*random_choice).round(2)}%")

#displaying final metrics for logistic regression
print("\nLogistic Regression - Metrics:")
print(f"Final precision: {(100*best_precision_lr).round(3)}%")
print(f"Percentage improvement over random choice: {(100*(best_precision_lr/random_choice)-100).round(3)}%")

#displaying final metrics for random forest
print("\nRandom Forest - Metrics:")
print(f"Final precision: {(100*best_precision_rf).round(3)}%")
print(f"Percentage improvement over random choice: {(100*(best_precision_rf/random_choice)-100).round(3)}%")
Possibility of choosing high traffic recipe at random: 60.61%

Logistic Regression - Metrics:
Final precision: 65.248%
Percentage improvement over random choice: 7.648%

Random Forest - Metrics:
Final precision: 75.326%
Percentage improvement over random choice: 24.275%
  • As can be seen above, both models improve on the random choice baseline of 60.61%, with the Logistic Regression model performing 7.65% better and the Random Forest model performing 24.28% better than random choice.
  • On these results the Random Forest model is the stronger candidate, although its cross-validated precision of 75.33% still falls short of the product team's 80% target; a check against the held-out test set is sketched below.
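  • As a further check beyond cross-validation, the held-out test set created earlier (and not otherwise used above) can be scored with the tuned estimators; a minimal sketch using the precision_score import from Section 1 (not run here, so no output is shown):

#evaluate both tuned models on the held-out test set
for name, model in [('Logistic Regression', best_model_lr), ('Random Forest', best_model_rf)]:
    y_pred = model.predict(X_test)
    print(f"{name} test precision: {precision_score(y_test, y_pred):.3f}")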

Section 7: Conclusions and Recommendations¶

7.1. Conclusions¶

  • From an analytical perspective, the Potato category appears to be the most popular among visitors to the site, with the majority of the recipes in that category seeing high traffic. Similarly it's clear that the Vegetable and Pork categories are also very popular, with these recipes showing high traffic to the site. As such these categories should be the priority, if manually choosing recipes for the website.
  • By contrast, the Beverages category has the highest number of low traffic recipes, followed by Breakfast and Chicken. These categories should therefore be deprioritised if manually choosing recipes for the website.
  • Two classification models were fitted to the data, a Random Forest classifier and a Logistic Regression classifier. The Random Forest model performed better than the Logistic Regression model, generating a best cross-validated precision of 75.33% compared to 65.25%.
  • A comparison against the random choice baseline of 60.61% shows the Random Forest model is 24.28% better and the Logistic Regression model 7.65% better, indicating that either model is a more reliable approach than the current selection process.
  • The Random Forest model is therefore the preferred model; however, its precision is still below the product team's 80% target, so further improvement (see the recommendations below) is advised before the requirement is fully met.

7.2. Recommendations¶

  • In terms of the classification model, it is recommended that the product team gather more recipes to allow for greater fine tuning of the model. The final iteration of the classification model reached a precision score of 75.33%, which falls short of the product team's 80% target. With further recipe data, the model may be trained to recommend recipes at a higher precision than has been demonstrated so far. It was noted during data analysis that the Breakfast category is the most common in the dataset, yet it is also one of the least popular; therefore the product team should strive to increase the number of recipes in other categories, to allow for a larger and more diverse selection of recipes on which to train the model.
  • Similarly, the product team is advised to gather data on additional features to supplement the current dataset. For example, features such as estimated preparation time, number of ingredients, and recipe difficulty may help increase model precision further, leading to better recommendations for the website.