Classify Cuisine based on ingredients

This is one of the Kaggle challenges; refer to this link for more info on the challenge.

Run Ipynb on Google-colab

Details

Use recipe ingredients to categorize the cuisine. Training data consists of id, cuisine and ingredients

{
"id": 24717,
"cuisine": "indian",
"ingredients": [
    "tumeric",
    "vegetable stock",
    "tomatoes",
    "garam masala",
    "naan",
    "red lentils",
    "red chili peppers",
    "onions",
    "spinach",
    "sweet potatoes"
 ]
 }

The test data consists of id and ingredients, and we are expected to predict the cuisine:

{
"id": 41580, 
"ingredients": [
    "sausage links",
    "fennel bulb",
    "fronds",
    "olive oil",
    "cuban peppers",
    "onions"
]
}

**Install the requirements**

Most of the packages are pre-installed in Google Colab.

The installed packages do not include gensim.

Installing gensim using pip3

!pip3 install gensim
import os
from os import listdir
import gensim
import json
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression

import warnings
warnings.filterwarnings('ignore')

Mount google drive

Add the dataset folder to your Google Drive so the notebook can load the datasets from it.

# Mount Google Drive so the dataset folder is visible to this runtime.
from google.colab import drive
drive.mount('/content/gdrive')

# Show which files the dataset folder contains.
path = '/content/gdrive/My Drive/cusines dataset/'
print(list(listdir(path)))


# Peek at one raw test record before loading everything with pandas.
# Use a context manager so the file handle is closed (the original
# json.load(open(...)) leaked the handle).
with open(path + '/' + 'test.json', 'r') as fh:
    inspect_test = json.load(fh)
print(inspect_test[2])

# Load train and test sets as DataFrames.
data = pd.read_json(path + 'train.json')
test = pd.read_json(path + 'test.json')

print('Training data shape: {}'.format(data.shape))
print('Test data shape: {}'.format(test.shape))
print('Dataset Keys {}'.format(data.keys()))

# Show a single training example.
index = 1
print('id: {}'.format(data['id'].iloc[index]))
print('ingredients: {}'.format(data['ingredients'].iloc[index]))
print('cuisine: {}'.format(data['cuisine'].iloc[index]))

# Target labels we want to predict.
target = data.cuisine

# New column: number of ingredients per recipe (len applied directly —
# no lambda wrapper needed).
data['ingredient_count'] = data.ingredients.apply(len)

def flatten_lists(lst):
    """Flatten one level of nesting: concatenate all sublists of *lst*."""
    flat = []
    for sub in lst:
        flat.extend(sub)
    return flat
# Exploratory plots: recipe sizes, popular ingredients, per-cuisine averages.

f = plt.figure(figsize=(14, 8))
gs = gridspec.GridSpec(2, 2)

# Distribution of recipe sizes (number of ingredients per recipe).
ax1 = plt.subplot(gs[0, :])
data.ingredient_count.value_counts().hist(ax=ax1)
ax1.set_title('Recipe richness', fontsize=12)

# Top-20 most frequent ingredients across all recipes.
ax2 = plt.subplot(gs[1, 0])
pd.Series(flatten_lists(list(data['ingredients']))).value_counts()[:20].plot(kind='barh', ax=ax2)
ax2.set_title('Most popular ingredients', fontsize=12)

# Average recipe size per cuisine. Select the numeric column BEFORE
# calling mean(): DataFrame.mean() over the list-valued 'ingredients'
# column raises a TypeError in pandas >= 2.0.
ax3 = plt.subplot(gs[1, 1])
data.groupby('cuisine')['ingredient_count'].mean().sort_values(ascending=False).plot(kind='barh', ax=ax3)
ax3.set_title('Average number of ingredients in cuisines', fontsize=12)

plt.show()
# Train word embeddings over the ingredient lists: each recipe is a
# "sentence" and each ingredient string is a token.
# NOTE(review): the keyword names size/iter are gensim 3.x; gensim 4.x
# renamed them to vector_size/epochs — confirm the installed version.
w2v = gensim.models.Word2Vec(list(data.ingredients), size=350, window=10, min_count=2, iter=20)

# Query similar ingredients through the KeyedVectors object (w2v.wv).
# Calling most_similar on the model itself is deprecated in gensim 3.x
# and removed in 4.x. Print both results — the original discarded the
# first query's output entirely.
print(w2v.wv.most_similar(['meat']))
print(w2v.wv.most_similar(['salt']))

# Inspect the learned vocabulary and one embedding.
print(w2v.wv.vocab.keys())
print(w2v.wv['romaine lettuce'])
print(type(data.ingredients))

print(len(data.ingredients))
print(len(test.ingredients))
def document_vector(doc):
    """Embed a recipe as the mean of its ingredients' word vectors.

    Ingredients missing from the Word2Vec vocabulary are dropped. If no
    ingredient is in the vocabulary, return a zero vector instead of
    letting np.mean() over an empty array produce NaNs (which would
    silently poison the classifier's feature matrix).
    """
    known = [word for word in doc if word in w2v.wv.vocab]
    if not known:
        return np.zeros(w2v.wv.vector_size, dtype=np.float32)
    # Index through w2v.wv: model-level __getitem__ is deprecated in
    # gensim 3.x and removed in 4.x.
    return np.mean(w2v.wv[known], axis=0)
# Store one document embedding per recipe in a new column.
data['doc_vector'] = data.ingredients.apply(document_vector)
test['doc_vector'] = test.ingredients.apply(document_vector)
print(data['doc_vector'].iloc[1])
print(data['cuisine'].iloc[1])

# Encode the cuisine names as integer class labels in [0, n_classes).
lb = LabelEncoder()
y = lb.fit_transform(target)
print(y)

# Feature matrices: one averaged ingredient vector per recipe.
X = data['doc_vector'].tolist()
X_test = test['doc_vector'].tolist()
# Initialize a logistic regression classifier. C is the INVERSE
# regularization strength, so C=100 means weak regularization.
clf = LogisticRegression(C=100) 
# NOTE(review): the default max_iter (100) may not be enough to converge
# on 350-dim features with this many classes, and warnings are globally
# suppressed above — confirm convergence.

# Follow this link for a primer on Logistic Regression:
# https://www.kdnuggets.com/2016/08/primer-logistic-regression-part-1.html

# Train the classifier on the averaged ingredient vectors.
clf.fit(X, y)

# Persist the trained classifier so it can be reloaded without
# retraining. The original left the dump commented out while still
# calling pickle.load, which fails with FileNotFoundError on a fresh
# run — save first, then load, using context managers so the file
# handles are closed.
filename = 'finalized_model.sav'
with open(path + '/' + filename, 'wb') as out_file:
    pickle.dump(clf, out_file)

# Reload the model from disk (round-trip sanity check).
with open(path + '/' + filename, 'rb') as in_file:
    clf = pickle.load(in_file)
def predict(ingredient_list):
  """Return the cuisine name predicted for one list of ingredient strings."""
  # Embed the ingredients, classify, then map the integer label back to
  # its cuisine name via the fitted LabelEncoder.
  embedding = document_vector(ingredient_list)
  encoded_label = clf.predict([embedding])
  return lb.inverse_transform(encoded_label)
  
  
# Spot-check the model on a hand-written ingredient list.
# ingredient_list = ['sausage links','fennel bulb','fronds','olive oil','cuban peppers','onions', 'salt']
ingredient_list = ['plain flour', 'cheese', 'ground pepper', 'salt', 'tomatoes', 'ground black pepper', 'thyme', 'eggs', 'green tomatoes', 'yellow corn meal', 'milk', 'vegetable oil']
print('Cusine ',predict(ingredient_list))
# !ls -alh
!ls gdrive/My\ Drive/
# X_test1 = X_test[:1]
# 
# Predict cuisines for the full test set, then decode the integer
# labels back to cuisine names.
y_test = clf.predict(X_test)
y_pred = lb.inverse_transform(y_test)
print(y_pred)
# Build the Kaggle submission file: one (id, cuisine) row per test recipe.
test_id = [id_ for id_ in test.id]
sub = pd.DataFrame({'id': test_id, 'cuisine': y_pred}, columns=['id', 'cuisine'])
sub.to_csv(path + '/'+'clf_output.csv', index=False)