Creating Required Environment for the Project

Create conda environment :
- conda create -n movie python=3.7 -y
Activate conda environment :
- conda activate movie
Install required dependencies for this project :
- pip install pandas
- pip install nltk
- pip install scikit-learn
- ast is part of the Python standard library, so it does not need to be installed

Importing Required Modules

import ast                                                                                                                        #for converting str to list
import nltk
import pandas as pd
import pickle
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from preprocess.preprocessing import Process
from datetime import datetime

Description of the dataset

You can get the dataset from here.

movie dataframe

This dataframe contains general information about each movie, such as budget, revenue, and runtime.

credits dataframe

This dataframe contains the movie_id, title, cast, and crew of each movie.

# Load the two raw CSV files into dataframes.
movie = pd.read_csv('data/movies.csv')
credits = pd.read_csv('data/credits.csv')

# Preview three randomly chosen rows of the movie dataframe.
movie.sample(3)

# Preview three randomly chosen rows of the credits dataframe.
credits.sample(3)

App_Logger Class

This class is responsible for writing timestamped log messages to a text file.

class App_Logger:
    """Write timestamped messages to an already-open, writable file object."""

    def __init__(self):
        """Create a logger; no state exists until ``log`` is first called."""

    def log(self, file_object, log_message):
        """Write one ``<date>/<HH:MM:SS>``, two tabs, message, newline record.

        The timestamp pieces are also stored on the instance as ``now``,
        ``date`` and ``current_time``.

        Args:
            file_object: Object with a ``write`` method that accepts a string.
            log_message: Text of the message; must be a ``str``.
        """
        stamp = datetime.now()
        self.now = stamp
        self.date = stamp.date()
        self.current_time = stamp.strftime("%H:%M:%S")

        prefix = "{}/{}".format(self.date, self.current_time)
        file_object.write(prefix + "\t\t" + log_message + "\n")

Helper Class

This class is responsible for basic NLP preprocessing of textual data in our dataframe.

class Helper:
    """Small text-cleaning helpers for the movie and credits dataframes.

    Several columns hold a Python/JSON-style list of dicts serialised as a
    string (for example ``'[{"id": 28, "name": "Action"}]'``); the ``convert*``
    and ``fetch_director`` methods parse such strings with
    ``ast.literal_eval`` and pull out the ``name`` values.
    """

    def __init__(self):
        self.stemmer = PorterStemmer()

    def convert(self, text):
        """Return the ``name`` of every entry in a stringified list of dicts.

        Args:
            text: String such as ``'[{"id": 1, "name": "Action"}]'``.

        Returns:
            List of the ``name`` values, in their original order.
        """
        return [entry['name'] for entry in ast.literal_eval(text)]

    def merge1(self, df1, df2):
        """Merge two dataframes on their shared ``title`` column."""
        return df1.merge(df2, on='title')

    def convert_cast(self, text, limit=3):
        """Return the names of the first ``limit`` cast members.

        Args:
            text: Stringified list of dicts, each with a ``name`` key.
            limit: Maximum number of names to keep (default 3, i.e. the
                first three listed cast members).

        Returns:
            List with at most ``limit`` names.
        """
        # Slicing stops after ``limit`` entries instead of walking the whole cast.
        return [entry['name'] for entry in ast.literal_eval(text)[:limit]]

    def fetch_director(self, text):
        """Return a one-element list holding the first director's name.

        Args:
            text: Stringified list of crew dicts with ``job`` and ``name`` keys.

        Returns:
            ``[name]`` of the first entry whose ``job`` is ``'Director'``, or
            an empty list when there is none.
        """
        for entry in ast.literal_eval(text):
            # ``.get`` skips malformed entries without a ``job`` key instead of raising.
            if entry.get('job') == 'Director':
                return [entry['name']]
        return []

    def remove_space(self, L):
        """Strip spaces inside every string of ``L``.

        For example ``'Anna Kendrick'`` becomes ``'AnnaKendrick'``.
        """
        return [item.replace(" ", "") for item in L]

    def stems(self, text):
        """Stem every whitespace-separated word of ``text`` and rejoin with spaces."""
        return " ".join(self.stemmer.stem(word) for word in text.split())

Preprocessing Dataframe

This class is responsible for preprocessing our dataframes and making them ready to feed into the embedding class.

class Process:
    """Turn the raw movie and credits CSVs into a ``movie_id``/``title``/``tags`` frame.

    Each public step calls the step before it, so ``drop_columns()`` runs the
    whole pipeline: merge -> select columns -> drop rows with missing values
    -> parse list columns -> remove spaces -> build ``tags`` -> keep the final
    three columns. Any exception is written to the preprocessing log and then
    re-raised.
    """

    def __init__(self):
        self.helper = Helper()
        self.movie = pd.read_csv('data/movies.csv')
        self.credits = pd.read_csv('data/credits.csv')

        # Initializing the logger object.
        # NOTE: the handle stays open for the lifetime of the instance, and
        # open() does not create the "Logs" directory, so it must already exist.
        self.file_object = open("Logs/preprocessing_log.txt", 'a+')
        self.log_writer = App_Logger()

    @staticmethod
    def _build_tags(frame):
        """Return one space-joined tag string per row of ``frame``.

        ``overview`` is expected to be a string and ``genres``, ``keywords``,
        ``cast`` and ``crew`` lists of strings, which is what the earlier
        pipeline steps produce.
        """
        columns = zip(frame['overview'], frame['genres'], frame['keywords'],
                      frame['cast'], frame['crew'])
        return [
            " ".join([overview, *genres, *keywords, *cast, *crew])
            for overview, genres, keywords, cast, crew in columns
        ]

    def merge_dataframe(self):
        """Merge the movie and credits dataframes on ``title``."""
        try:
            self.first_dataframe = self.helper.merge1(self.movie, self.credits)
            return self.first_dataframe

        except Exception as ex:
            self.log_writer.log(self.file_object, 'Error occured while running the merge_dataframe function!! Error:: %s' % ex)
            raise

    def select_columns(self):
        """Keep only the columns that are used to build recommendations."""
        try:
            self.merged_dataframe = self.merge_dataframe()

            # Keeping important columns for recommendation. The explicit copy
            # lets later steps assign columns without chained-assignment warnings.
            self.merged_dataframe = self.merged_dataframe[['movie_id','title','overview','genres','keywords','cast','crew']].copy()
            return self.merged_dataframe

        except Exception as ex:
            self.log_writer.log(self.file_object, 'Error occured while running the select_columns function. Error:: %s' % ex)
            raise

    def drop_outliers(self):
        """Drop rows with a missing value in any of the selected columns.

        The filter runs on the column-selected frame, so a movie is removed
        only when one of the recommendation columns is missing, not because
        of NaNs in unrelated columns of the full merged frame. The index is
        reset so that row labels equal row positions afterwards.
        """
        try:
            self.selected_dataframe = self.select_columns()

            # We have seen from our analysis there is 3 outlier in overview column, so we are dropping them.
            self.selected_dataframe = self.selected_dataframe.dropna().reset_index(drop=True)
            return self.selected_dataframe

        except Exception as ex:
            self.log_writer.log(self.file_object, 'Error occured while running the drop_outliers function!! Error:: %s' % ex)
            raise

    def convert_to_list(self):
        """Parse the stringified ``genres``/``keywords``/``cast``/``crew`` columns into lists."""
        try:
            self.cleaned_dataframe = self.drop_outliers()

            # Converting genres column to list of words
            self.cleaned_dataframe['genres'] = self.cleaned_dataframe['genres'].apply(self.helper.convert)

            # Converting keywords column to list of words
            self.cleaned_dataframe['keywords'] = self.cleaned_dataframe['keywords'].apply(self.helper.convert)

            # Converting cast column to list of words
            self.cleaned_dataframe['cast'] = self.cleaned_dataframe['cast'].apply(self.helper.convert_cast)

            # Fetching director name from crew column.
            self.cleaned_dataframe['crew'] = self.cleaned_dataframe['crew'].apply(self.helper.fetch_director)

            return self.cleaned_dataframe

        except Exception as ex:
            self.log_writer.log(self.file_object, 'Error occured while running the convert_to_list function!! Error:: %s' % ex)
            raise

    def remove_space(self):
        """Remove inner spaces from every entry of the list columns."""
        try:
            self.converted_to_list_dataframe = self.convert_to_list()

            # Removing space from cast, crew, genres and keywords column.
            for column in ('cast', 'crew', 'genres', 'keywords'):
                self.converted_to_list_dataframe[column] = self.converted_to_list_dataframe[column].apply(self.helper.remove_space)

            return self.converted_to_list_dataframe

        except Exception as ex:
            self.log_writer.log(self.file_object, 'Error occured while running the remove_space function Error:: %s' % ex)
            raise

    def concat_all(self):
        """Add a ``tags`` column combining overview, genres, keywords, cast and crew.

        The tag string is built row by row. Calling ``str()`` on whole columns
        would instead give every row the same text: the printed representation
        of the entire column.
        """
        try:
            self.space_removed_dataframe = self.remove_space()

            # Concatenate all text sources of each row into a single string.
            self.space_removed_dataframe['tags'] = self._build_tags(self.space_removed_dataframe)
            return self.space_removed_dataframe

        except Exception as ex:
            self.log_writer.log(self.file_object, 'Error occured while running the concat_all function. Error:: %s' % ex)
            raise

    def drop_columns(self):
        """Run the full pipeline and return only ``movie_id``, ``title`` and ``tags``."""
        try:
            self.new_dataframe = self.concat_all()

            # Selecting only required columns and creating a new dataframe.
            self.new_dataframe = self.new_dataframe[['movie_id','title','tags']]
            return self.new_dataframe

        except Exception as ex:
            self.log_writer.log(self.file_object, 'Error occured while running the drop_columns function. Error:: %s' % ex)
            raise

# Run the preprocessing pipeline; drop_columns() calls every earlier step itself.
process = Process()
dataframe = process.drop_columns()
# Preview five randomly chosen rows of the resulting dataframe.
dataframe.sample(5)

Embedding Class

This class is responsible for generating a numerical representation of the preprocessed dataframe, recommending movies, and saving the trained model.

class Embedding:
    """Vectorise the ``tags`` column, compare movies and persist the result.

    ``recommend`` and ``save_model`` each recompute the count vectors and the
    cosine-similarity matrix on every call. Any exception is written to the
    embedding log and then re-raised.
    """

    def __init__(self):
        # Initializing the logger object.
        # NOTE: the handle stays open for the lifetime of the instance, and
        # open() does not create the "Logs" directory, so it must already exist.
        self.file_object = open("Logs/embedding_log.txt", 'a+')
        self.log_writer = App_Logger()

        self.cv = CountVectorizer(max_features=5000,stop_words='english')
        self.new_df = Process().drop_columns()

    @staticmethod
    def _rank_neighbours(scores, position, top_n=5):
        """Return the positions of the ``top_n`` highest scores, excluding ``position``.

        Ties keep their original order. The queried row is excluded
        explicitly rather than assumed to sort first, so duplicate or
        identical rows cannot push it into the results.
        """
        others = (i for i in range(len(scores)) if i != position)
        ranked = sorted(others, key=lambda i: scores[i], reverse=True)
        return ranked[:top_n]

    def vectorize(self):
        """Fit the count vectoriser on ``tags`` and return the dense count matrix."""
        try:
            self.vector = self.cv.fit_transform(self.new_df['tags']).toarray()
            return self.vector

        except Exception as ex:
            self.log_writer.log(self.file_object, 'Error occured while running the vectorize function. Error:: %s' % ex)
            raise

    def cosine_similarity(self):
        """Return the pairwise cosine-similarity matrix of the tag vectors."""
        try:
            self.vectorize_array = self.vectorize()
            self.similarity = cosine_similarity(self.vectorize_array)
            return self.similarity

        except Exception as ex:
            self.log_writer.log(self.file_object, 'Error occured while running the cosine_similarity function. Error:: %s' % ex)
            raise

    def recommend(self, movie):
        """Print the titles of up to five movies most similar to ``movie``.

        Args:
            movie: Exact title to look up in the ``title`` column.

        Raises:
            IndexError: If no row has that title.
        """
        try:
            # Rows of the similarity matrix are positional, so look up the
            # row position of the title rather than its index label.
            hits = (self.new_df['title'] == movie).to_numpy().nonzero()[0]
            if len(hits) == 0:
                raise IndexError('Movie %r was not found in the dataset' % movie)
            position = int(hits[0])

            self.similarity = self.cosine_similarity()
            titles = self.new_df['title']
            for neighbour in self._rank_neighbours(self.similarity[position], position):
                print(titles.iloc[neighbour])

        except Exception as ex:
            self.log_writer.log(self.file_object, 'Error occured while running the recommend function. Error:: %s' % ex)
            raise

    def save_model(self):
        """Pickle the movie dataframe and the similarity matrix under ``trained_model/``."""
        try:
            self.similarity = self.cosine_similarity()
            # Context managers make sure both files are flushed and closed.
            with open('trained_model/movie_list.pkl','wb') as movie_file:
                pickle.dump(self.new_df, movie_file)
            with open('trained_model/similarity.pkl','wb') as similarity_file:
                pickle.dump(self.similarity, similarity_file)

        except Exception as ex:
            self.log_writer.log(self.file_object, 'Error occured while running the save_model function. Error:: %s' % ex)
            raise

# Build the embedding pipeline and print recommendations for one title.
embed = Embedding()
embed.recommend('Spider-Man 2')

Pirates of the Caribbean: At World's End
Spectre
The Dark Knight Rises
John Carter
Spider-Man 3

	genres	homepage	id	keywords	original_language	original_title	overview	popularity	production_companies	production_countries	release_date	runtime	spoken_languages	status	tagline	title	vote_average	vote_count
1274	[{"id": 35, "name": "Comedy"}, {"id": 14, "nam...	NaN	56715	[{"id": 4379, "name": "time travel"}, {"id": 9...	en	Just Visiting	A knight and his valet are plagued by a witch,...	10.844882	[{"name": "Gaumont", "id": 9}, {"name": "Holly...	[{"iso_3166_1": "FR", "name": "France"}, {"iso...	2001-04-06	88.0	[{"iso_639_1": "en", "name": "English"}, {"iso...	Released	They're not just from another time, they're fr...	Just Visiting	4.8	171
3249	[{"id": 12, "name": "Adventure"}]	http://www.focusfeatures.com/kicks	385736	[{"id": 3405, "name": "blow job"}, {"id": 1173...	en	Kicks	When his hard-earned kicks get snatched by a l...	3.467923	[{"name": "Bystorm Films", "id": 2903}, {"name...	[{"iso_3166_1": "US", "name": "United States o...	2016-09-09	80.0	[{"iso_639_1": "en", "name": "English"}]	Released	They aren't just shoes	Kicks	7.5	18
4571	[]	NaN	328307	[]	en	Rise of the Entrepreneur: The Search for a Bet...	The world is changing faster than ever. Techno...	0.052942	[]	[]	2014-11-20	0.0	[]	Released	NaN	Rise of the Entrepreneur: The Search for a Bet...	8.0	1

	movie_id	title	cast	crew
1275	1272	Sunshine	[{"cast_id": 5, "character": "Robert Capa", "c...	[{"credit_id": "5378a9080e0a261425004f95", "de...
4677	53256	Three	[{"cast_id": 2, "character": "Hanna", "credit_...	[{"credit_id": "52fe485bc3a36847f816358d", "de...
2741	9644	National Lampoon's Loaded Weapon 1	[{"cast_id": 12, "character": "Sgt. Jack Colt"...	[{"credit_id": "52fe4515c3a36847f80bb8c9", "de...

	movie_id	title	tags
911	187017	22 Jump Street	0 In the 22nd century, a paraplegic Mari...
131	19585	G-Force	0 In the 22nd century, a paraplegic Mari...
3814	46420	The Loved Ones	0 In the 22nd century, a paraplegic Mari...
2650	16353	Ong Bak 2	0 In the 22nd century, a paraplegic Mari...
406	51052	Arthur Christmas	0 In the 22nd century, a paraplegic Mari...