Creating Required Environment for the Project

  • Create conda environment :

    • conda create -n movie python=3.7 -y
  • Activate conda environment :

    • conda activate movie
  • Install required dependencies for this project :

    • pip install pandas

    • pip install nltk

    • pip install scikit-learn

    • ast and pickle are part of the Python standard library, so they do not need to be installed with pip

Importing Required Modules

import ast                                                                                                                        #for converting str to list
import nltk
import pandas as pd
import pickle
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from preprocess.preprocessing import Process
from datetime import datetime

Description of the dataset

  • You can get the dataset from here.

movie dataframe

  • This dataframe contains all the information about the movies, such as budget, revenue, runtime, etc.

credits dataframe

  • This dataframe contains the movie_id, title, cast and crew of each movie.
# Load the two raw CSV files used by the rest of the notebook.
movie = pd.read_csv('data/movies.csv')
credits = pd.read_csv('data/credits.csv')
# Preview three randomly sampled rows of the movie table.
movie.sample(3)
budget genres homepage id keywords original_language original_title overview popularity production_companies production_countries release_date revenue runtime spoken_languages status tagline title vote_average vote_count
1274 0 [{"id": 35, "name": "Comedy"}, {"id": 14, "nam... NaN 56715 [{"id": 4379, "name": "time travel"}, {"id": 9... en Just Visiting A knight and his valet are plagued by a witch,... 10.844882 [{"name": "Gaumont", "id": 9}, {"name": "Holly... [{"iso_3166_1": "FR", "name": "France"}, {"iso... 2001-04-06 0 88.0 [{"iso_639_1": "en", "name": "English"}, {"iso... Released They're not just from another time, they're fr... Just Visiting 4.8 171
3249 0 [{"id": 12, "name": "Adventure"}] http://www.focusfeatures.com/kicks 385736 [{"id": 3405, "name": "blow job"}, {"id": 1173... en Kicks When his hard-earned kicks get snatched by a l... 3.467923 [{"name": "Bystorm Films", "id": 2903}, {"name... [{"iso_3166_1": "US", "name": "United States o... 2016-09-09 0 80.0 [{"iso_639_1": "en", "name": "English"}] Released They aren't just shoes Kicks 7.5 18
4571 0 [] NaN 328307 [] en Rise of the Entrepreneur: The Search for a Bet... The world is changing faster than ever. Techno... 0.052942 [] [] 2014-11-20 0 0.0 [] Released NaN Rise of the Entrepreneur: The Search for a Bet... 8.0 1
# Preview three randomly sampled rows of the credits table.
credits.sample(3)
movie_id title cast crew
1275 1272 Sunshine [{"cast_id": 5, "character": "Robert Capa", "c... [{"credit_id": "5378a9080e0a261425004f95", "de...
4677 53256 Three [{"cast_id": 2, "character": "Hanna", "credit_... [{"credit_id": "52fe485bc3a36847f816358d", "de...
2741 9644 National Lampoon's Loaded Weapon 1 [{"cast_id": 12, "character": "Sgt. Jack Colt"... [{"credit_id": "52fe4515c3a36847f80bb8c9", "de...

APP_Logger Class

  • This class is responsible for writing timestamped log messages to a text file.
class App_Logger:
    """Minimal logger that appends timestamped lines to an already-open file."""

    def __init__(self):
        # Nothing to configure: the target file is supplied on every log() call.
        pass

    def log(self, file_object, log_message):
        """Write one timestamped line to *file_object*.

        The line has the form ``<YYYY-MM-DD>/<HH:MM:SS>``, two tab characters,
        the message, and a trailing newline.  The timestamp pieces are also
        kept on the instance as ``now``, ``date`` and ``current_time``.

        Args:
            file_object: Any object with a ``write(str)`` method.
            log_message: The text to record; must be a ``str``.
        """
        self.now = datetime.now()
        self.date = self.now.date()
        self.current_time = self.now.strftime("%H:%M:%S")

        stamp = f"{self.date}/{self.current_time}"
        # Plain concatenation keeps the TypeError for non-string messages.
        file_object.write(stamp + "\t\t" + log_message + "\n")

Helper Class

  • This class is responsible for basic NLP preprocessing of textual data in our dataframe.
class Helper:
    """Small text/list utilities used while preparing the movie dataframes."""

    def __init__(self):
        self.stemmer = PorterStemmer()

    def convert(self, text):
        """Parse a stringified list of dicts and return every entry's 'name'.

        Example: '[{"id": 35, "name": "Comedy"}]' -> ['Comedy']
        """
        return [entry['name'] for entry in ast.literal_eval(text)]

    def merge1(self, df1, df2):
        """Return *df1* merged with *df2* on their shared 'title' column."""
        return df1.merge(df2, on='title')

    def convert_cast(self, text):
        """Parse a stringified cast list and return the first three names."""
        names = []
        for position, member in enumerate(ast.literal_eval(text)):
            # Only the top 3 billed cast members are kept.
            if position >= 3:
                break
            names.append(member['name'])
        return names

    def fetch_director(self, text):
        """Parse a stringified crew list and return the first director.

        The result is a one-element list holding the name of the first entry
        whose 'job' is 'Director', or an empty list when there is none.
        """
        for member in ast.literal_eval(text):
            if member['job'] == 'Director':
                return [member['name']]
        return []

    def remove_space(self, L):
        """Strip spaces inside every string of *L*.

        Example: ['Anna Kendrick'] -> ['AnnaKendrick']
        """
        return [item.replace(" ", "") for item in L]

    def stems(self, text):
        """Stem every whitespace-separated token of *text* and re-join with spaces."""
        return " ".join(self.stemmer.stem(token) for token in text.split())

Preprocessing Dataframe

  • This class is responsible for preprocessing our dataframes and making them ready to feed into the Embedding class.
class Process:
    """Preprocessing pipeline that turns the raw movie/credits CSVs into tags.

    Each public step calls the previous one, so calling ``drop_columns()``
    runs the whole chain:

        merge_dataframe -> select_columns -> drop_outliers -> convert_to_list
        -> remove_space -> concat_all -> drop_columns

    Any exception raised by a step is written to the preprocessing log and
    then re-raised unchanged.
    """

    def __init__(self):
        import os  # local import: only needed to make sure the log folder exists

        self.helper = Helper()
        self.movie = pd.read_csv('data/movies.csv')
        self.credits = pd.read_csv('data/credits.csv')

        # Initializing the logger object
        os.makedirs("Logs", exist_ok=True)  # open() below fails if the folder is missing
        self.file_object = open("Logs/preprocessing_log.txt", 'a+')
        self.log_writer = App_Logger()

    def close(self):
        """Close the log file opened by the constructor (safe to call repeatedly)."""
        if not self.file_object.closed:
            self.file_object.close()

    def merge_dataframe(self):
        """Merge the movie and credits tables on 'title' and return the result."""
        try:
            self.first_dataframe = self.helper.merge1(self.movie, self.credits)
            return self.first_dataframe

        except Exception as ex:
            self.log_writer.log(self.file_object, 'Error occured while running the merge_dataframe function!! Error:: %s' % ex)
            raise

    def select_columns(self):
        """Return only the columns of the merged table needed for recommendation."""
        try:
            self.merged_dataframe = self.merge_dataframe()

            # Keeping important columns for recommendation.  The copy makes the
            # later column assignments act on an independent frame, not a view.
            self.merged_dataframe = self.merged_dataframe[
                ['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']
            ].copy()
            return self.merged_dataframe

        except Exception as ex:
            self.log_writer.log(self.file_object, 'Error occured while running the select_columns function. Error:: %s' % ex)
            raise

    def drop_outliers(self):
        """Drop rows with missing values from the *selected* columns.

        The rows are removed from the column-selected frame (not from the
        full merged frame, whose unrelated columns such as ``homepage``
        contain many NaNs).  The index is reset so that row labels equal row
        positions, which is what positional lookups into a similarity matrix
        built from this frame require.
        """
        try:
            self.selected_dataframe = self.select_columns()

            # A few rows have no overview, so they are dropped.
            self.selected_dataframe = self.selected_dataframe.dropna().reset_index(drop=True)
            return self.selected_dataframe

        except Exception as ex:
            self.log_writer.log(self.file_object, 'Error occured while running the drop_outliers function!! Error:: %s' % ex)
            raise

    def convert_to_list(self):
        """Turn the stringified genres/keywords/cast/crew columns into name lists."""
        try:
            self.cleaned_dataframe = self.drop_outliers()

            # Converting genres column to list of words
            self.cleaned_dataframe['genres'] = self.cleaned_dataframe['genres'].apply(self.helper.convert)

            # Converting keywords column to list of words
            self.cleaned_dataframe['keywords'] = self.cleaned_dataframe['keywords'].apply(self.helper.convert)

            # Converting cast column to list of words
            self.cleaned_dataframe['cast'] = self.cleaned_dataframe['cast'].apply(self.helper.convert_cast)

            # Fetching director name from crew column.
            self.cleaned_dataframe['crew'] = self.cleaned_dataframe['crew'].apply(self.helper.fetch_director)

            return self.cleaned_dataframe

        except Exception as ex:
            self.log_writer.log(self.file_object, 'Error occured while running the convert_to_list function!! Error:: %s' % ex)
            raise

    def remove_space(self):
        """Remove inner spaces from every name in the list-valued columns."""
        try:
            self.converted_to_list_dataframe = self.convert_to_list()

            # Removing space from cast, crew, genres and keywords column.
            for column in ('cast', 'crew', 'genres', 'keywords'):
                self.converted_to_list_dataframe[column] = (
                    self.converted_to_list_dataframe[column].apply(self.helper.remove_space)
                )

            return self.converted_to_list_dataframe

        except Exception as ex:
            self.log_writer.log(self.file_object, 'Error occured while running the remove_space function Error:: %s' % ex)
            raise

    def concat_all(self):
        """Add a per-row 'tags' column built from overview, genres, keywords, cast, crew.

        For every row the overview text and the names of the four list
        columns are joined with single spaces.  (Calling ``str()`` on a whole
        column would instead put the same repr of the entire Series in every
        row.)
        """
        try:
            self.space_removed_dataframe = self.remove_space()
            frame = self.space_removed_dataframe

            # Concatenate all, row by row.
            frame['tags'] = [
                " ".join([str(overview), *genres, *keywords, *cast, *crew])
                for overview, genres, keywords, cast, crew in zip(
                    frame['overview'], frame['genres'], frame['keywords'],
                    frame['cast'], frame['crew'],
                )
            ]
            return self.space_removed_dataframe

        except Exception as ex:
            self.log_writer.log(self.file_object, 'Error occured while running the concat_all function. Error:: %s' % ex)
            raise

    def drop_columns(self):
        """Run the full pipeline and return a frame with movie_id, title and tags."""
        try:
            self.new_dataframe = self.concat_all()

            # Selecting only required columns and creating a new dataframe.
            self.new_dataframe = self.new_dataframe[['movie_id', 'title', 'tags']]
            return self.new_dataframe

        except Exception as ex:
            self.log_writer.log(self.file_object, 'Error occured while running the drop_columns function. Error:: %s' % ex)
            raise
# Run the whole preprocessing chain; drop_columns() triggers every earlier step.
process = Process()
dataframe = process.drop_columns()
# Preview five randomly sampled rows of the resulting movie_id/title/tags table.
dataframe.sample(5)
movie_id title tags
911 187017 22 Jump Street 0 In the 22nd century, a paraplegic Mari...
131 19585 G-Force 0 In the 22nd century, a paraplegic Mari...
3814 46420 The Loved Ones 0 In the 22nd century, a paraplegic Mari...
2650 16353 Ong Bak 2 0 In the 22nd century, a paraplegic Mari...
406 51052 Arthur Christmas 0 In the 22nd century, a paraplegic Mari...

Embedding Class

  • This class is responsible for generating a numerical representation of the preprocessed dataframe, recommending movies, and saving the trained model.
class Embedding:
    """Vectorize the movie tags, compute similarities, recommend and persist.

    The preprocessed frame (``movie_id``, ``title``, ``tags``) is obtained
    from ``Process().drop_columns()``.  Any exception raised by a method is
    written to the embedding log and then re-raised unchanged.
    """

    def __init__(self):
        import os  # local import: only needed to make sure the log folder exists

        # Initializing the logger object
        os.makedirs("Logs", exist_ok=True)  # open() below fails if the folder is missing
        self.file_object = open("Logs/embedding_log.txt", 'a+')
        self.log_writer = App_Logger()

        self.cv = CountVectorizer(max_features=5000,stop_words='english')
        self.new_df = Process().drop_columns()

    def close(self):
        """Close the log file opened by the constructor (safe to call repeatedly)."""
        if not self.file_object.closed:
            self.file_object.close()

    def vectorize(self):
        """Fit the count vectorizer on the tags and return the dense count matrix."""
        try:
            self.vector = self.cv.fit_transform(self.new_df['tags']).toarray()
            return self.vector

        except Exception as ex:
            self.log_writer.log(self.file_object, 'Error occured while running the vectorize function. Error:: %s' % ex)
            raise

    def cosine_similarity(self):
        """(Re)compute and return the pairwise cosine-similarity matrix of the tags."""
        try:
            self.vectorize_array = self.vectorize()
            # Inside a method body this name resolves to the module-level
            # function imported at the top of the file, not to this method.
            self.similarity = cosine_similarity(self.vectorize_array)
            return self.similarity

        except Exception as ex:
            self.log_writer.log(self.file_object, 'Error occured while running the cosine_similarity function. Error:: %s' % ex)
            raise

    def _similarity_matrix(self):
        """Return the similarity matrix, computing it only if not yet available.

        The matrix depends only on ``self.new_df``, which this class never
        modifies after construction, so one computation can be reused.  Call
        ``cosine_similarity()`` explicitly to force a refresh.
        """
        cached = getattr(self, 'similarity', None)
        return self.cosine_similarity() if cached is None else cached

    def recommend(self, movie, top_n=5):
        """Print the titles of the ``top_n`` movies most similar to *movie*.

        Args:
            movie: Exact title to look up; the first matching row is used.
            top_n: Number of recommendations to print (default 5).

        Raises:
            IndexError: If *movie* is not present in the dataset.
        """
        try:
            similarity = self._similarity_matrix()
            titles = list(self.new_df['title'])

            # The similarity matrix is indexed by row POSITION, so the title
            # is located by position rather than by dataframe index label
            # (labels and positions differ once rows have been dropped).
            try:
                position = titles.index(movie)
            except ValueError:
                raise IndexError('Movie %r not found in the dataset' % (movie,)) from None

            ranked = sorted(enumerate(similarity[position]), reverse=True, key=lambda pair: pair[1])
            # Exclude the queried movie explicitly: with ties or an all-zero
            # tag vector it is not guaranteed to be ranked first.
            neighbours = [row for row, _ in ranked if row != position][:top_n]
            for row in neighbours:
                print(titles[row])

        except Exception as ex:
            self.log_writer.log(self.file_object, 'Error occured while running the recommend function. Error:: %s' % ex)
            raise

    def save_model(self):
        """Pickle the preprocessed frame and the similarity matrix under trained_model/."""
        import os  # local import: only needed to make sure the output folder exists

        try:
            similarity = self._similarity_matrix()
            os.makedirs('trained_model', exist_ok=True)
            # Context managers make sure both files are flushed and closed.
            with open('trained_model/movie_list.pkl', 'wb') as movie_file:
                pickle.dump(self.new_df, movie_file)
            with open('trained_model/similarity.pkl', 'wb') as similarity_file:
                pickle.dump(similarity, similarity_file)

        except Exception as ex:
            self.log_writer.log(self.file_object, 'Error occured while running the save_model function. Error:: %s' % ex)
            raise
# Build the embedding pipeline and print the movies most similar to the given title.
embed = Embedding()
embed.recommend('Spider-Man 2')
Pirates of the Caribbean: At World's End
Spectre
The Dark Knight Rises
John Carter
Spider-Man 3