import pandas as pd
import numpy as np
import seaborn as sns
from plotnine import ggplot,geom_bar, aes
import matplotlib.pyplot as plt
%matplotlib inline
import missingno as msno
from collections import Counter


# Load the Netflix titles catalogue from a CSV file expected in the current
# working directory, then preview the first rows.
netflix_df = pd.read_csv('netflix_titles.csv')
netflix_df.head()


# (rows, columns) of the dataset as loaded, before any cleaning.
netflix_df.shape

(8807, 12)


# Draw missingno's matrix plot for the whole frame to see where values are
# missing.
msno.matrix(netflix_df)
plt.show()


# Report, per column, whether it holds at least one missing value.
print('\nColumns with missing value:', netflix_df.isna().any(), sep='\n')

Columns with missing value:
show_id         False
type            False
title           False
director         True
cast             True
country          True
date_added       True
release_year    False
rating           True
duration         True
listed_in       False
description     False
dtype: bool


# Percentage of missing values per column; show the six highest.
(
    netflix_df.isna()
    .mean()
    .mul(100)
    .sort_values(ascending=False)
    .head(6)
)

director      29.908028
country        9.435676
cast           9.367549
date_added     0.113546
rating         0.045418
duration       0.034064
dtype: float64


# Fill the three text columns that contain missing values with explicit
# placeholder strings so those rows are kept.
#
# The filled column is assigned back to the frame instead of calling
# fillna(inplace=True) on `netflix_df.<column>`: that form is chained
# assignment through an inplace method, which pandas warns about and which
# leaves the DataFrame unchanged when Copy-on-Write is enabled.
netflix_df["director"] = netflix_df["director"].fillna("No Director")
netflix_df["cast"] = netflix_df["cast"].fillna("No Cast")
netflix_df["country"] = netflix_df["country"].fillna("Country Unavailable")


# Remove (in place) every row that lacks a date_added, rating or duration
# value; these rows are dropped rather than filled with a placeholder.
netflix_df.dropna(subset=["date_added", "rating", "duration"], inplace=True)


# Re-check, per column, whether any missing value is left after cleaning.
print('\nColumns with missing value:', netflix_df.isna().any(), sep='\n')

Columns with missing value:
show_id         False
type            False
title           False
director        False
cast            False
country         False
date_added      False
release_year    False
rating          False
duration        False
listed_in       False
description     False
dtype: bool


# (rows, columns) of the dataset after the cleaning steps.
netflix_df.shape

(8790, 12)


# Pie chart of how the titles split across the values of the `type` column.
# The counts are computed once and reused for both wedge sizes and labels.
type_counts = netflix_df.type.value_counts()
plt.figure(figsize=(12, 6))
plt.title("Percentage of Netflix Titles as either Movies or TV Shows")
plt.pie(
    type_counts,
    explode=(0.01, 0.01),
    labels=type_counts.index,
    colors=['#b1a7a6', "#a4161a"],
    autopct="%1.2f%%",
)
plt.show()


# Inspect the dtype of every column.
netflix_df.dtypes

show_id         object
type            object
title           object
director        object
cast            object
country         object
date_added      object
release_year     int64
rating          object
duration        object
listed_in       object
description     object
dtype: object


# Convert date_added to a datetime column.
#
# Surrounding whitespace is stripped first: pandas >= 2.0 infers a single
# format from the first value and raises ValueError for any later value that
# does not match it exactly (for example one with a leading space).
# The dtype guard keeps this cell safe to re-run once the column has already
# been converted, because the .str accessor only works on string data.
date_added = netflix_df['date_added']
if not pd.api.types.is_datetime64_any_dtype(date_added):
    date_added = date_added.str.strip()
netflix_df['date_added'] = pd.to_datetime(date_added)
netflix_df.head()


# Earliest calendar year found in date_added.
pd.DatetimeIndex(netflix_df['date_added']).year.min()

2008


# Line chart of the number of titles added per year, one line per `type`.
added_year = pd.DatetimeIndex(netflix_df['date_added']).year
titles_per_year = (
    netflix_df.groupby([added_year, 'type'])['type']
    .count()
    .unstack(level=1)  # one column per content type
)
titles_per_year.plot(
    kind='line',
    figsize=(15, 8),
    color=['#b1a7a6', '#a4161a'],
    linewidth=4,
)
# Fix the x-axis to 2008-2021 with one tick per year.
plt.xticks(np.arange(2008, 2022, step=1))
plt.xlim([2008, 2021])
plt.show()


# Count how often each genre label occurs across all titles.
genre = netflix_df['listed_in']

# Glue every title's genre string into one comma-separated string, remove the
# single space on either side of each comma, then split back into labels.
all_genres_joined = ','.join(genre)
all_genres_compact = all_genres_joined.replace(' ,', ',').replace(', ', ',')
seperated_genre = all_genres_compact.split(',')

genre_tally = dict(Counter(seperated_genre))
genre_count = pd.Series(genre_tally).sort_values(ascending=False)
genre_count

International Movies            2752
Dramas                          2426
Comedies                        1674
International TV Shows          1349
Documentaries                    869
Action & Adventure               859
TV Dramas                        762
Independent Movies               756
Children & Family Movies         641
Romantic Movies                  616
Thrillers                        577
TV Comedies                      573
Crime TV Shows                   469
Kids' TV                         448
Docuseries                       394
Music & Musicals                 375
Romantic TV Shows                370
Horror Movies                    357
Stand-Up Comedy                  343
Reality TV                       255
British TV Shows                 252
Sci-Fi & Fantasy                 243
Sports Movies                    219
Anime Series                     174
Spanish-Language TV Shows        173
TV Action & Adventure            167
Korean TV Shows                  151
Classic Movies                   116
LGBTQ Movies                     102
TV Mysteries                      98
Science & Nature TV               92
TV Sci-Fi & Fantasy               83
TV Horror                         75
Anime Features                    71
Cult Movies                       71
Teen TV Shows                     69
Faith & Spirituality              65
TV Thrillers                      57
Stand-Up Comedy & Talk Shows      56
Movies                            53
Classic & Cult TV                 26
TV Shows                          16
dtype: int64


# Horizontal bar chart of the first 20 entries of genre_count
# (counts on the x-axis, genre labels on the y-axis).
genre_top = genre_count[:20]
plt.figure(figsize=(20, 12))
# x and y are passed by keyword: seaborn deprecated positional x/y arguments
# and, from version 0.12, accepts only `data` positionally.
sns.barplot(x=genre_top, y=genre_top.index, palette="RdGy")
plt.show()

C:\Users\PC\AppData\Local\Programs\Python\Python39\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.


# Build a (country, genre) -> count table.
#
# Each title is attributed to the first country in its comma-separated
# `country` string and paired with its raw `listed_in` genre string.
netflix_genre_country = pd.DataFrame([
    netflix_df['country'].apply(lambda x: x.split(',')[0]),
    netflix_df['listed_in'],
])
# The frame above has one row per field; transpose to one row per title.
netflix_genre_country_t = netflix_genre_country.T

# Split each genre string into one row per genre. The labels are separated by
# ", ", so every label after the first keeps a leading space after splitting
# on ","; strip it, otherwise the same genre is counted under two different
# labels (e.g. "Dramas" and " Dramas").
netflix_df_exploded = (
    netflix_genre_country_t
    .set_index(['country'])
    .apply(lambda x: x.str.split(',').explode().str.strip())
    .reset_index()
)

# Count occurrences of each (country, listed_in) pair, most frequent first.
country_count_df = netflix_df_exploded.value_counts().reset_index(name='counts')
country_count_df


# Genre counts for the rows whose country is 'Saudi Arabia', with the
# now-constant country column removed and the index renumbered from 0.
is_saudi_arabia = country_count_df['country'] == 'Saudi Arabia'
sa_count = (
    country_count_df[is_saudi_arabia]
    .drop(columns='country')
    .reset_index(drop=True)
)
sa_count


# Bar chart of the Saudi Arabia genre counts (genre on x, count on y).
sa_count.plot(kind='bar', x='listed_in', y='counts', color='#a4161a')
plt.xlabel("Genre")
plt.title("Most popular genre produced by Saudi Arabia on Netflix")
plt.show()


# Number of titles per distinct value of the raw `country` column, most
# frequent first, as a one-column DataFrame; keep the first 14 rows.
# NOTE(review): the raw column is counted as-is, so the "Country Unavailable"
# placeholder is ranked like a real country and multi-country strings are
# counted as their own values rather than per country.
country_count = (
    netflix_df['country']
    .value_counts()
    .sort_values(ascending=False)
    .to_frame()
)
topcountries = country_count.iloc[:14]
topcountries


# Bar chart of the title counts held in topcountries.
topcountries.plot(kind='bar', color='#a4161a')
plt.xlabel("Country")
plt.title("Country with most content produced on Netflix")
# Replace the automatic legend with an empty one.
plt.legend([])
plt.show()


# Month number in which each title was added, alongside its type.
# The month is kept numeric; an earlier string-literal block that mapped the
# numbers to month names was dead code (never executed as a statement with
# effect) and has been removed.
netflix_date_df = pd.DataFrame({
    'content_added_month': netflix_df['date_added'].dt.month,
    'type': netflix_df['type'],
})
netflix_date_df


# Grouped bar chart of the number of titles added in each calendar month,
# one bar per `type`.
monthly_counts = (
    netflix_date_df.groupby(['content_added_month', 'type'])['type']
    .count()
    .unstack(level=1)  # one column per content type
    .sort_index()      # months in ascending numeric order
)
monthly_counts.plot(kind='bar', subplots=False, figsize=(15, 8), colormap="RdGy")

my_xticks = ["January", "February", "March", "April", "May", "June", "July", "August",  "September", "October", "November", "December"]
# Look each label up from the month number actually present in the index
# instead of assigning the 12 names by position, so the bars stay correctly
# labelled even if some month has no titles at all.
month_labels = [my_xticks[int(month) - 1] for month in monthly_counts.index]
y_pos = np.arange(len(month_labels))
plt.xticks(y_pos, month_labels, rotation=45, horizontalalignment='right')
plt.show()

	show_id	type	title	director	cast	country	date_added	release_year	rating	duration	listed_in	description
0	s1	Movie	Dick Johnson Is Dead	Kirsten Johnson	NaN	United States	September 25, 2021	2020	PG-13	90 min	Documentaries	As her father nears the end of his life, filmm...
1	s2	TV Show	Blood & Water	NaN	Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...	South Africa	September 24, 2021	2021	TV-MA	2 Seasons	International TV Shows, TV Dramas, TV Mysteries	After crossing paths at a party, a Cape Town t...
2	s3	TV Show	Ganglands	Julien Leclercq	Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...	NaN	September 24, 2021	2021	TV-MA	1 Season	Crime TV Shows, International TV Shows, TV Act...	To protect his family from a powerful drug lor...
3	s4	TV Show	Jailbirds New Orleans	NaN	NaN	NaN	September 24, 2021	2021	TV-MA	1 Season	Docuseries, Reality TV	Feuds, flirtations and toilet talk go down amo...
4	s5	TV Show	Kota Factory	NaN	Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...	India	September 24, 2021	2021	TV-MA	2 Seasons	International TV Shows, Romantic TV Shows, TV ...	In a city of coaching centers known to train I...

	show_id	type	title	director	cast	country	date_added	release_year	rating	duration	listed_in	description
0	s1	Movie	Dick Johnson Is Dead	Kirsten Johnson	No Cast	United States	2021-09-25	2020	PG-13	90 min	Documentaries	As her father nears the end of his life, filmm...
1	s2	TV Show	Blood & Water	No Director	Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...	South Africa	2021-09-24	2021	TV-MA	2 Seasons	International TV Shows, TV Dramas, TV Mysteries	After crossing paths at a party, a Cape Town t...
2	s3	TV Show	Ganglands	Julien Leclercq	Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...	Country Unavailable	2021-09-24	2021	TV-MA	1 Season	Crime TV Shows, International TV Shows, TV Act...	To protect his family from a powerful drug lor...
3	s4	TV Show	Jailbirds New Orleans	No Director	No Cast	Country Unavailable	2021-09-24	2021	TV-MA	1 Season	Docuseries, Reality TV	Feuds, flirtations and toilet talk go down amo...
4	s5	TV Show	Kota Factory	No Director	Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...	India	2021-09-24	2021	TV-MA	2 Seasons	International TV Shows, Romantic TV Shows, TV ...	In a city of coaching centers known to train I...

	country	listed_in	counts
0	India	International Movies	807
1	United States	Documentaries	429
2	United States	Dramas	429
3	India	Dramas	404
4	United States	Comedies	374
...	...	...	...
1436	Mexico	Thrillers	1
1437	Mexico	Classic Movies	1
1438	Mexico	Docuseries	1
1439	Mexico	International Movies	1
1440	Zimbabwe	Comedies	1

	country
United States	2809
India	972
Country Unavailable	829
United Kingdom	418
Japan	243
South Korea	199
Canada	181
Spain	145
France	124
Mexico	110
Egypt	106
Turkey	105
Nigeria	95
Australia	85

Netflix EDA and Visualization

Background

About the Data

Importing required packages

Loading the dataset

Cleaning the data

EDA and Visualization

What is Saudi Arabia's top genre?

Which country produces the most content?

What's the best month to release content?

Conclusion

	listed_in	counts
0	International Movies	7
1	Comedies	5
2	International TV Shows	3
3	Romantic Movies	2
4	Dramas	2
5	Independent Movies	2
6	TV Dramas	2
7	TV Shows	1
8	Dramas	1
9	TV Comedies	1