Analysis on Netflix's Worldwide Movies and TV Shows in 2021¶
Created By: Shayan Sobhani
Introduction¶
We are working with a streaming-video catalogue and building a supervised model that uses metadata to tell children’s titles apart from general-audience titles. We want to find out which features, such as genre tags, runtime, release year, and language, carry the most weight in that distinction. We also plan to compare different algorithms, for example logistic regression and random forests, to see which one does the best job of separating kids content from non-kids content and which feature-engineering steps improve performance the most.
Getting these labels right matters because streaming services need accurate age-appropriate recommendations and reliable parental controls. Automating the tagging process saves time as new shows and movies are added and reduces the need for manual review. It also supports targeted marketing of family collections and guides decisions about acquiring new children’s programming. Finally, this initial work on classification lays a foundation for later analyses, such as predicting runtime or assigning official content ratings.
This project uses data on Netflix movies and TV shows. We display different plots and create dataframes to draw conclusions about the dataset.
Data Curation¶
The dataset that we used is gathered from here: Kaggle Dataset!
We chose to investigate Netflix shows and movies because Netflix is an extremely well-known streaming service used all around the world. Depending on where you are from, Netflix's catalogue is region dependent, but we found this dataset unique because it shows all content globally! The dataset was collected in the middle of 2021, when Netflix had roughly 200 million users worldwide. It allows us to explore international media trends and analyze the variety that Netflix offers.
The dataset, netflix_titles.csv, contains information about the TV shows and movies available on Netflix as of 2021. The dataset includes 8,807 entries with a mix of categorical, temporal, and textual features! It includes attributes such as the title, director, cast, country, release year, description, and more! We used this dataset to observe the different movies and shows that are on Netflix and learn more about the variety of options available.
Below are the imported libraries that are needed. We use Pandas to bring the data in and store it in Pandas dataframes.
# Imports the libraries needed
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from google.colab import drive
from scipy.stats import ttest_1samp, pearsonr, chisquare
drive.mount('/content/drive')
df = pd.read_csv('/content/drive/MyDrive/320/netflix_titles.csv')
df.head()
Mounted at /content/drive
| show_id | type | title | director | cast | country | date_added | release_year | rating | duration | listed_in | description | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | s1 | Movie | Dick Johnson Is Dead | Kirsten Johnson | NaN | United States | September 25, 2021 | 2020 | PG-13 | 90 min | Documentaries | As her father nears the end of his life, filmm... |
| 1 | s2 | TV Show | Blood & Water | NaN | Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban... | South Africa | September 24, 2021 | 2021 | TV-MA | 2 Seasons | International TV Shows, TV Dramas, TV Mysteries | After crossing paths at a party, a Cape Town t... |
| 2 | s3 | TV Show | Ganglands | Julien Leclercq | Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi... | NaN | September 24, 2021 | 2021 | TV-MA | 1 Season | Crime TV Shows, International TV Shows, TV Act... | To protect his family from a powerful drug lor... |
| 3 | s4 | TV Show | Jailbirds New Orleans | NaN | NaN | NaN | September 24, 2021 | 2021 | TV-MA | 1 Season | Docuseries, Reality TV | Feuds, flirtations and toilet talk go down amo... |
| 4 | s5 | TV Show | Kota Factory | NaN | Mayur More, Jitendra Kumar, Ranjan Raj, Alam K... | India | September 24, 2021 | 2021 | TV-MA | 2 Seasons | International TV Shows, Romantic TV Shows, TV ... | In a city of coaching centers known to train I... |
The dataset consists of data with the following features:
- show_id: A unique identifier for each show or movie
- type: Specifies whether the entry is a Movie or a TV Show
- title: The title of the movie or the show
- director: The director's name, which is missing (NaN) for some entries.
- cast: A comma-separated list of the main actors. Some entries may also be missing.
- country: The country or countries where the film was produced.
- date_added: The date when the title was added to Netflix. Some entries are missing.
- release_year: The year the show or movie was released.
- rating: The age rating of the content, such as R, TV-14, or PG-13.
- duration: For movies, it says the length of the movie in minutes. For TV shows, it says the number of seasons it has.
- listed_in: One or more categories or genres the content belongs to.
- description: The short summary of the show or movie.
Data Preprocessing¶
We cleaned and prepared the data for analysis. This involved parsing dates, handling missing values, and splitting content into movies and TV shows. We parsed relevant fields like duration and date_added to create a new column duration_num to normalize the duration across TV shows and movies. By organizing the data into structured pandas dataframes, we can split the data into subsets to simplify our data analysis!
df['date_added'] = pd.to_datetime(df['date_added'], errors='coerce')
def extract_duration(duration, show_type):
    if pd.isna(duration):
        # Handles the missing values
        return np.nan
    if show_type == 'Movie':
        # Extracts the minutes if movie
        return int(duration.split()[0])
    elif show_type == 'TV Show':
        # Extracts the seasons if show
        return int(duration.split()[0])
    else:
        return np.nan
df['duration_num'] = df.apply(lambda row: extract_duration(row['duration'], row['type']), axis=1)
# Splitting the data set into movies and tv shows dataframes!
movies = df[df['type'] == 'Movie'].copy()
tv_shows = df[df['type'] == 'TV Show'].copy()
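As a quick sanity check of the parsing step, the helper can be exercised on a few hand-made duration strings. This is a standalone re-implementation of the same logic (both branches simply take the leading integer), not a new method:

```python
import numpy as np
import pandas as pd

def extract_duration(duration, show_type):
    # Mirrors the helper above: take the leading integer of the duration string
    if pd.isna(duration):
        return np.nan
    if show_type in ('Movie', 'TV Show'):
        return int(duration.split()[0])
    return np.nan

print(extract_duration('90 min', 'Movie'))       # 90 (minutes)
print(extract_duration('2 Seasons', 'TV Show'))  # 2 (seasons)
print(extract_duration(np.nan, 'Movie'))         # nan (missing)
```

Note that the resulting duration_num column mixes units: minutes for movies and seasons for TV shows, which is why we split the dataframe by type before analysis.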
Here we perform descriptive statistics and data exploration: finding the datatypes, the number of missing values, and the counts of the different content types. We examine the dataset to understand its structure and identify the missing values.
# Descriptive Statistics and Data Exploration
print("Dataset Information:")
print(df.info())
print("\nThe number of missing values per column:")
print(df.isnull().sum())
print("\nCount of content types:")
print(df['type'].value_counts())
Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8709 non-null   datetime64[ns]
 7   release_year  8807 non-null   int64
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
 12  duration_num  8804 non-null   float64
dtypes: datetime64[ns](1), float64(1), int64(1), object(10)
memory usage: 894.6+ KB
None

The number of missing values per column:
show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        98
release_year       0
rating             4
duration           3
listed_in          0
description        0
duration_num       3
dtype: int64

Count of content types:
type
Movie      6131
TV Show    2676
Name: count, dtype: int64
Exploratory Data Analysis¶
Hypothesis Testing on the Content Type Distribution¶
We perform both a chi-square test and descriptive statistics to determine whether the Netflix catalog shows an equal representation of Movies and Shows, or if one is more common than the other.
Hypotheses:
- Null Hypothesis (H0): Movies and TV Shows are equally represented in the Netflix catalog, with an even 50/50 distribution.
- Alternative Hypothesis (HA): The content types are not equally represented, something other than the 50/50 distribution.
# Counts the occurrences of the movies and tv shows
observed = df['type'].value_counts()
expected = [len(df)/2, len(df)/2]
# Performs the chi-square test to compare the observed and expected frequencies
chi_square, p = chisquare(observed, f_exp=expected)
print("Chi square Test Results:")
print(f"Observed counts: Movies={observed.iloc[0]}, TV Shows={observed.iloc[1]}")
print(f"Chi square statistic: {chi_square:.2f}")
print(f"Chi square p-value: {p:.4f}")
# Plots the distribution of content types as a bar chart
plt.figure(figsize=(6,4))
observed.plot(kind='bar', color=['blue', 'green'])
plt.title('Distribution of Netflix Titles by Type')
plt.xticks(rotation=0)
plt.xlabel('Type')
plt.ylabel('Count')
plt.show()
Chi square Test Results: Observed counts: Movies=6131, TV Shows=2676 Chi square statistic: 1355.40 Chi square p-value: 0.0000
Dataset Overview and Content Type with Chi-Square Test:
The Netflix dataset that we imported from Kaggle has 8,807 entries listing shows, movies, the date they were added, the duration of the film, and more! Most fields are fully populated, but there are missing values in columns such as cast, director, and country. The dataset contains 6,131 movies, far more than the 2,676 TV shows. This large observed difference produces a very large chi-square statistic of 1355.40 and an extremely small p-value (reported as 0.0000). We therefore reject the null hypothesis and conclude that movies are significantly more represented than TV shows on Netflix! This imbalance may reflect viewer preferences or differences in production cost.
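As a sanity check on the arithmetic, the chi-square statistic is just the sum of (observed − expected)² / expected over the two categories, which can be reproduced from the counts alone:

```python
# Observed counts from the dataset; expected under a 50/50 split of 8,807 titles
observed = [6131, 2676]
expected = [8807 / 2, 8807 / 2]  # 4403.5 each

chi_sq = sum((o - e) ** 2 / e for o, e in zip(observed, expected))
print(round(chi_sq, 2))  # prints 1355.4, matching scipy's chisquare result
```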
Hypothesis Testing on Movie Duration¶
We perform a one-sample t-test on the movie durations to assess whether the average duration is significantly different from a hypothesized mean of 90 minutes.
Hypotheses:
- Null Hypothesis (H0): The average movie duration is 90 minutes.
- Alternative Hypothesis (HA): The average movie duration is not 90 minutes.
# Extracts the movie durations and removes missing values
movie_durations = movies['duration_num'].dropna()
# Computes the mean, standard deviation, and total number of movies
mean_duration = movie_durations.mean()
std_duration = movie_durations.std()
num_movies = movie_durations.count()
print("Movie Duration Statistics:")
print("Mean duration in minutes:", mean_duration)
print("The standard deviation is:", std_duration)
print("Total number of movies:", num_movies)
# Performs the one sample t-test
t_stat, p_value = ttest_1samp(movie_durations, 90)
print("\nOne-sample t test results:")
print("t-statistic:", t_stat)
print("p-value:", p_value)
Movie Duration Statistics: Mean duration in minutes: 99.57718668407311 The standard deviation is: 28.290593447417347 Total number of movies: 6128 One-sample t test results: t-statistic: 26.500573562871065 p-value: 1.342311985065777e-146
# Plots the histogram of movie durations
plt.figure(figsize=(8,5))
plt.hist(movie_durations, bins=30, color='red', edgecolor='black', alpha=0.7)
plt.axvline(mean_duration, color='black', linestyle='dashed', linewidth=2, label=f'Mean: {mean_duration:.1f} min')
plt.legend()
plt.xlabel('Duration (Minutes)')
plt.ylabel('Frequency')
plt.title('Histogram of Movie Durations')
plt.show()
Movie Duration Analysis: Across the 6,128 movies with valid length data, the average duration is approximately 99.58 minutes with a standard deviation of approximately 28.29 minutes. A one-sample t-test against the hypothesized mean of 90 minutes returned a t-statistic of 26.50 and an extremely small p-value (1.34e-146). Since the p-value is far below any reasonable significance level, we reject the null hypothesis and conclude that the average movie length differs from 90 minutes. Because the sample mean of 99.58 minutes is greater than 90, the result is in the right tail: Netflix movies are, on average, longer than 90 minutes!
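The t-statistic can likewise be reconstructed from the printed summary numbers alone, using t = (x̄ − μ₀) / (s / √n); the values below are rounded from the notebook's own output:

```python
import math

# Summary statistics printed above (rounded)
mean, std, n, mu0 = 99.577, 28.291, 6128, 90

# One-sample t-statistic: (sample mean - hypothesized mean) / standard error
t = (mean - mu0) / (std / math.sqrt(n))
print(round(t, 2))  # prints 26.5, matching scipy's ttest_1samp statistic
```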
Correlation Analysis: Release Year vs Movie Duration¶
Next, we explore whether there is any relationship between a movie's release year and its duration. We compute the Pearson correlation coefficient for this purpose.
Hypotheses:
- Null Hypothesis (H0): There is no correlation between release year and movie duration.
- Alternative Hypothesis (HA): There is a correlation between release year and movie duration.
# Prepare data for correlation analysis
movies_correlation = movies[['release_year', 'duration_num']].dropna()
# Calculates the Pearson correlation coefficient and the p-value
pearson_corr_coef, pearson_p_value = pearsonr(movies_correlation['release_year'], movies_correlation['duration_num'])
print("Correlation Analysis:")
print("Pearson correlation coefficient:", pearson_corr_coef)
print("p-value:", pearson_p_value)
Correlation Analysis: Pearson correlation coefficient: -0.20628474681965608 p-value: 7.017657140476846e-60
Here we create a scatter plot showing the relationship between release year and movie duration.
# Makes the scatter plot with accurate labels and description
plt.figure(figsize=(8,5))
plt.scatter(movies_correlation['release_year'], movies_correlation['duration_num'], alpha=0.5, color='purple')
plt.title('Scatter Plot: Release Year vs Movie Duration')
plt.xlabel('Release Year')
plt.ylabel('Movie Duration (minutes)')
plt.show()
Correlation Analysis: The Pearson correlation coefficient between a movie's release year and its duration is -0.206, with a p-value of 7.02e-60. The correlation is relatively weak, but the result is statistically significant: on average, more recent movies tend to have slightly shorter durations. The scatter plot shows a dense cluster of shorter durations in the bottom-right corner, confirming that the more recent movies are not as long. Around the 1940s there are some movies with shorter durations, with durations increasing through the 1960s-1990s! With 6,131 movies displayed, the plot also shows that the number of titles available on Netflix grows in more recent years!
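For readers unfamiliar with Pearson's r, it is the covariance of the two variables divided by the product of their standard deviations. A tiny illustration on made-up numbers (not the Netflix data; this toy series is nearly perfectly linear, so its r is close to −1, unlike the weak −0.206 above):

```python
import numpy as np

# Synthetic data mimicking the observed direction (newer years, shorter runtimes)
year = np.array([1960, 1975, 1990, 2005, 2020], dtype=float)
minutes = np.array([115, 110, 104, 98, 92], dtype=float)

# Pearson r = cov(x, y) / (sd_x * sd_y), using population normalization throughout
cov = np.mean((year - year.mean()) * (minutes - minutes.mean()))
r = cov / (year.std() * minutes.std())
print(round(r, 3))                                 # prints -0.999
print(round(np.corrcoef(year, minutes)[0, 1], 3))  # identical value from numpy
```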
With our original cleaned dataset of both movie and TV titles, we can do a deeper analysis to see which titles are suitable for children. Parents who let their children use Netflix want to make sure their kids are watching age-appropriate content, and using machine learning techniques, we can predict what is appropriate and what is not.
In this first cell, we do some preprocessing for our analysis and define our target. We define what content we believe is age appropriate for young children to watch by themselves (content rated G, TV-Y, or TV-Y7). After some more preprocessing, we also define the features we will use (release year, duration, cast count, description length) to predict whether or not a title has a kids rating.
# Define the binary target
kids_ratings = ["G", "TV-Y", "TV-Y7"]
df = df.copy()
# Drop missing ratings
df = df[df['rating'].notna()]
# Creates column that shows 1 if title has a kid rating and 0 if false
df['is_kids'] = df['rating'].isin(kids_ratings).astype(int)
# Computes the numeric features
df['duration_int'] = pd.to_numeric(df['duration'].str.extract(r'(\d+)').iloc[:,0], errors='coerce').fillna(0).astype(int)
df['cast_count'] = df['cast'].fillna('').apply(lambda x: len(x.split(',')))
df['desc_len'] = df['description'].str.len()
# Prepares X and y
# This creates column of features for predicting if content will have kid rating
feature_cols = ['release_year', 'duration_int', 'cast_count', 'desc_len']
X = df[feature_cols]
y = df['is_kids']
In this second cell, we split our data into training data and test data using an 80-20 split: 80% of our data is used for training and 20% for testing. Here we start to actually use our machine learning algorithms. We are using a Random Forest Classifier, a Gradient Boosting Classifier, and a Support-Vector Classifier.
The three different machine learning algorithms we use here have their own benefits:
Random-Forest Classifier: This algorithm aggregates the predictions of around 100 different decision trees, with the forest "voting" in borderline cases. This is helpful because kids programs share overlapping but not identical patterns across decades.
Gradient-Booster: Gradient boosting trains one tree at a time. Each tree learns from the mistakes of the last. It picks up on shifts in cast count or synopsis length that mark kids titles. The chain of learners often raises accuracy where classes overlap.
Support-Vector Classifier: This algorithm draws a boundary using release year, runtime, cast count and synopsis length. When a straight line cannot separate kids and adult titles, the kernel bends to fit the data. This lets it capture patterns such as brief runtimes and simple synopses for children’s programs. It performs well when there are few features but a clear gap between classes.
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
# Split while preserving class balance
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.20, random_state=42
)
# These are to instantiate the models
rf = RandomForestClassifier(n_estimators=100, random_state=42)
gb = GradientBoostingClassifier(random_state=42)
svm = SVC(kernel='rbf', probability=True, random_state=42)
# Now we fit each of the variables we just made
rf.fit(X_train, y_train)
gb.fit(X_train, y_train)
svm.fit(X_train, y_train)
SVC(probability=True, random_state=42)
from sklearn.metrics import accuracy_score, classification_report
# Loops over the list of models along with their names to find the accuracies!
for name, model in [('RandomForest', rf), ('GBM', gb), ('SVM', svm)]:
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"{name} accuracy: {acc:.3f}")
    print(classification_report(y_test, y_pred))
    print("-" * 40)
RandomForest accuracy: 0.922
precision recall f1-score support
0 0.94 0.98 0.96 1625
1 0.48 0.21 0.30 136
accuracy 0.92 1761
macro avg 0.71 0.60 0.63 1761
weighted avg 0.90 0.92 0.91 1761
----------------------------------------
GBM accuracy: 0.928
precision recall f1-score support
0 0.93 0.99 0.96 1625
1 0.67 0.13 0.22 136
accuracy 0.93 1761
macro avg 0.80 0.56 0.59 1761
weighted avg 0.91 0.93 0.90 1761
----------------------------------------
SVM accuracy: 0.923
precision recall f1-score support
0 0.92 1.00 0.96 1625
1 0.00 0.00 0.00 136
accuracy 0.92 1761
macro avg 0.46 0.50 0.48 1761
weighted avg 0.85 0.92 0.89 1761
----------------------------------------
/usr/local/lib/python3.11/dist-packages/sklearn/metrics/_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
_warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Although all three models report high overall accuracy (92-93%), accuracy alone is misleading because the "non-kids" titles dominate the test set (1,625 vs 136). Precision answers "Of the titles the model calls kids, how many really are kids?" Recall answers "Of all actual kids titles, how many did the model find?" The F1-score balances those two.
Random Forest: precision 0.48 and recall 0.21 mean it finds only about 1 in 5 kids titles, and more than half of its "kids" predictions are wrong.
Gradient Boosting: precision climbs to 0.67, so two-thirds of its kids flags are correct, but recall slips to 0.13—it catches only 1 in 8 kids titles.
SVM: precision and recall are both 0.00; it never predicts the minority class, coasting on class imbalance.
What we can tell from this is that in terms of precision (of the titles we predicted as kids, how many actually are), the gradient boosting algorithm performed best.
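These definitions can be verified directly from raw confusion counts. Using the Gradient Boosting counts reported in the confusion-matrix section of this notebook (18 true positives, 9 false positives, 118 false negatives for the kids class):

```python
# Precision/recall/F1 for the kids class, from this notebook's confusion counts
tp, fp, fn = 18, 9, 118

precision = tp / (tp + fp)  # of predicted kids titles, how many really are kids
recall = tp / (tp + fn)     # of actual kids titles, how many were found
f1 = 2 * precision * recall / (precision + recall)

print(round(precision, 2), round(recall, 2), round(f1, 2))  # prints 0.67 0.13 0.22
```

These reproduce the 0.67 precision, 0.13 recall, and 0.22 F1 in the classification report above.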
Visualization¶
In this section, we will use Receiver Operating Characteristic (ROC) curves, a confusion matrix, and a feature importance chart to evaluate how well the classification models distinguish between non-kids and kids content. We will compare multiple models and evaluate which one gives the best predictions!
import seaborn as sns
from sklearn.metrics import (
    roc_curve, auc,
    RocCurveDisplay,
    ConfusionMatrixDisplay, confusion_matrix
)
# Set the size of the figure for the ROC curve
plt.figure(figsize=(6, 6))
# Loop through each model to find the ROC curve
for name, clf in [("Random Forest", rf), ("Gradient Boosting", gb), ("SVM (RBF)", svm)]:
    y_prob = clf.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_prob)
    plt.plot(fpr, tpr,
             label=f"{name} (AUC = {auc(fpr, tpr):.3f})")
plt.plot([0, 1], [0, 1], "--", lw=1, color="grey")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve – Kids vs. Non-Kids")
plt.legend()
plt.tight_layout()
plt.show()
# Confusion Matrix
# Choosing the best performing model
best_model = gb
y_pred = best_model.predict(X_test)
# Makes predictions on the test set
ConfusionMatrixDisplay.from_predictions(
y_test, y_pred,
cmap="Blues",
display_labels=["Non-Kids", "Kids"]
)
plt.title("Confusion Matrix – Gradient Boosting (threshold 0.5)")
plt.tight_layout()
plt.show()
feat_imp = pd.Series(gb.feature_importances_, index=feature_cols)
sns.barplot(x=feat_imp.values, y=feat_imp.index)
plt.title("Gradient Boosting – Feature Importance")
plt.xlabel("Relative importance")
plt.ylabel("Features")
plt.tight_layout()
plt.show()
ROC curve: Each colored line traces how a model’s true-positive rate (detects real kids titles) rises as we allow a higher false-positive rate (adult titles wrongly flagged as kids).
The AUC that you see stands for area under the curve.
A model that can't tell the difference, or guesses completely at random, would follow the dotted 45-degree line (AUC = 0.50).
Our Gradient-Boosting line is the furthest toward the top-left and has the best AUC = 0.843. This means it is the most reliable separator.
Random Forest is close behind (AUC = 0.798).
SVM barely beats the dotted line (AUC = 0.579), confirming it struggles with these features.
Confusion matrix: Keeping Gradient Boosting's threshold at 0.50, we can see the raw counts it gets right and wrong.
1616 TN – adult titles correctly left alone.
9 FP – adult titles wrongly shown as kids (very low, < 1%).
18 TP – kids titles correctly found.
118 FN – kids titles the model misses (about 87% of them).
So, at this threshold the model is extremely cautious (hardly ever lets an adult show through as a kids title) but also sacrifices recall, finding only 1 in 8 real kids titles.
Feature-importance bar chart: This tells us why Gradient Boosting behaves the way it does.
Runtime (duration_int) supplies around 64% of the signal: short films and low-season counts strongly hint “kids.”
Cast size adds around 23%: children’s shows list fewer lead actors.
Release year ( around 6%) and description length (around 4%) contribute only marginally.
Heavy reliance on a single obvious cue (runtime) explains why the model misses many edge-case kids titles (such as longer animated features).
After looking at the different visualizations, Gradient Boosting turns out to be our sharpest tool because it is better than the other models at telling kids' shows from everything else. However, it still lets most children's titles slip through the cracks because we are only feeding it a handful of simple facts like runtime and cast size. If your main worry is avoiding the embarrassment of tagging an adult show as "kids," our model already does that job well. But if you also want to find more of the kids' catalog, you will need to give the model a richer diet, such as keywords from the description, or a tweak that makes it pay extra attention to the smaller kids class. As it stands, the model does a good job of ensuring a young child wouldn't accidentally be shown an adult show or movie.
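One concrete form that "extra attention to the smaller kids class" could take is lowering the decision threshold: instead of the default 0.50 cutoff on the predicted probability, flag a title as kids at a lower value such as 0.20, trading precision for recall. A minimal sketch on synthetic imbalanced data (the 0.20 cutoff here is a hypothetical choice, not something tuned on the real catalogue):

```python
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split

rng = np.random.default_rng(0)
# Synthetic stand-in for the notebook's features: a noisy minority class (~15%)
# so the classes overlap, mimicking the kids/non-kids imbalance
X = rng.normal(size=(2000, 4))
y = (X[:, 1] + rng.normal(scale=1.0, size=2000) > 1.5).astype(int)

X_tr, X_te, y_tr, y_te = train_test_split(X, y, stratify=y, test_size=0.3, random_state=0)
clf = GradientBoostingClassifier(random_state=0).fit(X_tr, y_tr)
proba = clf.predict_proba(X_te)[:, 1]

recalls = {}
for threshold in (0.5, 0.2):  # default cutoff vs a hypothetical lower one
    pred = (proba >= threshold).astype(int)
    tp = ((pred == 1) & (y_te == 1)).sum()
    recalls[threshold] = tp / y_te.sum()
    print(f"threshold={threshold}: recall={recalls[threshold]:.2f}")
```

Because lowering the threshold can only add positive predictions, recall at 0.2 is never worse than at 0.5; the cost shows up as extra false positives, so in practice the cutoff would be chosen from a precision-recall trade-off on held-out data.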
Insights and Conclusion¶
The project provides a clear and accessible explanation of the classification task, distinguishing kids’ content from adult content on Netflix based solely on metadata (e.g., genre, duration, description). It defines the problem, outlines the data preparation steps, describes the model training process, and explains how decisions are made. Even without prior machine learning knowledge, the walkthrough offers enough context to understand what’s being done and why.
Even for readers familiar with classification models, this project offers fresh insights. The feature engineering is particularly thoughtful, using metadata like description length and cast size to capture patterns unique to kids’ content. It also highlights the real-world challenge of class imbalance, as shown by high overall accuracy but poor recall for the minority class. The comparison across Random Forest, Gradient Boosting, and SVM shows how model choice affects performance, especially under imbalance. Notably, the project demonstrates that metadata alone, without text or video analysis, can yield strong results. This streamlined, interpretable approach serves as a useful blueprint for building lightweight, effective classifiers. Overall, it reinforces best practices while offering practical lessons for improving content-based filtering models.
Related Links¶
If you want to do more research with data science or learn more about this issue, please look at the following links below:
These links further explain the core analytical methods we applied during our project to analyze and interpret the Netflix content data.