- 18th Aug 2022
- 06:03 am
---
title: "Recommender"
output: html_document
date: '2022-06-16'
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
## R Markdown
```{r message=FALSE, warning=FALSE}
library(tidyverse)
library(caret)
library(stats)
library(mltools)
library(Metrics)
library(data.table)
library(tools)
library(stringi)
library(textclean)
library(fastDummies)
library(shiny)
library(cluster)
library(bslib)
library(knitr)
```
```{r results='asis'}
# Load the Netflix catalogue and preview the first rows of its first six columns.
netflix <- read.csv("netflix_titles.csv")
kable(head(netflix[, 1:6]))
```
```{r}
# Histogram of release years; the y-axis is limited to the range 0-200.
release_year <- netflix$release_year
hist(release_year, ylim = c(0, 200), col = "lightblue")
```
```{r}
# Bar plot of the share (in per cent) of each content type in a random
# sample of 100 titles.
x <- factor(sample(netflix$type, 100))
t <- table(x)
barplot(prop.table(t) * 100, ylab = "Per cent")
```
```{r}
# Keep only the first-listed name from a comma-separated field and replace
# an empty entry with `placeholder`.
first_listed <- function(people, placeholder) {
  lead <- str_split(people, ",", simplify = TRUE)[, 1]
  ifelse(lead == "", placeholder, lead)
}
# Main (first-listed) director of every title
netflix_director <- data.frame(
  lead_director = first_listed(netflix$director, "No Director")
)
# Lead (first-listed) cast member of every title
netflix_lead <- data.frame(
  cast_lead = first_listed(netflix$cast, "Unknown Cast")
)
# Split the genre list on commas and collect the distinct values found in
# each of the first three positions, to see which genres occur in the data.
# NOTE(review): the split is on "," only, so if genres are separated by ", "
# the values from positions 2 and 3 keep a leading space -- confirm.
genre_parts <- str_split(netflix$listed_in, ",", simplify = TRUE)
unique_genres <- c(
  unique(genre_parts[, 1]),
  unique(genre_parts[, 2]),
  unique(genre_parts[, 3])
)
# Flag titles whose genre labels mark them as non-English content.
# A title labelled "British TV Shows" is never flagged, even if it also
# carries one of the international / anime / Korean / Spanish-language labels.
# The patterns are plain substring matches: the former "(.+/s)?" wrappers were
# optional groups that could always match the empty string, so dropping them
# does not change which rows match.
netflix_english_content <- netflix %>%
  select(title, listed_in) %>%
  mutate(
    non_english_content =
      !str_detect(listed_in, "British TV Shows") &
        str_detect(
          listed_in,
          "International TV Shows|International Movies|Anime Series|Anime Features|Korean TV Shows|Spanish-Language TV Shows"
        )
  )
# Flag titles whose country field mentions at least one of the listed
# English-speaking countries.
netflix_english_nation <- netflix %>%
  select(title, country) %>%
  mutate(
    in_english_nation = str_detect(
      country,
      "United States|United Kingdom|Canada|Australia|New Zealand|Ireland|Jamaica|Barbados"
    )
  )
# Figure out which content is English and which is not.
# The decision is based on the title, the genre labels and the country.
# Classify a single title as "Likely English" or "Likely Not English".
# The title is split into characters and every character that is not in the
# allowed set (ASCII letters, digits, space and a few punctuation marks) is
# counted; two or more such characters mean "Likely Not English".
# Only the first element of `string` is inspected.
is_not_english <- function(string) {
  allowed_char <- "[A-Za-z0-9 \\*\\!\\(\\):,&\\@\\'\\%\\.\\?\\%\\-]"
  title_chars <- str_split(string, boundary("character"))[[1]]
  # Number of characters outside the allowed set
  n_foreign <- sum(!str_detect(title_chars, allowed_char))
  if (n_foreign >= 2) {
    "Likely Not English"
  } else {
    "Likely English"
  }
}
# Classify every title with is_not_english() and put the result in front of
# the original columns as `is_english`.
netflix_cleaning <- cbind(
  data.frame(is_english = map_chr(netflix$title, is_not_english)),
  netflix
)
# Decide whether each title is assumed to be English.
# Three signals are combined:
#   non_english_content - genre labels mark the title as non-English
#   in_english_nation   - country field mentions an English-speaking country
#   is_english          - title text classified "Likely English" / "Likely Not English"
# A title is assumed English ("yes") only when it is NOT flagged as non-English
# content, comes from an English-speaking country AND has a "Likely English"
# title. Every other combination is "no"; a missing signal also gives "no".
english_check <- cbind(
  netflix_english_nation %>% select(in_english_nation),
  netflix_english_content %>% select(non_english_content),
  netflix_cleaning
)
final_english_check <- english_check %>%
  select(title, non_english_content, is_english, in_english_nation, cast, director, description) %>%
  mutate(
    assume_english = ifelse(
      coalesce(
        !non_english_content & in_english_nation & is_english == "Likely English",
        FALSE
      ),
      "yes",
      "no"
    )
  )
# Kept as a one-column data frame, as before
english_check <- as.data.frame(final_english_check$assume_english)
# Attach the decision to the full data set under the name `is_english`
netflix_english <- cbind(netflix, is_english = final_english_check$assume_english)
# Flag modern content: anything released in 2000 or later counts as modern.
netflix_modern_english <- netflix_english %>%
  mutate(is_modern = ifelse(release_year > 1999, 1, 0))
# Combine lead director, cast lead and the English flag into one data frame,
# encode the content type as is_movie (1 = "Movie", 0 = anything else) and keep
# only the columns used further on. Columns such as cast, director, date_added,
# release_year and duration are dropped simply by not being selected.
netflix_combine <- netflix_modern_english %>%
  mutate(
    lead_director = netflix_director$lead_director,
    cast_lead = netflix_lead$cast_lead,
    is_movie = ifelse(type == "Movie", 1, 0)
  ) %>%
  select(
    show_id, title, is_modern, is_movie, lead_director, cast_lead,
    country, is_english, rating, listed_in, description
  )
# Turn the genre list into one 0/1 indicator column per genre group.
# NOTE: groups can overlap, e.g. `international` covers anything that is not
# American content.
# Patterns are plain substring matches against `listed_in`.
# NOTE(review): three patterns were loosened because the public
# netflix_titles.csv appears to spell these labels "Sports Movies",
# "Stand-Up Comedy" and "Kids' TV"; the previous "Sport Movies",
# "Stand-up Comedy" and "Kid's TV" would not match those spellings. The new
# patterns still match the old spellings too -- verify against
# unique(netflix$listed_in).
has_genre <- function(genres, pattern) {
  # 1 when the pattern occurs in the genre list, 0 otherwise
  as.numeric(str_detect(genres, pattern))
}
netflix_combine <- netflix_combine %>%
  mutate(
    international = has_genre(listed_in, "International TV Shows|International Movies|British TV Shows|Spanish-Language TV Shows|Korean TV Shows"),
    drama = has_genre(listed_in, "Dramas|TV Dramas"),
    horror = has_genre(listed_in, "Horror Movies|TV Horror"),
    action_adventure = has_genre(listed_in, "Action & Adventure|TV Action & Adventure"),
    crime = has_genre(listed_in, "Crime TV Shows"),
    docu = has_genre(listed_in, "Documentaries|Docuseries|Science & Nature TV"),
    # "Stand-[Uu]p Comedy" also covers "Stand-Up Comedy & Talk Shows"
    comedy = has_genre(listed_in, "Comedies|TV Comedies|Stand-[Uu]p Comedy"),
    anime = has_genre(listed_in, "Anime Features|Anime Series"),
    independent = has_genre(listed_in, "Independent Movies"),
    sports = has_genre(listed_in, "Sports? Movies"),
    reality = has_genre(listed_in, "Reality TV"),
    sci_fi = has_genre(listed_in, "TV Sci-Fi & Fantasy|Sci-Fi & Fantasy"),
    # "Kid'?s'? TV" matches "Kid's TV" as well as "Kids' TV"
    family = has_genre(listed_in, "Kid'?s'? TV|Children & Family Movies|Teen TV Shows|Faith & Spirituality"),
    classic = has_genre(listed_in, "Classic Movies|Cult Movies|Classic & Cult TV"),
    thriller_mystery = has_genre(listed_in, "Thrillers|TV Thrillers|TV Mysteries"),
    musical = has_genre(listed_in, "Music & Musicals"),
    romantic = has_genre(listed_in, "Romantic TV Shows|Romantic Movies|LGBTQ Movies")
  ) %>%
  select(-listed_in, -country)
# Same idea for the content rating: one 0/1 indicator column per rating.
# "NR" and "UR" share the not_rated column; "TV-Y7" and "TV-Y7-FV" share tv_y7.
netflix_combine <- netflix_combine %>%
  mutate(
    tv_ma = as.numeric(rating == "TV-MA"),
    r_rated = as.numeric(rating == "R"),
    pg_13 = as.numeric(rating == "PG-13"),
    tv_14 = as.numeric(rating == "TV-14"),
    tv_pg = as.numeric(rating == "TV-PG"),
    not_rated = as.numeric(rating == "NR" | rating == "UR"),
    tv_g = as.numeric(rating == "TV-G"),
    tv_y = as.numeric(rating == "TV-Y"),
    tv_y7 = as.numeric(rating == "TV-Y7" | rating == "TV-Y7-FV"),
    pg = as.numeric(rating == "PG"),
    g_rated = as.numeric(rating == "G"),
    nc_17 = as.numeric(rating == "NC-17")
  ) %>%
  select(-rating)
# Encode is_english as 0 ("no") / 1 (anything else) for the distance metric
# and give missing director / cast-lead entries an explicit label.
netflix_combine <- netflix_combine %>%
  mutate(
    is_english = as.numeric(is_english != "no"),
    lead_director = replace(lead_director, is.na(lead_director), "Unknown/No Director"),
    cast_lead = replace(cast_lead, is.na(cast_lead), "Unknown/No Lead")
  )
# HOW THE END POINT SHOULD WORK
# Goal: take a show as input and return some number of movies or shows that
# are similar to it.
# The output should later be adjustable so that it only contains what is
# wanted: genres (multi-select?), movie vs. TV show vs. either, and the number
# of recommendations.
# FIRST STEP: store the categorical text columns as factors
netflix_combine <- netflix_combine %>%
  mutate(across(c(cast_lead, lead_director), as.factor))
# SECOND STEP: expand cast_lead into one dummy column per level
netflix_combine_for_knn <- fastDummies::dummy_cols(
  netflix_combine,
  select_columns = c("cast_lead"),
  remove_selected_columns = TRUE
)
# THIRD STEP: build the feature table (row names = show_id) and the pairwise
# distance matrix computed from it with the "binary" method
netflix_for_matrix <- netflix_combine_for_knn %>%
  select(-description, -title, -lead_director, -is_movie)
rownames(netflix_for_matrix) <- netflix_for_matrix$show_id
netflix_for_matrix$show_id <- NULL
netflix_matrix <- as.matrix(dist(netflix_for_matrix, method = "binary"))
# FOURTH STEP: Create a function to be used to generate choices
# Recommend the titles closest to `title` in a precomputed distance matrix.
#
# Arguments:
#   title          Exact title to look up in reference_data$title.
#   data           Data frame with one row per show; row i must describe the
#                  same show as column i of `matrix` and row i of
#                  `reference_data`. Only its number of rows is used.
#   matrix         Distance matrix whose row and column names are show ids.
#   k              Number of recommendations to return.
#   reference_data Data frame with the columns show_id, title, description,
#                  type, country and listed_in.
#
# Returns a data frame with the columns id, content_title, description, metric,
# type, country and genres, sorted by increasing distance and holding at most
# `k` rows. Rows whose title equals `title` are excluded.
# Stops with an error when the title is unknown; when several shows share the
# title, the first one is used and a warning is raised.
new_recommendation <- function(title, data, matrix, k, reference_data) {
  stopifnot(
    is.numeric(k), length(k) == 1, !is.na(k), k >= 0,
    nrow(reference_data) == nrow(data),
    !is.null(colnames(matrix))
  )
  # translate the title to show_id
  show_id <- reference_data$show_id[which(reference_data$title == title)]
  if (length(show_id) == 0) {
    stop("No show titled \"", title, "\" in reference_data", call. = FALSE)
  }
  if (length(show_id) > 1) {
    warning(
      "Several shows are titled \"", title, "\"; using ", show_id[1],
      call. = FALSE
    )
    show_id <- show_id[1]
  }
  if (!show_id %in% rownames(matrix)) {
    stop("Show id ", show_id, " is not a row of the distance matrix", call. = FALSE)
  }
  # distance from the requested show to every show, next to its metadata
  show_index <- seq_len(nrow(data))
  choices <- data.frame(
    id = colnames(matrix)[show_index],
    content_title = reference_data$title,
    description = reference_data$description,
    # unname() so the show ids are not picked up as row names
    metric = unname(matrix[show_id, show_index]),
    type = reference_data$type,
    country = reference_data$country,
    genres = reference_data$listed_in,
    stringsAsFactors = FALSE
  )
  # closest first (ties keep their original order), without the show itself
  choices <- choices[order(choices$metric), , drop = FALSE]
  choices <- choices[which(choices$content_title != title), , drop = FALSE]
  rownames(choices) <- NULL
  # exactly k rows, or fewer when there are not enough candidates
  head(choices, k)
}
```
```{r}
# Example: list the titles recommended for "Lucifer" with k = 10.
lucifer_matches <- new_recommendation(
  title = "Lucifer",
  data = netflix_for_matrix,
  matrix = netflix_matrix,
  k = 10,
  reference_data = netflix
)
lucifer_matches$content_title
```