Jessica Ayers 2023-06-22
The packages needed for analysis of the Open Movie Database API are:
httr
tidyverse
jsonlite
GGally
library(httr)
library(jsonlite)
library(tidyverse)
library(GGally)
The API used was the Open Movie Database API. Two different types of data were returned based on the parameters used in the link. With the function below, a list of information corresponding to one random movie will be returned along with a data frame of 10 random movies with more limited information. The data frame will be used to do the exploratory data analysis. For this API, movies with different colors in the titles will be returned for three different release years. The 6 choices are:
- `s` choices include blue, red and green
  - `s` searches for movie titles with the provided word
- `y` choices include 2000, 2010 and 2020
  - `y` stands for year of release

There are 9 total combinations of possible modifications.
#' Query the Open Movie Database (OMDb) API for titles containing a color word
#'
#' Sends two requests for the same word/year pair: a title search (`s=` URL
#' parameter), returned as a tibble, and a single-title lookup (`t=` URL
#' parameter), returned as the parsed JSON list.
#'
#' @param s Word to look for in titles; one of "blue", "red" or "green".
#' @param y Release year; one of 2000, 2010 or 2020.
#' @return A list with two elements: `RandomMovie`, the parsed response of
#'   the title lookup, and `RandomDF`, a tibble with the columns `Title`,
#'   `Year`, `imdbID` and `Type` taken from the search results.
#' @details Stops with "invalid s or y argument" for any other input, and
#'   with a descriptive error when the search response has no `Search` entry.
contactFunction <- function(s, y) {
  validWords <- c("blue", "red", "green")
  validYears <- c(2000, 2010, 2020)

  # Check length and NA first so the `||` chain only ever sees scalar,
  # non-missing conditions.
  isScalar <- length(s) == 1 && length(y) == 1
  if (!isScalar || is.na(s) || is.na(y) ||
      !(s %in% validWords) || !(y %in% validYears)) {
    stop("invalid s or y argument")
  }

  # The key can be overridden through the OMDB_API_KEY environment variable;
  # when it is unset or empty the key originally embedded here is used.
  apiKey <- Sys.getenv("OMDB_API_KEY")
  if (!nzchar(apiKey)) {
    apiKey <- "ca153ce0"
  }

  # Build the request URL for one endpoint parameter ("s" or "t"), fetch it
  # and parse the JSON body.
  fetchParsed <- function(param) {
    url <- sprintf(
      "http://www.omdbapi.com/?%s=%s&y=%s&apikey=%s",
      param, s, y, apiKey
    )
    response <- GET(url)
    fromJSON(rawToChar(response$content))
  }

  searchParsed <- fetchParsed("s")
  hits <- searchParsed$Search
  if (is.null(hits)) {
    # Without this guard the failure would surface later as an obscure
    # column-construction error.
    stop("OMDb search returned no results: ", searchParsed$Error)
  }

  searchDF <- as_tibble(data.frame(
    Title = hits$Title,
    Year = hits$Year,
    imdbID = hits$imdbID,
    Type = hits$Type,
    stringsAsFactors = FALSE
  ))

  randomData <- fetchParsed("t")

  list(RandomMovie = randomData, RandomDF = searchDF)
}
Some examples of random movie titles returned from the above function are:
# Title-lookup record returned for the word "blue" and the year 2010
contactFunction(s = "blue", y = 2010)[["RandomMovie"]]
## $Title
## [1] "Blue Valentine"
##
## $Year
## [1] "2010"
##
## $Rated
## [1] "R"
##
## $Released
## [1] "28 Jan 2011"
##
## $Runtime
## [1] "112 min"
##
## $Genre
## [1] "Drama, Romance"
##
## $Director
## [1] "Derek Cianfrance"
##
## $Writer
## [1] "Derek Cianfrance, Joey Curtis, Cami Delavigne"
##
## $Actors
## [1] "Ryan Gosling, Michelle Williams, John Doman"
##
## $Plot
## [1] "The relationship of a contemporary married couple, charting their evolution over a span of years by cross-cutting between time periods."
##
## $Language
## [1] "English"
##
## $Country
## [1] "United States"
##
## $Awards
## [1] "Nominated for 1 Oscar. 9 wins & 56 nominations total"
##
## $Poster
## [1] "https://m.media-amazon.com/images/M/MV5BMTU4MTQ2MzA1Ml5BMl5BanBnXkFtZTcwODE3NTgwNA@@._V1_SX300.jpg"
##
## $Ratings
## Source Value
## 1 Internet Movie Database 7.3/10
## 2 Rotten Tomatoes 86%
## 3 Metacritic 81/100
##
## $Metascore
## [1] "81"
##
## $imdbRating
## [1] "7.3"
##
## $imdbVotes
## [1] "203,428"
##
## $imdbID
## [1] "tt1120985"
##
## $Type
## [1] "movie"
##
## $DVD
## [1] "10 May 2011"
##
## $BoxOffice
## [1] "$9,706,328"
##
## $Production
## [1] "N/A"
##
## $Website
## [1] "N/A"
##
## $Response
## [1] "True"
# Title-lookup record returned for the word "red" and the year 2010
contactFunction(s = "red", y = 2010)[["RandomMovie"]]
## $Title
## [1] "RED"
##
## $Year
## [1] "2010"
##
## $Rated
## [1] "PG-13"
##
## $Released
## [1] "15 Oct 2010"
##
## $Runtime
## [1] "111 min"
##
## $Genre
## [1] "Action, Comedy, Crime"
##
## $Director
## [1] "Robert Schwentke"
##
## $Writer
## [1] "Jon Hoeber, Erich Hoeber, Warren Ellis"
##
## $Actors
## [1] "Bruce Willis, Helen Mirren, Morgan Freeman"
##
## $Plot
## [1] "When his peaceful life is threatened by a high-tech assassin, former black-ops agent Frank Moses reassembles his old team in a last-ditch effort to survive and uncover his assailants."
##
## $Language
## [1] "English, Russian"
##
## $Country
## [1] "United States, China"
##
## $Awards
## [1] "4 wins & 19 nominations"
##
## $Poster
## [1] "https://m.media-amazon.com/images/M/MV5BMzg2Mjg1OTk0NF5BMl5BanBnXkFtZTcwMjQ4MTA3Mw@@._V1_SX300.jpg"
##
## $Ratings
## Source Value
## 1 Internet Movie Database 7.0/10
## 2 Rotten Tomatoes 72%
## 3 Metacritic 60/100
##
## $Metascore
## [1] "60"
##
## $imdbRating
## [1] "7.0"
##
## $imdbVotes
## [1] "315,498"
##
## $imdbID
## [1] "tt1245526"
##
## $Type
## [1] "movie"
##
## $DVD
## [1] "25 Jan 2011"
##
## $BoxOffice
## [1] "$90,380,162"
##
## $Production
## [1] "N/A"
##
## $Website
## [1] "N/A"
##
## $Response
## [1] "True"
# Title-lookup record returned for the word "green" and the year 2010
contactFunction(s = "green", y = 2010)[["RandomMovie"]]
## $Title
## [1] "Green"
##
## $Year
## [1] "2010"
##
## $Rated
## [1] "Not Rated"
##
## $Released
## [1] "25 Sep 2010"
##
## $Runtime
## [1] "80 min"
##
## $Genre
## [1] "Comedy"
##
## $Director
## [1] "Nick Gregorio"
##
## $Writer
## [1] "Nick Gregorio, Troy Kaplan"
##
## $Actors
## [1] "Nick Gregorio, Michelle Nunes, Danny Myers"
##
## $Plot
## [1] "An aspiring businessman moves to marijuana-friendly California with high hopes of opening a government legal dispensary and seeing the \"green\" roll in. But a moratorium placed on seller's licenses complicate matters."
##
## $Language
## [1] "English"
##
## $Country
## [1] "United States"
##
## $Awards
## [1] "N/A"
##
## $Poster
## [1] "https://m.media-amazon.com/images/M/MV5BMjI1ODEzMzY5MV5BMl5BanBnXkFtZTcwODg2MTM1Nw@@._V1_SX300.jpg"
##
## $Ratings
## Source Value
## 1 Internet Movie Database 7.9/10
##
## $Metascore
## [1] "N/A"
##
## $imdbRating
## [1] "7.9"
##
## $imdbVotes
## [1] "36"
##
## $imdbID
## [1] "tt1844673"
##
## $Type
## [1] "movie"
##
## $DVD
## [1] "11 Apr 2013"
##
## $BoxOffice
## [1] "N/A"
##
## $Production
## [1] "N/A"
##
## $Website
## [1] "N/A"
##
## $Response
## [1] "True"
Examples of data frames that are returned from the above function:
# Search-result tibble returned for the word "blue" and the year 2010
contactFunction(s = "blue", y = 2010)[["RandomDF"]]
## # A tibble: 10 × 4
## Title Year imdbID Type
## <chr> <chr> <chr> <chr>
## 1 Blue Valentine 2010 tt1120985 movie
## 2 Blue Mountain State 2010–2011 tt1344204 series
## 3 Blue Bloods 2010– tt1595859 series
## 4 Rookie Blue 2010–2015 tt1442065 series
## 5 Red White & Blue 2010 tt1465505 movie
## 6 Behind Blue Skies 2010 tt1510918 movie
## 7 Lady Blue Shanghai 2010 tt1712185 movie
## 8 Red vs. Blue: Revelation 2010 tt1789886 movie
## 9 Beneath the Blue 2010 tt1222698 movie
## 10 Shocking Blue 2010 tt1307010 movie
# Search-result tibble returned for the word "red" and the year 2010
contactFunction(s = "red", y = 2010)[["RandomDF"]]
## # A tibble: 10 × 4
## Title Year imdbID Type
## <chr> <chr> <chr> <chr>
## 1 RED 2010 tt1245526 movie
## 2 Batman: Under the Red Hood 2010 tt1569923 movie
## 3 Red Dead Redemption 2010 tt1479962 game
## 4 Red Hill 2010 tt1530983 movie
## 5 Red White & Blue 2010 tt1465505 movie
## 6 Red Dead Redemption: Undead Nightmare 2010 tt1806999 game
## 7 Red: Werewolf Hunter 2010 tt1626201 movie
## 8 Red vs. Blue: Revelation 2010 tt1789886 movie
## 9 Red Eagle 2010 tt1743711 movie
## 10 Red Nights 2010 tt1401668 movie
# Search-result tibble returned for the word "green" and the year 2010
contactFunction(s = "green", y = 2010)[["RandomDF"]]
## # A tibble: 10 × 4
## Title Year imdbID Type
## <chr> <chr> <chr> <chr>
## 1 Green Zone 2010 tt0947810 movie
## 2 Green Arrow 2010 tt1663633 movie
## 3 The Green Room with Paul Provenza 2010–2011 tt1546139 series
## 4 The Green Wave 2010 tt1667130 movie
## 5 Anne of Green Gables: Road to Green Gables 2010 tt2555422 movie
## 6 The Green Lantern 2010 tt2283864 movie
## 7 Inside 'The Green Zone' 2010 tt1706606 movie
## 8 Green Days 2010 tt2041470 movie
## 9 Green Crayons 2010 tt1699129 movie
## 10 Green 2010 tt1844673 movie
An exploratory data analysis was done using three endpoints. Three data frames are returned for movie titles containing the word “red” for years 2000, 2010, and 2020. A combined data frame was created and printed.
# Search results for titles containing "red", one tibble per release year
redData1 <- contactFunction(s = "red", y = 2000)$RandomDF
redData2 <- contactFunction(s = "red", y = 2010)$RandomDF
redData3 <- contactFunction(s = "red", y = 2020)$RandomDF
# Stack the three tibbles row-wise. bind_rows() appends rows as-is, whereas a
# join on every shared column prints a "Joining with `by = ...`" message and
# would merge rows that happen to be identical across inputs.
half <- bind_rows(redData1, redData2)
edaData <- bind_rows(half, redData3)
edaData
## # A tibble: 30 × 4
## Title Year imdbID Type
## <chr> <chr> <chr> <chr>
## 1 Red Planet 2000 tt0199753 movie
## 2 Clifford the Big Red Dog 2000–2003 tt0233041 series
## 3 Command & Conquer: Red Alert 2 2000 tt0252338 game
## 4 Agent Red 2000 tt0218080 movie
## 5 Red Dirt 2000 tt0160749 movie
## 6 Red Hot Chili Peppers: Californication 2000 tt6720730 movie
## 7 Red Letters 2000 tt0217758 movie
## 8 Red Room 2 2000 tt0371945 movie
## 9 Rocket's Red Glare 2000 tt0219279 movie
## 10 Red Hot Chili Peppers: Otherside 2000 tt6720738 movie
## # ℹ 20 more rows
A new character variable was created to determine if a movie was a sequel or not. In addition, since all character data was returned from the API, a numeric variable was created from the length of each program title. An updated tibble was printed.
# Create new variables
# Sequel: "Yes" when the title contains the character "2", otherwise "No".
# grepl() is vectorised, so every row is handled without a loop or a
# hard-coded row count.
edaData$Sequel <- ifelse(grepl("2", edaData$Title, fixed = TRUE), "Yes", "No")
# lengthOfTitle: number of characters in each title; nchar() is vectorised.
# Converted to double so the column type matches the printed output below.
edaData$lengthOfTitle <- as.numeric(nchar(edaData$Title))
edaData
## # A tibble: 30 × 6
## Title Year imdbID Type Sequel lengthOfTitle
## <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 Red Planet 2000 tt0199… movie No 10
## 2 Clifford the Big Red Dog 2000–2003 tt0233… seri… No 24
## 3 Command & Conquer: Red Alert 2 2000 tt0252… game Yes 30
## 4 Agent Red 2000 tt0218… movie No 9
## 5 Red Dirt 2000 tt0160… movie No 8
## 6 Red Hot Chili Peppers: Californication 2000 tt6720… movie No 38
## 7 Red Letters 2000 tt0217… movie No 11
## 8 Red Room 2 2000 tt0371… movie Yes 10
## 9 Rocket's Red Glare 2000 tt0219… movie No 18
## 10 Red Hot Chili Peppers: Otherside 2000 tt6720… movie No 32
## # ℹ 20 more rows
A one-way contingency table was created for each qualitative variable.
# One-way frequency table of the Year values
table(edaData[["Year"]])
##
## 2000 2000–2003 2010 2020 2020–2021 2020–2022
## 9 1 10 8 1 1
# One-way frequency table of the Type values
table(edaData[["Type"]])
##
## game movie series
## 3 24 3
# One-way frequency table of the Sequel values
table(edaData[["Sequel"]])
##
## No Yes
## 28 2
From this data, two movies were sequels. There were 3 games, 24 movies, and 3 series.
A two-way contingency table was created for Type and Year.
# Two-way table: Type in rows, Year in columns
table(edaData[["Type"]], edaData[["Year"]])
##
## 2000 2000–2003 2010 2020 2020–2021 2020–2022
## game 1 0 2 0 0 0
## movie 8 0 8 8 0 0
## series 0 1 0 0 1 1
There was 1 game released in 2000, 2 in 2010, and 0 in 2020.
A three-way table was created for Type, Year, and Sequel.
# Three-way table: Type by Year, one slice per Sequel value
table(edaData[["Type"]], edaData[["Year"]], edaData[["Sequel"]])
## , , = No
##
##
## 2000 2000–2003 2010 2020 2020–2021 2020–2022
## game 0 0 2 0 0 0
## movie 7 0 8 8 0 0
## series 0 1 0 0 1 1
##
## , , = Yes
##
##
## 2000 2000–2003 2010 2020 2020–2021 2020–2022
## game 1 0 0 0 0 0
## movie 1 0 0 0 0 0
## series 0 0 0 0 0 0
For programs that are sequels, there was 1 game and 1 movie, both released in 2000.
Next, numerical summaries were created for the lengthOfTitle quantitative variable.
# Overall numerical summaries of lengthOfTitle (all rows, no grouping)
with(edaData, mean(lengthOfTitle))
## [1] 17.33333
with(edaData, median(lengthOfTitle))
## [1] 14.5
with(edaData, sd(lengthOfTitle))
## [1] 9.46257
with(edaData, var(lengthOfTitle))
## [1] 89.54023
with(edaData, IQR(lengthOfTitle))
## [1] 14
The mean length of title from this data was 17.3333333. The median length of title was 14.5. More summary statistics are given above.
The mean, median, and variance of length of title were then found for each grouping of Year, Type, and Sequel.
# Mean, median and variance of title length within each Year value
summarise(
  group_by(edaData, Year),
  avg = mean(lengthOfTitle),
  med = median(lengthOfTitle),
  var = var(lengthOfTitle)
)
## # A tibble: 6 × 4
## Year avg med var
## <chr> <dbl> <dbl> <dbl>
## 1 2000 18.4 11 137.
## 2 2000–2003 24 24 NA
## 3 2010 17.2 17.5 104.
## 4 2020 17.1 15 59.0
## 5 2020–2021 9 9 NA
## 6 2020–2022 12 12 NA
# Mean, median and variance of title length within each Type value
summarise(
  group_by(edaData, Type),
  avg = mean(lengthOfTitle),
  med = median(lengthOfTitle),
  var = var(lengthOfTitle)
)
## # A tibble: 3 × 4
## Type avg med var
## <chr> <dbl> <dbl> <dbl>
## 1 game 28.7 30 82.3
## 2 movie 16.2 13 81.5
## 3 series 15 12 63
# Mean, median and variance of title length within each Sequel value
summarise(
  group_by(edaData, Sequel),
  avg = mean(lengthOfTitle),
  med = median(lengthOfTitle),
  var = var(lengthOfTitle)
)
## # A tibble: 2 × 4
## Sequel avg med var
## <chr> <dbl> <dbl> <dbl>
## 1 No 17.1 14.5 88.2
## 2 Yes 20 20 200
The NA variances occur for the year ranges that contain only one program, because the sample variance is undefined for a single observation.
A bar plot was created to show the count of programs per year/year interval at each type level.
# Horizontal stacked bar chart: count of programs per Year value,
# with each bar split by Type
g <- ggplot(edaData, mapping = aes(x = Year))
g +
  geom_bar(mapping = aes(fill = Type)) +
  coord_flip() +
  labs(title = "Bar Plot of Years of Movies, Series, and Games")
From the above plot, no games were released during 2020. The most games were released in 2010. In addition, no TV series were released in 2010. All three years had the same number of movie releases (8 each).
Next a histogram was created.
# Histogram of title lengths, using bins 3 characters wide
g <- ggplot(edaData, aes(x = lengthOfTitle))
# `linewidth` sets the outline thickness of the bars; using `size` for line
# widths is deprecated as of ggplot2 3.4.0.
g + geom_histogram(color = "black", fill = "white",
                   linewidth = 2, binwidth = 3) +
  labs(title = "Histogram of Length of Title", x = "Length of Title", y = "Count")
The above histogram shows that fewer programs had longer titles. Most title lengths were between 5 and 20 characters long.
A boxplot for each type was created.
# Box plots of title length, one box per Type value
g <- ggplot(edaData, mapping = aes(x = Type, y = lengthOfTitle))
g +
  labs(y = "Length of Title", title = "Boxplot for Length of Title at Each Type Level") +
  geom_boxplot(fill = "red")
The median length of title for games was larger than for movies and series. The median length of title appears to be similar for movies and series, with movies having a noticeable outlying value.
More boxplots were created for each year with the median trend line shown for each type.
# Box plots of title length per Year value, overlaid with one line per Type
# that connects the per-year medians
g <- ggplot(edaData, aes(x = Year, y = lengthOfTitle))
g + geom_boxplot(fill = "red") +
  # `fun` replaces the deprecated `fun.y` argument of stat_summary()
  stat_summary(fun = median, geom = "line",
               lwd = 1.5, aes(group = Type, col = Type)) +
  labs(y = "Length of Title", title = "Boxplots for Year with Median Lines")
The median length of title increases for movies from 2000 to 2020. The median length of title decreases slightly for games from 2000 to 2010. The median length of title for series fluctuates between the interval of years.
A scatterplot was created for length of title per each imdbID.
# Scatterplot of title length against imdbID, one panel per Type,
# with each point labelled by its Year value
g <- ggplot(edaData, aes(x = lengthOfTitle, y = imdbID))
g + geom_point() +
  # Facet on the Type column of the plot data rather than on edaData$Type,
  # so the panels always follow the data actually being plotted
  facet_grid(~Type) +
  geom_text(aes(label = Year)) +
  labs(y = "imdbID", x = "Length of Title", title = "Scatterplot of Length of Title for Each IMDBID")
The majority of imdbID’s corresponded to movie titles. Most movie titles had title lengths with less than 20 characters. Each imdbID is labeled with the release year.
Lastly, a line plot was created to show length of title for sequels vs non-sequels.
# Line plot of title length for each Sequel value, coloured by Year
g <- ggplot(
  edaData,
  mapping = aes(x = Sequel, y = lengthOfTitle, color = Year)
)
g +
  labs(y = "Length of Title", title = "Length of Title for Sequels vs Non-Sequels") +
  geom_line(lwd = 4)
From the above plot, sequels were found in only 2000 with an average title length. These lengths are comparable to non-sequels from 2020. Non-sequels from 2010 appear to have the more extreme title lengths.