Assignment 1: Solutions
Read the data
library("readr")
# Read MoMA's Artworks.csv from the museum's GitHub repository into `df`.
df <- read_csv(
  file = "https://raw.githubusercontent.com/MuseumofModernArt/collection/master/Artworks.csv"
)
Question 1: Create a new dataframe of the stock of paintings at MOMA for each month in the year
library("dplyr")
library("lubridate")
library("zoo")
# Monthly acquisitions (`supply`) and their running total (`stock`).
# Every acquisition date is mapped to the first day of its month so that all
# rows acquired in the same month end up in the same group.
df.q1 <- df %>%
  mutate(
    date = ymd(
      paste(year(DateAcquired), month(DateAcquired), "01", sep = "-")
    )
  ) %>%
  group_by(date) %>%
  summarise(supply = n()) %>%
  # cumsum() below relies on chronological row order.
  arrange(date) %>%
  mutate(stock = cumsum(supply))

head(df.q1)
## Source: local data frame [6 x 3]
##
## date supply stock
## (time) (int) (int)
## 1 1929-11-01 9 9
## 2 1930-01-01 3 12
## 3 1930-04-01 2 14
## 4 1930-06-01 1 15
## 5 1930-10-01 2 17
## 6 1931-01-01 2 19
Question 2: Use `ggplot2` and your new data frame to plot the stock of paintings on the y-axis and the date on the x-axis
library("zoo")
library("ggplot2")
library("ggthemes")
library("scales")
library("viridis")
# Question 2: line chart of the cumulative stock (y) against the month (x).
p <- ggplot(df.q1, aes(x = date, y = stock))
p + geom_line(colour = "red", size = 1) +
  theme_tufte() +
  # Applied after theme_tufte() so these axis-title settings take precedence.
  theme(
    axis.title = element_text(),
    axis.title.y = element_text(angle = 90)
  ) +
  # labs() matches labels to aesthetics by argument name, so the y-axis text
  # has to be supplied as `y =`; `x = NULL` removes the x-axis title.
  labs(y = "stock of paintings", title = "Question 2", x = NULL) +
  scale_y_continuous(labels = comma)
Question 3: Create the same plot but this time the color should reflect the stock of paintings for curator approved and non-curator approved paintings, respectively
# Monthly acquisitions and running stock, tracked separately for each value
# of `CuratorApproved`.
df.q3 <- df %>%
  mutate(
    date = ymd(
      paste(year(DateAcquired), month(DateAcquired), "01", sep = "-")
    )
  ) %>%
  group_by(date, CuratorApproved) %>%
  summarise(supply = n()) %>%
  ungroup() %>%
  # Sort first, then regroup: cumsum() must run in date order within each
  # approval status.
  arrange(CuratorApproved, date) %>%
  group_by(CuratorApproved) %>%
  mutate(stock = cumsum(supply))

head(df.q3)
## Source: local data frame [6 x 4]
## Groups: CuratorApproved [1]
##
## date CuratorApproved supply stock
## (time) (chr) (int) (int)
## 1 1931-01-01 N 1 1
## 2 1932-12-01 N 1 2
## 3 1933-01-01 N 1 3
## 4 1933-04-01 N 90 93
## 5 1934-04-01 N 14 107
## 6 1934-05-01 N 65 172
# Question 3: cumulative stock over time, one coloured line per
# `CuratorApproved` value.
p <- ggplot(df.q3, aes(x = date, y = stock, colour = CuratorApproved))
p + geom_line(size = 1) +
  theme_tufte() +
  # Applied after theme_tufte() so these axis-title settings take precedence.
  theme(
    axis.title = element_text(),
    axis.title.y = element_text(angle = 90)
  ) +
  # labs() matches labels to aesthetics by argument name, so the y-axis text
  # has to be supplied as `y =`; `x = NULL` removes the x-axis title.
  labs(y = "stock of paintings", title = "Question 3", x = NULL) +
  scale_y_continuous(labels = comma) +
  scale_color_viridis(discrete = TRUE)
Question 4: Create a new data frame of the stock of paintings grouped by what department the painting belongs to
# Monthly acquisitions and running stock, tracked separately per department.
df.q4 <- df %>%
  mutate(
    date = ymd(
      paste(year(DateAcquired), month(DateAcquired), "01", sep = "-")
    )
  ) %>%
  group_by(date, Department) %>%
  summarise(supply = n()) %>%
  ungroup() %>%
  # Sort first, then regroup: cumsum() must run in date order within each
  # department.
  arrange(Department, date) %>%
  group_by(Department) %>%
  mutate(stock = cumsum(supply))

head(df.q4)
## Source: local data frame [6 x 4]
## Groups: Department [1]
##
## date Department supply stock
## (time) (chr) (int) (int)
## 1 1932-01-01 Architecture & Design 2 2
## 2 1934-01-01 Architecture & Design 2 4
## 3 1934-04-01 Architecture & Design 43 47
## 4 1934-09-01 Architecture & Design 4 51
## 5 1935-11-01 Architecture & Design 22 73
## 6 1935-12-01 Architecture & Design 1 74
Question 5: Plot this data frame using `ggplot2`. Which department has had the highest increase in their stock of paintings?
# Question 5: cumulative stock over time, one coloured line per department,
# on a linear y axis.
p <- ggplot(df.q4, aes(x = date, y = stock, colour = Department))
p + geom_line(size = 1) +
  theme_tufte() +
  # Applied after theme_tufte() so these axis-title settings take precedence.
  theme(
    axis.title = element_text(),
    axis.title.y = element_text(angle = 90)
  ) +
  # labs() matches labels to aesthetics by argument name, so the y-axis text
  # has to be supplied as `y =`; `x = NULL` removes the x-axis title.
  labs(y = "stock of paintings", title = "Question 5", x = NULL) +
  scale_y_continuous(labels = comma) +
  scale_color_viridis(discrete = TRUE)
# Question 5 (variant): the same per-department lines drawn on a log10 y axis.
p <- ggplot(df.q4, aes(x = date, y = stock, colour = Department))
p + geom_line(size = 1) +
  theme_tufte() +
  # Applied after theme_tufte() so these axis-title settings take precedence.
  theme(
    axis.title = element_text(),
    axis.title.y = element_text(angle = 90)
  ) +
  # labs() matches labels to aesthetics by argument name, so the y-axis text
  # has to be supplied as `y =`; `x = NULL` removes the x-axis title.
  labs(y = "stock of paintings", title = "Question 5", x = NULL) +
  scale_y_log10() +
  scale_color_viridis(discrete = TRUE)
# Alternative: skip the time dimension and count the rows per department.
df.q4.alt <- df %>%
  group_by(Department) %>%
  summarise(stock = n())

head(df.q4.alt)
## Source: local data frame [6 x 2]
##
## Department stock
## (chr) (int)
## 1 Architecture & Design 15828
## 2 Architecture & Design - Image Archive 18
## 3 Drawings 10738
## 4 Film 2587
## 5 Fluxus Collection 2547
## 6 Media and Performance Art 2350
# One bar per department, each drawn in its own facet panel.
p <- ggplot(
  data = df.q4.alt,
  mapping = aes(x = Department, y = stock, fill = Department)
)
p + geom_bar(stat = "identity") +
  facet_wrap(~ Department, scales = "free_x") +
  scale_y_continuous(labels = comma) +
  scale_fill_viridis(discrete = TRUE) +
  theme_tufte() +
  # Hide the x tick labels and the legend; must follow theme_tufte() so the
  # overrides are not reset.
  theme(
    axis.text.x = element_blank(),
    legend.position = "none"
  )
Question 6: Write a piece of code that counts the number of paintings by each artist in the data set. List the 10 painters with the highest number of paintings in MOMA’s collection.
# Number of rows per artist, largest first; rows with an empty artist name
# are excluded before counting.
df.artist <- df %>%
  filter(Artist != "") %>%
  group_by(Artist) %>%
  summarise(count = n()) %>%
  arrange(desc(count))

# Ten artists with the most rows.
head(df.artist, n = 10)
## Source: local data frame [10 x 2]
##
## Artist count
## (chr) (int)
## 1 Eugène Atget 5050
## 2 Louise Bourgeois 3224
## 3 Ludwig Mies van der Rohe 2497
## 4 Unknown photographer 1573
## 5 Jean Dubuffet 1426
## 6 Lee Friedlander 1317
## 7 Pablo Picasso 1309
## 8 Marc Chagall 1146
## 9 Henri Matisse 1064
## 10 Pierre Bonnard 940
Question 7: The variable `ArtistBio` lists the birth place of each painter. Use this information to create a world map where each country is colored according to the stock of paintings in MOMA’s collection.
library("stringr")
# Take the first capitalised word of `ArtistBio` (an upper-case letter
# followed by lower-case letters) as the artist's nationality adjective.
df <- df %>%
  mutate(Nationality = str_extract(ArtistBio, "[A-Z][a-z]+"))

# Number of rows per extracted nationality.
df.nationality <- df %>%
  group_by(Nationality) %>%
  summarise(count = n())

head(df.nationality)
## Source: local data frame [6 x 2]
##
## Nationality count
## (chr) (int)
## 1 Active 3
## 2 Afghan 1
## 3 Albanian 23
## 4 Algerian 5
## 5 American 54031
## 6 Anglo 4
Information on countries and their nationality adjectives is available at https://www.englishclub.com/vocabulary/world-countries-nationality.htm
Scrape and create data frame
library("rvest")
# Build a country <-> nationality-adjective lookup table from a web page.
link <- "https://www.englishclub.com/vocabulary/world-countries-nationality.htm"
# NOTE(review): `css.selector` is not used by the code visible here; it is
# kept in case a later section relies on it.
css.selector <- "td:nth-child(2) , td:nth-child(1)"

# Download and parse the page once, then reuse the parsed document for both
# columns instead of fetching the same URL twice.
nationality.page <- read_html(link)

# First table cell of each row: country name.
country <- nationality.page %>%
  html_nodes(css = "td:nth-child(1)") %>%
  html_text()

# Second table cell of each row: nationality adjective.
adjective <- nationality.page %>%
  html_nodes(css = "td:nth-child(2)") %>%
  html_text()

# stringsAsFactors = FALSE keeps both columns as character vectors on
# R < 4.0 as well.
df.info <- data.frame(
  country = country,
  adjective = tolower(adjective),
  stringsAsFactors = FALSE
)

# Force the adjective for the USA row to "american" so it can be matched
# against the lower-cased nationalities extracted from the artworks data.
df.info$adjective <- ifelse(
  df.info$country == "United States of America (USA)",
  "american",
  df.info$adjective
)

head(df.info)
## country adjective
## 1 Afghanistan afghan
## 2 Albania albanian
## 3 Algeria algerian
## 4 Andorra andorran
## 5 Angola angolan
## 6 Argentina argentinian
# Lower-case the extracted nationalities so they match the lower-cased
# adjectives in `df.info`, then keep only the nationalities that have a
# matching country.
df.nationality <- df.nationality %>%
  mutate(Nationality = tolower(Nationality))
df.map <- df.nationality %>%
  inner_join(df.info, by = c("Nationality" = "adjective"))
library("ggmap")
# World polygon outlines, one row per polygon vertex.
world.map <- map_data("world")
library("countrycode")

# Convert the country names on both sides to ISO 3166 two-letter codes so the
# two tables can be joined on a common key despite differing name spellings.
world.map[["iso2c"]] <- countrycode(
  world.map[["region"]],
  origin = "country.name",
  destination = "iso2c"
)
df.map[["iso2c"]] <- countrycode(
  df.map[["country"]],
  origin = "country.name",
  destination = "iso2c"
)

# Attach the per-country counts to every polygon vertex of that country.
df.map <- inner_join(world.map, df.map, by = "iso2c")
# World map with each country filled by its raw count.
p <- ggplot(
  data = df.map,
  mapping = aes(x = long, y = lat, group = group, fill = count)
)
p + geom_polygon() +
  scale_fill_viridis() +
  labs(title = "Painters by country") +
  theme_tufte()
# Same map, but with the fill scale log-transformed and the legend reduced to
# two labelled breaks ("low" at 20, "high" at 8000).
p <- ggplot(
  data = df.map,
  mapping = aes(x = long, y = lat, group = group, fill = count)
)
p + geom_polygon() +
  scale_fill_viridis(
    name = "count\n(log transformed)",
    trans = "log",
    breaks = c(20, 8000),
    labels = c("low", "high")
  ) +
  labs(title = "Painters by country") +
  theme_tufte()
Question 8: The `Dimensions` variable lists the dimensions of each painting. Use your data manipulation skills to calculate the area of each painting (in cm²). Create a data frame of the five largest and five smallest paintings in MOMA’s collection.
# Parse the free-text `Dimensions` column into numeric measurements and
# derive an `area` column.
#
# Take the first parenthesised group of each entry
# (NOTE(review): presumably the metric measurements, e.g. "(48.6 x 168.9 cm)"
# -- confirm against the data).
dim <- str_extract(df$Dimensions, "\\([^()]+\\)")
# Drop the parentheses themselves.
dim <- gsub("\\(|\\)", "", dim)
# Drop lower-case letters: the unit and an "x" separator, if present.
dim <- gsub("[a-z]", "", dim)
dim <- str_trim(dim)
# Removing a letter separator leaves two consecutive spaces between the
# numbers, so split on runs of whitespace; splitting on a single space would
# produce empty tokens and miscount the number of dimensions.
dim <- str_split(dim, "\\s+")

# The first two tokens are the width and the length; a missing token is NA.
df$width <- vapply(dim, function(x) x[1], character(1))
df$length <- vapply(dim, function(x) x[2], character(1))
# Number of tokens found in each entry.
df$dim.length <- lengths(dim)
# A third measurement is only recorded for entries with exactly three tokens.
df$height <- NA
df$height[df$dim.length == 3] <- vapply(
  dim[df$dim.length == 3],
  function(x) x[3],
  character(1)
)

df <- df %>%
  mutate(
    # Tokens that are not plain numbers become NA (with a coercion warning).
    width = as.numeric(width),
    length = as.numeric(length),
    height = as.numeric(height),
    # Three measurements are multiplied together (a volume); otherwise the
    # product of the first two is used.
    area = ifelse(dim.length == 3, width * length * height, width * length)
  )

summary(df$area)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000e+00 3.530e+02 7.300e+02 9.604e+04 2.134e+03 1.419e+09 35288