## # A tibble: 180 x 3
##    title                            artist       link                          
##    <chr>                            <chr>        <chr>                         
##  1 Untitled (1959)                  William Gear https://collections.ed.ac.uk/a…
##  2 Abstract Brush Drawing (2018)    William Joh… https://collections.ed.ac.uk/a…
##  3 Portrait of H.S. (1973)          William Joh… https://collections.ed.ac.uk/a…
##  4 Red and Black (1976)             William Joh… https://collections.ed.ac.uk/a…
##  5 Untitled (Landscape) (1943)      William Joh… https://collections.ed.ac.uk/a…
##  6 Black Sitka (1961)               William Joh… https://collections.ed.ac.uk/a…
##  7 Untitled (yellow triangle) (198… Mohamed Oun… https://collections.ed.ac.uk/a…
##  8 Untitled - Abstract Print of Fo… Rena R. Sim… https://collections.ed.ac.uk/a…
##  9 Untitled - Two Abstract Melting… Graeme Murr… https://collections.ed.ac.uk/a…
## 10 Earth Element (1972)             William Joh… https://collections.ed.ac.uk/a…
## # … with 170 more rows
Here is the code we developed to scrape the additional info on a single art piece:
```r
# load packages ----------------------------------------------------------------
library(tidyverse)
library(rvest)

# first url ---------------------------------------------------------------------

## set url
first_info_url <- "https://collections.ed.ac.uk/art/./record/20144?highlight=*:*"

## read page at url
page <- read_html(first_info_url)

## scrape headers
headers <- page %>%
  html_nodes("th") %>%
  html_text()

## scrape values
values <- page %>%
  html_nodes("td") %>%
  html_text() %>%
  str_squish()

## put together in a tibble and add link to help keep track ----
tibble(headers, values) %>%
  pivot_wider(names_from = headers, values_from = values) %>%
  add_column(link = first_info_url)
```
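As an aside, the header/value pairs on a record page live in an HTML table, so rvest's html_table() may be able to parse the whole thing in one go. Whether it yields a tidy result for this particular page is an assumption worth checking, so treat this as a sketch:

```r
# possible shortcut: parse every <table> on the page into a list of tibbles
page %>%
  html_table()
```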
When should you write a function? Whenever you've copied and pasted a block of code more than twice.
How many times will we need to copy and paste the code we developed to scrape additional data on each abstract art piece in the Edinburgh College of Art Collection?
179 more times!
```r
## set url ----
first_info_url <- "https://collections.ed.ac.uk/art/./record/20144?highlight=*:*"

## read page at url ----
page <- read_html(first_info_url)

## scrape headers ----
headers <- page %>%
  html_nodes("th") %>%
  html_text()

## scrape values ----
values <- page %>%
  html_nodes("td") %>%
  html_text() %>%
  str_squish()

## put together in a tibble and add link to help keep track ----
tibble(headers, values) %>%
  pivot_wider(names_from = headers, values_from = values) %>%
  add_column(link = first_info_url)
```
Let's write a function instead. Start by giving it a name:

```r
scrape_art_info <- 
```

Then add the keyword `function`, followed by the function's arguments in parentheses. Here we have a single argument, `x` (the URL to scrape). If we had more arguments the call would look like `function(x, y, z)`:

```r
scrape_art_info <- function(x){}
```

The body of the function goes in the `{}` block that immediately follows `function(...)`:

```r
scrape_art_info <- function(x){
  # code we developed earlier to scrape info
  # on single art piece goes here
}
```
```r
scrape_art_info <- function(x){
  # read page at url ----
  page <- read_html(x)
  
  # scrape headers ----
  headers <- page %>%
    html_nodes("th") %>%
    html_text()
  
  # scrape values ----
  values <- page %>%
    html_nodes("td") %>%
    html_text() %>%
    str_squish()
  
  # put together in a tibble and add link to help keep track ----
  tibble(headers, values) %>%
    pivot_wider(names_from = headers, values_from = values) %>%
    add_column(link = x)
}
```
scrape_art_info(uoe_art$link[1]) %>% glimpse()
## Rows: 1
## Columns: 11
## $ Artist             <chr> "William Gear (b.1915, d.1997)"
## $ Title              <chr> "Untitled"
## $ Date               <chr> "1959"
## $ Period             <chr> "20th century; 1950s"
## $ Description        <chr> "abstract with splashes of watery blue and bright …
## $ Material           <chr> "acrylic paint/paint (coating)"
## $ Collection         <chr> "Art Collection"
## $ Classification     <chr> "Abstract (fine arts style); paintings (visual wor…
## $ Signature          <chr> "signed and dated lower right hand corner"
## $ `Accession Number` <chr> "EU0975"
## $ link               <chr> "https://collections.ed.ac.uk/art/./record/20144?h…
scrape_art_info(uoe_art$link[2]) %>% glimpse()
## Rows: 1
## Columns: 11
## $ Artist             <chr> "William Johnstone (b.1897, d.1981) VIAF LC"
## $ Title              <chr> "Abstract Brush Drawing"
## $ Period             <chr> "20th century"
## $ Description        <chr> "Abstract black wash"
## $ Material           <chr> "paper (fibre product); watercolour (paint)/paint …
## $ Dimensions         <chr> "75.5x55.8 cm"
## $ Collection         <chr> "Art Collection; Hope Scott Collection"
## $ Classification     <chr> "paintings 1901-2000; Abstract (fine arts style); …
## $ Signature          <chr> "Signed in red in monogram."
## $ `Accession Number` <chr> "EU0165"
## $ link               <chr> "https://collections.ed.ac.uk/art/./record/388?hig…
```r
function([inputs separated by commas]){
  # what to do with those inputs
}
```
```r
scrape_page <- function(x){
  # do bunch of stuff with the input...
  
  # return a tibble
  tibble(...)
}
```
What is going on here?
```r
add_2 <- function(x){
  x + 2
  1000
}
```
add_2(3)
## [1] 1000
add_2(10)
## [1] 1000

An R function returns the value of the last expression evaluated in its body, so here `x + 2` is computed and then discarded, and both calls return `1000`.
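A minimal fix, as a sketch (not from the original slides): make the value you want returned the last expression in the body.

```r
add_2 <- function(x){
  x + 2   # now the last expression, so this value is what the function returns
}

add_2(3)  # returns 5
```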
"There are only two hard things in Computer Science: cache invalidation and naming things." - Phil Karlton
"There are only two hard things in Computer Science: cache invalidation and naming things." - Phil Karlton
"There are only two hard things in Computer Science: cache invalidation and naming things." - Phil Karlton
"There are only two hard things in Computer Science: cache invalidation and naming things." - Phil Karlton
snake_case
as opposed to camelCase
)"There are only two hard things in Computer Science: cache invalidation and naming things." - Phil Karlton
snake_case
as opposed to camelCase
)scrape_page
, scrape_art_info
OR str_squish
, str_trim
, str_remove
etc.)"There are only two hard things in Computer Science: cache invalidation and naming things." - Phil Karlton
snake_case
as opposed to camelCase
)scrape_page
, scrape_art_info
OR str_squish
, str_trim
, str_remove
etc.)# JUST DON'Tmean <- function(x){ x * 3 }
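If you're unsure whether a name is already taken, base R's exists() gives a quick check (a small sketch, not from the original slides):

```r
# check whether a candidate function name is already bound to something
exists("mean")        # TRUE: mean() is already a base R function, pick another name
exists("scrape_info") # FALSE here (assuming nothing by that name has been defined yet)
```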
```r
scrape_art_info(uoe_art$link[1])
scrape_art_info(uoe_art$link[2])
scrape_art_info(uoe_art$link[3])
```
You now have a function that will scrape the relevant info on an art piece given the URL of its individual info page, and you want to run scrape_art_info() on all 180 links. Where can we get a list of URLs for each of the art pieces in the collection?
From the original data frame!
uoe_art
## # A tibble: 180 x 3
##    title                            artist       link                          
##    <chr>                            <chr>        <chr>                         
##  1 Untitled (1959)                  William Gear https://collections.ed.ac.uk/a…
##  2 Abstract Brush Drawing (2018)    William Joh… https://collections.ed.ac.uk/a…
##  3 Portrait of H.S. (1973)          William Joh… https://collections.ed.ac.uk/a…
##  4 Red and Black (1976)             William Joh… https://collections.ed.ac.uk/a…
##  5 Untitled (Landscape) (1943)      William Joh… https://collections.ed.ac.uk/a…
##  6 Black Sitka (1961)               William Joh… https://collections.ed.ac.uk/a…
##  7 Untitled (yellow triangle) (198… Mohamed Oun… https://collections.ed.ac.uk/a…
##  8 Untitled - Abstract Print of Fo… Rena R. Sim… https://collections.ed.ac.uk/a…
##  9 Untitled - Two Abstract Melting… Graeme Murr… https://collections.ed.ac.uk/a…
## 10 Earth Element (1972)             William Joh… https://collections.ed.ac.uk/a…
## # … with 170 more rows
How can we tell R to apply the scrape_art_info() function to each link in uoe_art$link?
We have a couple of options, such as writing an explicit for loop over the links or using functional programming with purrr's map functions. We'll go with the functional programming approach for now.
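For contrast, here is a rough sketch of what the explicit-loop alternative might look like (an illustration, not code from the original):

```r
# visit each link one at a time, collect the results,
# then bind them into a single data frame
results <- list()
for (i in seq_along(uoe_art$link)) {
  results[[i]] <- scrape_art_info(uoe_art$link[i])
}
bind_rows(results)
```

The map approach gets the same result in a single call. To see how map works, let's start with a simpler example.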
Suppose we have exam 1 and exam 2 scores of 4 students stored in a list...
```r
exam_scores <- list(
  exam1 <- c(80, 90, 70, 50),
  exam2 <- c(85, 83, 45, 60)
)
```
...and we find the mean score in each exam
map(exam_scores, mean)
## [[1]]
## [1] 72.5
## 
## [[2]]
## [1] 68.25
...and suppose we want the results as a numeric (double) vector
map_dbl(exam_scores, mean)
## [1] 72.50 68.25
...or as a character string
map_chr(exam_scores, mean)
## [1] "72.500000" "68.250000"
purrr's map_something() family: functions for looping over an object and returning a value (of a specific type):
- map() - returns a list
- map_lgl() - returns a logical vector
- map_int() - returns an integer vector
- map_dbl() - returns a double vector
- map_chr() - returns a character vector
- map_df() / map_dfr() - returns a data frame by row binding
- map_dfc() - returns a data frame by column binding

We want to apply the scrape_art_info() function to each link in uoe_art$link and row-bind the results into a single data frame, so map_df() is the one we need.
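Before hitting all 180 pages, it can be worth a quick sanity check on a handful of links (a sketch, not from the original slides):

```r
# test the function on just the first three links
map_df(uoe_art$link[1:3], scrape_art_info)
```

Once that looks right, run it on the full set of links: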
uoe_art_info <- map_df(uoe_art$link, scrape_art_info)
## # A tibble: 180 x 14
##    Artist Title Date  Period Description Material Collection Classification
##    <chr>  <chr> <chr> <chr>  <chr>       <chr>    <chr>      <chr>         
##  1 Willi… Unti… 1959  20th … abstract w… acrylic… Art Colle… Abstract (fin…
##  2 Willi… Abst… <NA>  20th … Abstract b… paper (… Art Colle… paintings 190…
##  3 Willi… Port… 1973  20th … Charcoal o… charcoa… Art Colle… Abstract (fin…
##  4 Willi… Red … 1976  20th … Abstract b… ink/coa… Art Colle… paintings 190…
##  5 Willi… Unti… 1943  20th … Abstract b… paper (… Art Colle… paintings 190…
##  6 Willi… Blac… 1961  20th … Black land… canvas … Art Colle… oil paintings…
##  7 Moham… Unti… 1989  20th … abstract t… acrylic… Art Colle… Abstract (fin…
##  8 Rena … Unti… 1982  20th … Print in b… paper (… Art Colle… fine art; Abs…
##  9 Graem… Unti… 1985… 20th … Print of a… Print    Art Colle… fine art; pri…
## 10 Willi… Eart… 1972  20th … Abstract b… ink/coa… Art Colle… paintings 190…
## # … with 170 more rows, and 6 more variables: Signature <chr>, `Accession
## #   Number` <chr>, link <chr>, Dimensions <chr>, Subject <chr>, `Alternative
## #   Title` <chr>
## Rows: 180
## Columns: 14
## $ Artist              <chr> "William Gear (b.1915, d.1997)", "William Johnsto…
## $ Title               <chr> "Untitled", "Abstract Brush Drawing", "Portrait o…
## $ Date                <chr> "1959", NA, "1973", "1976", "1943", "1961", "1989…
## $ Period              <chr> "20th century; 1950s", "20th century", "20th cent…
## $ Description         <chr> "abstract with splashes of watery blue and bright…
## $ Material            <chr> "acrylic paint/paint (coating)", "paper (fibre pr…
## $ Collection          <chr> "Art Collection", "Art Collection; Hope Scott Col…
## $ Classification      <chr> "Abstract (fine arts style); paintings (visual wo…
## $ Signature           <chr> "signed and dated lower right hand corner", "Sign…
## $ `Accession Number`  <chr> "EU0975", "EU0165", "EU0138", "EU0147", "EU0146",…
## $ link                <chr> "https://collections.ed.ac.uk/art/./record/20144?…
## $ Dimensions          <chr> NA, "75.5x55.8 cm", "45.7x40.6 cm", "77.4x58.4 cm…
## $ Subject             <chr> NA, NA, NA, NA, NA, NA, "abstract", NA, NA, NA, N…
## $ `Alternative Title` <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
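With the info for all 180 pieces in one data frame, you can start exploring it, for example counting pieces per period (a hypothetical follow-up, not from the original slides):

```r
# how many pieces fall into each Period?
uoe_art_info %>%
  count(Period, sort = TRUE)
```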
If, when running the full scrape

```r
uoe_art_info <- map_df(uoe_art$link, scrape_art_info)
```

you hit HTTP Error 429 (Too many requests), you might want to slow down your hits by modifying your function to add a random wait (sleep) time between visits to each link:

```r
scrape_art_info <- function(x){
  # Sleep for a randomly generated number of seconds,
  # drawn from a uniform distribution between 0 and 1
  Sys.sleep(runif(1))
  
  # Rest of your function code goes here...
}
```
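Another defensive option (a sketch using purrr's possibly(), not part of the original code) is to wrap the scraper so a single failing page doesn't abort the whole run:

```r
# pages that error return NULL instead of stopping the map;
# map_df() simply drops the NULL results when binding rows
safe_scrape_art_info <- possibly(scrape_art_info, otherwise = NULL)
uoe_art_info <- map_df(uoe_art$link, safe_scrape_art_info)
```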