R task, web scraping

Solution 1:

My suggestion to use year <- sub(...) needs to be placed inside the function itself, so that the year is derived from the function's own url argument and nothing else. This works:

# Download one survey archive and return its public results as a data frame.
#
# Arguments:
#   url  A single character string: the address of an archive whose name ends
#        in "<digits>.zip". The trailing digits are used as the year.
#
# Side effects:
#   Writes "file_<year>.zip" and extracts "survey_results_public.csv" into
#   "dir_<year>/", both relative to the current working directory. The files
#   are left in place afterwards.
#
# Returns:
#   The CSV as read by readr::read_csv() with every column as character, plus
#   `Year` (character, taken from the URL) and `ResponseId` (integer row
#   index). A `ResponseId` column already present in the file is replaced.
fun_download <- function(url) {
  stopifnot(is.character(url), length(url) == 1L, !is.na(url))

  # sub() returns its input unchanged when the pattern does not match, which
  # would silently turn the whole URL into the "year"; reject that up front.
  year_pattern <- ".*[^0-9]([0-9]+)\\.zip$"
  if (!grepl(year_pattern, url)) {
    stop("Cannot extract a year from url: ", url, call. = FALSE)
  }
  year <- sub(year_pattern, "\\1", url)

  csv_name <- "survey_results_public.csv"
  zip_file <- paste0("file_", year, ".zip")
  zip_dir <- paste0("dir_", year)
  csv_path <- file.path(zip_dir, csv_name)

  # The archive is binary, so request binary mode explicitly, and check the
  # status code because some download methods report failure without erroring.
  status <- download.file(url, zip_file, mode = "wb")
  if (!identical(as.integer(status), 0L)) {
    stop("Download failed (status ", status, ") for url: ", url, call. = FALSE)
  }

  unzip(zip_file, exdir = zip_dir, files = csv_name)
  if (!file.exists(csv_path)) {
    stop("'", csv_name, "' not found in archive: ", zip_file, call. = FALSE)
  }

  out <- readr::read_csv(csv_path, col_types = readr::cols(.default = "c"))
  # Plain column assignment instead of mutate(), so the function does not
  # depend on dplyr or a pipe operator being attached by the caller.
  out[["Year"]] <- rep(year, nrow(out))
  out[["ResponseId"]] <- seq_len(nrow(out))
  out
}

# Try the downloader on the first survey URL; the result prints at top level.
fun_download(url = lst_url[[1L]])
# trying URL 'https://info.stackoverflowsolutions.com/rs/719-EMH-566/images/stack-overflow-developer-survey-2021.zip'
# Content type 'application/zip' length 8825103 bytes (8.4 MB)
# downloaded 8.4 MB
# # A tibble: 83,439 x 49
#    ResponseId MainBranch   Employment  Country  US_State UK_Country EdLevel  Age1stCode LearnCode  YearsCode YearsCodePro DevType 
#         <int> <chr>        <chr>       <chr>    <chr>    <chr>      <chr>    <chr>      <chr>      <chr>     <chr>        <chr>   
#  1          1 I am a deve~ Independen~ Slovakia NA       NA         Seconda~ 18 - 24 y~ Coding Bo~ NA        NA           Develop~
#  2          2 I am a stud~ Student, f~ Netherl~ NA       NA         Bachelo~ 11 - 17 y~ Other onl~ 7         NA           NA      
#  3          3 I am not pr~ Student, f~ Russian~ NA       NA         Bachelo~ 11 - 17 y~ Other onl~ NA        NA           NA      
#  4          4 I am a deve~ Employed f~ Austria  NA       NA         Master?~ 11 - 17 y~ NA         NA        NA           Develop~
#  5          5 I am a deve~ Independen~ United ~ NA       England    Master?~ 5 - 10 ye~ Friend or~ 17        10           Develop~
#  6          6 I am a stud~ Student, p~ United ~ Georgia  NA         Bachelo~ 11 - 17 y~ Other onl~ NA        NA           NA      
#  7          7 I code prim~ I prefer n~ United ~ New Ham~ NA         Seconda~ 11 - 17 y~ Other onl~ 3         NA           NA      
#  8          8 I am a stud~ Student, f~ Malaysia NA       NA         Bachelo~ 11 - 17 y~ School;On~ 4         NA           NA      
#  9          9 I am a deve~ Employed p~ India    NA       NA         Bachelo~ 18 - 24 y~ Coding Bo~ 6         4            Develop~
# 10         10 I am a deve~ Employed f~ Sweden   NA       NA         Master?~ 11 - 17 y~ School     7         4            Data sc~
# # ... with 83,429 more rows, and 37 more variables: OrgSize <chr>, Currency <chr>, CompTotal <chr>, CompFreq <chr>,
# #   LanguageHaveWorkedWith <chr>, LanguageWantToWorkWith <chr>, DatabaseHaveWorkedWith <chr>, DatabaseWantToWorkWith <chr>,
# #   PlatformHaveWorkedWith <chr>, PlatformWantToWorkWith <chr>, WebframeHaveWorkedWith <chr>, WebframeWantToWorkWith <chr>,
# #   MiscTechHaveWorkedWith <chr>, MiscTechWantToWorkWith <chr>, ToolsTechHaveWorkedWith <chr>, ToolsTechWantToWorkWith <chr>,
# #   NEWCollabToolsHaveWorkedWith <chr>, NEWCollabToolsWantToWorkWith <chr>, OpSys <chr>, NEWStuck <chr>, NEWSOSites <chr>,
# #   SOVisitFreq <chr>, SOAccount <chr>, SOPartFreq <chr>, SOComm <chr>, NEWOtherComms <chr>, Age <chr>, Gender <chr>,
# #   Trans <chr>, Sexuality <chr>, Ethnicity <chr>, Accessibility <chr>, MentalHealth <chr>, SurveyLength <chr>, ...

From here, use lapply(., fun_download) to produce a list of frames.

# Download every survey in turn, collecting one frame per URL.
list_of_frames <- lapply(X = lst_url, FUN = fun_download)
# trying URL 'https://info.stackoverflowsolutions.com/rs/719-EMH-566/images/stack-overflow-developer-survey-2021.zip'
# Content type 'application/zip' length 8825103 bytes (8.4 MB)
# downloaded 8.4 MB
# trying URL 'https://info.stackoverflowsolutions.com/rs/719-EMH-566/images/stack-overflow-developer-survey-2020.zip'
# Content type 'application/zip' length 9908290 bytes (9.4 MB)
# downloaded 9.4 MB
# trying URL 'https://info.stackoverflowsolutions.com/rs/719-EMH-566/images/stack-overflow-developer-survey-2019.zip'
# Content type 'application/zip' length 18681322 bytes (17.8 MB)
# downloaded 17.8 MB
# trying URL 'https://info.stackoverflowsolutions.com/rs/719-EMH-566/images/stack-overflow-developer-survey-2018.zip'
# Content type 'application/zip' length 20022841 bytes (19.1 MB)
# downloaded 19.1 MB
# trying URL 'https://info.stackoverflowsolutions.com/rs/719-EMH-566/images/stack-overflow-developer-survey-2017.zip'
# Content type 'application/zip' length 9576818 bytes (9.1 MB)
# downloaded 9.1 MB

And a terse summary to show what they hold:

# Show the first two rows and first four columns of every frame.
lapply(list_of_frames, function(frame) frame[seq_len(2L), seq_len(4L)])
# [[1]]
# # A tibble: 2 x 4
#   ResponseId MainBranch                             Employment                                           Country    
#        <int> <chr>                                  <chr>                                                <chr>      
# 1          1 I am a developer by profession         Independent contractor, freelancer, or self-employed Slovakia   
# 2          2 I am a student who is learning to code Student, full-time                                   Netherlands
# [[2]]
# # A tibble: 2 x 4
#   Respondent MainBranch                     Hobbyist Age  
#   <chr>      <chr>                          <chr>    <chr>
# 1 1          I am a developer by profession Yes      NA   
# 2 2          I am a developer by profession No       NA   
# [[3]]
# # A tibble: 2 x 4
#   Respondent MainBranch                             Hobbyist OpenSourcer            
#   <chr>      <chr>                                  <chr>    <chr>                  
# 1 1          I am a student who is learning to code Yes      Never                  
# 2 2          I am a student who is learning to code No       Less than once per year
# [[4]]
# # A tibble: 2 x 4
#   Respondent Hobby OpenSource Country       
#   <chr>      <chr> <chr>      <chr>         
# 1 1          Yes   No         Kenya         
# 2 3          Yes   Yes        United Kingdom
# [[5]]
# # A tibble: 2 x 4
#   Respondent Professional ProgramHobby Country       
#   <chr>      <chr>        <chr>        <chr>         
# 1 1          Student      Yes, both    United States 
# 2 2          Student      Yes, both    United Kingdom

If you need to label each dataset (for example, with the URL it was downloaded from), then perhaps use this, which adds a url column to each frame.

# Tag every frame with the URL it was downloaded from. Plain column
# assignment is used rather than transform(): transform() rebuilds the frame
# with data.frame(), which drops the tibble class, and it looks names up in
# the frame's columns first, so a column called `u` would shadow the URL.
list_of_frames <- Map(
  function(x, u) {
    x[["url"]] <- rep(u, nrow(x))
    x
  },
  list_of_frames,
  lst_url
)

Data

library(rvest)
# Collect the download-link nodes from the survey landing page.
lst_nodes <- read_html("https://insights.stackoverflow.com/survey/") %>%
  html_nodes(".js-download-link")
# Keep at most the first five links. head() simply returns what is there when
# the page has fewer than five, whereas indexing with 1:5 reaches past the end.
lst_url <- head(html_attr(lst_nodes, "href"), 5L)
lst_url
# [1] "https://info.stackoverflowsolutions.com/rs/719-EMH-566/images/stack-overflow-developer-survey-2021.zip"
# [2] "https://info.stackoverflowsolutions.com/rs/719-EMH-566/images/stack-overflow-developer-survey-2020.zip"
# [3] "https://info.stackoverflowsolutions.com/rs/719-EMH-566/images/stack-overflow-developer-survey-2019.zip"
# [4] "https://info.stackoverflowsolutions.com/rs/719-EMH-566/images/stack-overflow-developer-survey-2018.zip"
# [5] "https://info.stackoverflowsolutions.com/rs/719-EMH-566/images/stack-overflow-developer-survey-2017.zip"