10.3 Scraping pages that require log-in
Here is an example of scraping CBS Sports fantasy baseball stats. To use this you would have to create a CBS Sports log-in, and possibly a fantasy baseball league, but you should be able to easily modify it for another page that you have a log-in for. The main steps come before the for loop: setting `userid`, `password`, `uastring`, and `login.url`, and then using `session()`, `html_form()`, `html_form_set()`, and `session_submit()`.
Code
library(rvest)
library(httr)
library(tidyverse)
## Settings ----
## You will need to change `userid` and `password` below.
## I recommend creating a Custom Report on the CBS player stats page
## that has the same columns as those selected in the loop further down.
## You'll have to do it four times (batting and pitching, one for each
## league), which is annoying, but worth it to scrape Salary and Contract.
## Call the custom report ForSpreadsheet.
## Otherwise, set report.name to 'standard' and edit the select() calls
## in the loop to use the columns in the standard report.
years     <- c(2013:2022, 'proj')  # 10 seasons, plus projections ("proj")
leagues   <- c('al', 'nl')         # the two league sites scraped below
positions <- c('h', 'p')           # hitters and pitchers
userid      <- 'myusername'        # <-- your CBS Sports user id
password    <- 'mypassword'        # <-- your CBS Sports password
report.name <- 'ForSpreadsheet'    # name of the custom report to pull
## Log in to CBS Sports ----
## Use a realistic browser user-agent string so the site serves the
## standard login form rather than blocking the request.
uastring <- 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36'
login.url <- 'https://www.cbssports.com/login'
## Start a session at the login page.
s <- session(login.url,
             user_agent(uastring))
s
## Grab the first form on the page (the login form).
f <- html_form(s)[[1]]
f
## Fill in the form's userid/password fields with our credentials.
ff <- html_form_set(f,
                    userid = userid,
                    password = password)
ff
## Submit the form. FIX: capture the returned session -- session_submit()
## returns the post-login session, which is what session_jump_to() must
## use in the scraping loop below; the original discarded it.
s <- session_submit(s,
                    form = ff,
                    submit = '_submit')
## Now that you're logged in, loop over year, league, and batter/pitcher,
## scraping one stats table per combination and writing it to data/.
for (year in years) {
  cat(year, '')  # progress indicator
  for (lg in leagues) {
    cat(lg, '')
    for (pos in positions) {
      cat(pos, '')
      ## Map the loop values onto the pieces of the stats URL.
      if (lg == 'al') { site.name <- 'thegbsl' }
      if (lg == 'nl') { site.name <- 'ohdean' }
      if (pos == 'h') { pos.url <- 'C:1B:2B:3B:SS:MI:CI:OF:DH' }
      if (pos == 'p') { pos.url <- 'P' }
      if (year == 'proj') { year.url <- 'as-restofseason' }
      if (year != 'proj') { year.url <- year }
      url <- paste0('https://',
                    site.name, '.baseball.cbssports.com/stats/stats-main/all:',
                    pos.url, '/',
                    year.url, ':p/',
                    report.name, '?print_rows=9999')
      ## Go to the desired page using the logged-in session.
      page <- session_jump_to(s, url)
      ## Extract all tables on the page; the stats table is the second
      ## element of the resulting list.
      d <- page %>%
        html_table(header = TRUE)
      dd <- d[[2]] %>%
        as.data.frame()
      ## The first data row holds the real column names: promote it to
      ## colnames, then drop it.
      colnames(dd) <- dd[1, ]
      dd <- dd[-1, ]
      ## Drop the last row (a footer/totals row, not a player).
      dd <- dd[-nrow(dd), ]
      ## Keep only the columns from the ForSpreadsheet custom report.
      if (pos == 'h') {
        dd <- dd %>%
          select(Player, Avail, Eligible, Salary, Contract,
                 G, AB, H, R, HR, RBI, SB, BB, TB, HP, Rank)
      }
      if (pos == 'p') {
        dd <- dd %>%
          select(Player, Avail, Eligible, Salary, Contract,
                 APP, W, S, H, HR, ER, BB, K, INN, Rank)
      }
      ## Save as e.g. data/al-stats-2013-h.csv
      filename <- paste0('data/',
                         lg, '-stats-',
                         year, '-',
                         pos, '.csv')
      write.csv(dd, file = filename, row.names = FALSE)
    }
  }
}