10.3 Scraping pages that require log-in
Here is an example of scraping CBS Sports fantasy baseball stats. To use this you would have to create a CBS Sports log-in, and possibly a fantasy baseball league, but you should be able to easily modify it for another page that you have a log-in for. The main steps come before the for loop: setting `userid`, `password`, `uastring`, and `login.url`, and then using `session()`, `html_form()`, `html_form_set()`, and `session_submit()`.
Code
library(rvest)
library(httr)
library(tidyverse)
## Settings ----
## You will need to change `userid` and `password` below.
## I recommend creating a Custom Report on the CBS player stats page
## that has the same columns as those selected in the loop further down.
## You'll have to do it four times (batting and pitching, one for each
## league), which is annoying, but worth it to scrape Salary and Contract.
## Call the custom report ForSpreadsheet.
## Otherwise, set report.name to 'standard' and edit the select() calls
## in the loop to use the columns in the standard report.
years     <- c(2013:2022, 'proj')  # 10 seasons, plus projections ("proj")
leagues   <- c('al', 'nl')         # the two league sites scraped below
positions <- c('h', 'p')           # hitters and pitchers
userid      <- 'myusername'        # <-- your CBS Sports user id
password    <- 'mypassword'        # <-- your CBS Sports password
report.name <- 'ForSpreadsheet'    # name of the custom report to pull
## Log in to CBS Sports ----
## Use a realistic browser user-agent string so the site serves the
## standard login form rather than blocking the request.
uastring <- 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36'
login.url <- 'https://www.cbssports.com/login'
## Start a session at the login page.
s <- session(login.url,
             user_agent(uastring))
s
## Grab the first form on the page (the login form).
f <- html_form(s)[[1]]
f
## Fill in the form's userid/password fields with our credentials.
ff <- html_form_set(f,
                    userid = userid,
                    password = password)
ff
## Submit the form. FIX: capture the returned session -- session_submit()
## returns the post-login session, which is what session_jump_to() must
## use in the scraping loop below; the original discarded it.
s <- session_submit(s,
                    form = ff,
                    submit = '_submit')
## Now that you're logged in, loop over year, league, and batter/pitcher,
## scraping one stats table per combination and writing it to data/.
for (year in years) {
  cat(year, '')  # progress indicator
  for (lg in leagues) {
    cat(lg, '')
    for (pos in positions) {
      cat(pos, '')
      ## Map the loop values onto the pieces of the stats URL.
      if (lg == 'al') { site.name <- 'thegbsl' }
      if (lg == 'nl') { site.name <- 'ohdean' }
      if (pos == 'h') { pos.url <- 'C:1B:2B:3B:SS:MI:CI:OF:DH' }
      if (pos == 'p') { pos.url <- 'P' }
      if (year == 'proj') { year.url <- 'as-restofseason' }
      if (year != 'proj') { year.url <- year }
      url <- paste0('https://',
                    site.name, '.baseball.cbssports.com/stats/stats-main/all:',
                    pos.url, '/',
                    year.url, ':p/',
                    report.name, '?print_rows=9999')
      ## Go to the desired page using the logged-in session.
      page <- session_jump_to(s, url)
      ## Extract all tables on the page; the stats table is the second
      ## element of the resulting list.
      d <- page %>%
        html_table(header = TRUE)
      dd <- d[[2]] %>%
        as.data.frame()
      ## The first data row holds the real column names: promote it to
      ## colnames, then drop it.
      colnames(dd) <- dd[1, ]
      dd <- dd[-1, ]
      ## Drop the last row (a footer/totals row, not a player).
      dd <- dd[-nrow(dd), ]
      ## Keep only the columns from the ForSpreadsheet custom report.
      if (pos == 'h') {
        dd <- dd %>%
          select(Player, Avail, Eligible, Salary, Contract,
                 G, AB, H, R, HR, RBI, SB, BB, TB, HP, Rank)
      }
      if (pos == 'p') {
        dd <- dd %>%
          select(Player, Avail, Eligible, Salary, Contract,
                 APP, W, S, H, HR, ER, BB, K, INN, Rank)
      }
      ## Save as e.g. data/al-stats-2013-h.csv
      filename <- paste0('data/',
                         lg, '-stats-',
                         year, '-',
                         pos, '.csv')
      write.csv(dd, file = filename, row.names = FALSE)
    }
  }
}