1. 首先加载包,并创建 getParams 函数(用于从带参数的 URL 中提取 postForm 所需的参数;带参数的 URL 可以从 Firefox 的 Firebug 中复制下来)
library(RCurl)
library(XML)
library(magrittr)
#' Parse a URL query string (or a raw POST body) into a named list.
#'
#' The input is split into `name=value` pairs on `&`, each pair is split on
#' its first `=`, and only then are names and values percent-decoded.
#' Decoding after splitting matters: an encoded `&` (`%26`) or `=` (`%3D`)
#' inside a value must stay part of that value instead of creating extra
#' parameters.
#'
#' @param url A single string: either a full URL with a query part, or just
#'   the `name=value&name=value` text.
#' @param containURL If `TRUE`, everything up to and including the first `?`
#'   is dropped before parsing (when no `?` is present the whole string is
#'   parsed). Defaults to `TRUE` when `url` starts with "http".
#' @return A named list of character strings, one element per pair, in input
#'   order. A pair without `=` yields the pair text as name and `""` as
#'   value. An empty input yields an empty list.
getParams <- function(url,
                      containURL = substr(url, 1, 4) == "http") {
  stopifnot(is.character(url), length(url) == 1L, !is.na(url))
  # Resolve the lazily evaluated default up front.
  force(containURL)

  # Percent-decode one piece. URLdecode() returns raw bytes with no encoding
  # mark; percent-encoded form data is UTF-8 here, so mark it as such rather
  # than converting to a fixed locale encoding (iconv() to GBK returns NA for
  # characters GBK cannot represent and is wrong on non-GBK systems).
  decode_piece <- function(piece) {
    decoded <- utils::URLdecode(piece)
    Encoding(decoded) <- "UTF-8"
    decoded
  }

  query <- url
  if (containURL) {
    qmark <- as.integer(regexpr("?", query, fixed = TRUE))
    if (qmark > 0L) {
      query <- substr(query, qmark + 1L, nchar(query))
    }
  }

  pairs <- strsplit(query, "&", fixed = TRUE)[[1]]
  if (length(pairs) == 0L) {
    return(list())
  }

  # Position of the first "=" in every pair (-1 when absent). as.integer()
  # drops the regexpr attributes so they do not leak through ifelse().
  eq_pos <- as.integer(regexpr("=", pairs, fixed = TRUE))
  has_eq <- eq_pos > 0L
  raw_names <- ifelse(has_eq, substr(pairs, 1L, eq_pos - 1L), pairs)
  raw_values <- ifelse(has_eq, substr(pairs, eq_pos + 1L, nchar(pairs)), "")

  param_names <- vapply(raw_names, decode_piece, character(1),
                        USE.NAMES = FALSE)
  param_values <- vapply(raw_values, decode_piece, character(1),
                         USE.NAMES = FALSE)

  stats::setNames(as.list(param_values), param_names)
}
2. 然后带上百宝箱 curl handle 上路,行走江湖必备
## Account credentials: replace the placeholder password with your own.
user <- "kongdd"
pwd <- "****"

## Browser-like request headers attached to every request on the handle.
myHttpheader <- c(
  "User-Agent" = "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0",
  "Accept" = "text/html,application/xhtml+xml,application/xml,application/json;q=0.9,*/*;q=0.8",
  "Accept-Language" = "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
  "Connection" = "keep-alive",
  "Host" = "cos.name"
)

## One shared curl handle, reused by every request below.
ch <- getCurlHandle()
curlSetOpt(
  .opts = list(
    ssl.verifypeer = FALSE,  # NOTE(review): disables TLS certificate checks
    followlocation = TRUE,
    verbose = TRUE,
    cookiejar = "cookies_cnki.txt",  # alternative: cookiefile = "cookies_cnki.txt"
    httpheader = myHttpheader
  ),
  curl = ch
)

url_login <- "http://cos.name/cn/wp-login.php"
## Step 1: fetch the login page once on the shared handle before posting
## (the original note says this initialises the session cookie).
page <- getURL(url_login, curl = ch)
3. 构造提交的表单的参数
# Raw POST body captured from the browser; it serves as a template whose
# `log` and `pwd` fields are overwritten with the real credentials below.
post1 <- "log=user&pwd=pwd&pTE-gB-H-f-O-c-V=WPZKteIRNC9n9nwYjh758ig-mtHo4KXjrryHb0Ag1SDIlN8TZsEWs7U6qLAUWxhUCPXuED8XoSLBtNqkQSln9ONpRCdW0YfjXEfbGUf-9Echd4sR6YIwQHLfWdLAFVra&wp-submit=%E7%99%BB%E5%BD%95&redirect_to=http%3A%2F%2Fcos.name%2Fcn%2F&testcookie=1&y-v-F-FV-A-MP-U-Rh=10809510.020100101"
params1 <- getParams(post1)
params1[c("log", "pwd")] <- list(user, pwd)

# Other attempt (kept for reference, not executed):
# post2 <- "http://cos.name/cn/wp-admin/admin-ajax.php?action=gdbcRetrieveToken&browserInfo=%7B%22screenWidth%22%3A1920%2C%22screenHeight%22%3A1080%2C%22engine%22%3A24%2C%22features%22%3A95%2C%22mozilla%22%3A%225.0%22%2C%22windows_nt%22%3A%2210.0%22%2C%22wow64%22%3Atrue%2C%22rv%22%3A%2250.0%22%2C%22gecko%22%3A%2220100101%22%2C%22firefox%22%3A%2250.0%22%7D&pTE-gB-H-f-O-c-V=3759969693&requestTime=1480418883349"
# params2 <- getParams(post2)
# params2$requestTime <- as.character(floor(as.numeric(Sys.time())*1000))
#
# postForm("http://cos.name/cn/wp-admin/admin-ajax.php", .params = params2, curl = ch,
# .opt = list(verbose = TRUE),
# Referer = "http://cos.name/cn/wp-login.php", style = "post")

# Show the parameters that will be submitted.
str(params1)
List of 7
$ log : chr "kongdd"
$ pwd : chr "****"
$ pTE-gB-H-f-O-c-V : chr "WPZKteIRNC9n9nwYjh758ig-mtHo4KXjrryHb0Ag1SDIlN8TZsEWs7U6qLAUWxhUCPXuED8XoSLBtNqkQSln9ONpRCdW0YfjXEfbGUf-9Echd4sR6YIwQHLfWdLAFVr"| __truncated__
$ wp-submit : chr "登录"
$ redirect_to : chr "http://cos.name/cn/"
$ testcookie : chr "1"
$ y-v-F-FV-A-MP-U-Rh: chr "10809510.020100101"
4. 登录并打印作者信息;如果没有报错,你的用户名就在 info 中了
## Submit the login form on the shared handle.
## curl options must be passed through `.opts` (exact name): postForm()
## treats every other named argument in `...` as a form field, so the former
## `.opt = ...` and `Referer = ...` arguments were posted as form data
## instead of enabling verbose output and setting the Referer header.
page <- postForm(
  "http://cos.name/cn/wp-login.php",
  .params = params1,
  curl = ch,
  .opts = list(
    verbose = TRUE,
    referer = "http://cos.name/cn/wp-login.php"
  ),
  style = "post"
)

## Login succeeded if the returned page contains the "bbp-logged-in" block;
## the user name appears inside it. Fail with a clear message otherwise
## instead of a bare "subscript out of bounds".
info <- local({
  doc <- htmlParse(page, encoding = "utf-8")
  logged_in <- getNodeSet(doc, "//div[@class='bbp-logged-in']")
  if (length(logged_in) == 0L) {
    stop("Login failed: no <div class='bbp-logged-in'> found in the response.",
         call. = FALSE)
  }
  logged_in[[1]]
})
print(info)
虽然已经能正常登录了,但提交表单时所用的各个参数的含义我还不是很懂。
欢迎相互讨论
https://github.com/kongdd/RCurl_project/