我已经找到了抓取这个 ASP.NET 网站的方法。如果有人需要类似的代码,我把它贴在下面:
# Open an rvest session on the e-grunt.ba landing page, read the page's
# javax.faces.ViewState token, and send the initial partial-AJAX POST to
# home.jsf (source "j_idt8:j_idt15", partial.render = "content").
#
# Returns: the session object returned by the POST, with the token stored in
# its "viewState" attribute so later requests can send it back.
# Raises: an error if no ViewState value can be read from the landing page.
start_session <- function() {
  p <- html_session(
    "http://www.e-grunt.ba",
    user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36")
  )

  # Look the hidden field up by the name it is posted back under instead of
  # relying on it being the second <input> on the page; an extra input would
  # otherwise silently yield the wrong token.
  state_nodes <- html_nodes(p, 'input[name="javax.faces.ViewState"]')
  viewState <- if (length(state_nodes) > 0) {
    html_attr(state_nodes[[1]], "value")
  } else {
    # Backward-compatible fallback: the original positional pick.
    html_attr(html_nodes(p, "input")[[2]], "value")
  }
  if (length(viewState) != 1 || is.na(viewState)) {
    stop(
      "Could not read javax.faces.ViewState from the landing page.",
      call. = FALSE
    )
  }

  p <- rvest:::request_POST(
    p,
    "http://www.e-grunt.ba/home.jsf",
    add_headers(
      "Referer" = "http://www.e-grunt.ba"
    ),
    body = list(
      "javax.faces.partial.ajax" = "true",
      "javax.faces.source" = "j_idt8:j_idt15",
      "javax.faces.partial.execute" = "@all",
      "javax.faces.partial.render" = "content",
      "j_idt8:j_idt15" = "j_idt8:j_idt15",
      "j_idt8" = "j_idt8",
      "javax.faces.ViewState" = viewState
    ),
    encode = "form"
  )

  # Keep the token on the session object; callers read it via attributes(p).
  attr(p, "viewState") <- viewState
  p
}
# EXTRACT METADATA
# Open the scraping session; `p` is passed to, and reassigned by, the requests
# that follow.
p <- start_session()
# Extract (value attribute, text) pairs from the children of the nodes that
# match `css` in `html`.
#
# html   - anything read_html() accepts (the callers pass a raw response body).
# css    - CSS selector for the parent node(s) whose children are read.
# cnames - length-2 character vector: column names for the value and the text.
#
# Returns a data.frame with one row per child. Rows whose value is -1 are
# dropped (presumably the "nothing selected" placeholder - verify against the
# page), as are children that have no `value` attribute at all.
name_value_pairs <- function(html, css, cnames) {
  # Parse once and reuse the node set for both columns.
  children <- read_html(html) %>%
    html_nodes(css) %>%
    html_children()

  df <- data.frame(
    x = html_attr(children, "value"),
    y = html_text(children),
    stringsAsFactors = FALSE
  )

  # html_attr() yields NA for a missing attribute; an NA in a logical row
  # subscript would inject an all-NA row, so exclude those explicitly.
  keep <- !is.na(df[[1]]) & df[[1]] != -1
  df <- df[keep, , drop = FALSE]

  colnames(df) <- cnames
  df
}
# Court id/label pairs, read from the children of the "form:court_input"
# element in the session's latest response body.
courts <- name_value_pairs(p$response$content, css = '[id="form:court_input"]', cnames = c("court_id", "court"))
# POST a "change" event for the court field ("form:court") to home.jsf.
#
# session_zk - session object to send the request through.
# view_state - value sent as javax.faces.ViewState.
# id         - value sent as form:court_input.
#
# Returns the session object produced by the POST.
metadata_post <- function(session_zk, view_state, id) {
  form_fields <- list(
    "javax.faces.partial.ajax" = "true",
    "javax.faces.source" = "form:court",
    "javax.faces.partial.execute" = "form:court",
    "javax.faces.partial.render" = "msgs msgsBottom form:municipality form:mpart form:cuTransferLast",
    "javax.faces.behavior.event" = "change",
    "javax.faces.partial.event" = "change",
    "form" = "form",
    "g-recaptcha-response" = "",
    "form:court_focus" = "",
    "form:court_input" = id,
    "form:cuTransferLast" = "",
    "form:municipality_focus" = "",
    "form:mpart_focus" = "",
    "form:folder" = "",
    "form:parcel" = "",
    "form:parcelSub" = "",
    "javax.faces.ViewState" = view_state
  )

  rvest:::request_POST(
    session_zk,
    "http://www.e-grunt.ba/home.jsf",
    add_headers(Referer = "http://www.e-grunt.ba"),
    body = form_fields,
    encode = "form"
  )
}
# POST a "change" event for the municipality field ("form:municipality") to
# home.jsf, with the court already filled in.
#
# session_zk - session object to send the request through.
# view_state - value sent as javax.faces.ViewState.
# id         - value sent as form:court_input.
# muni_id    - value sent as form:municipality_input.
#
# Returns the session object produced by the POST.
muni_post <- function(session_zk, view_state, id, muni_id) {
  form_fields <- list(
    "javax.faces.partial.ajax" = "true",
    "javax.faces.source" = "form:municipality",
    "javax.faces.partial.execute" = "form:municipality",
    "javax.faces.partial.render" = "msgs msgsBottom form:mpart",
    "javax.faces.behavior.event" = "change",
    "javax.faces.partial.event" = "change",
    "form" = "form",
    "g-recaptcha-response" = "",
    "form:court_focus" = "",
    "form:court_input" = id,
    "form:cuTransferLast" = "",
    "form:municipality_focus" = "",
    "form:municipality_input" = muni_id,
    "form:mpart_focus" = "",
    "form:folder" = "",
    "form:parcel" = "",
    "form:parcelSub" = "",
    "javax.faces.ViewState" = view_state
  )

  rvest:::request_POST(
    session_zk,
    "http://www.e-grunt.ba/home.jsf",
    add_headers(Referer = "http://www.e-grunt.ba"),
    body = form_fields,
    encode = "form"
  )
}
# Build the court -> municipality -> "ko" (form:mpart_input) lookup table.
# For every court, select it and read the municipality list from the response.
# With more than one municipality, each is selected in turn to obtain its ko
# list; otherwise the ko list is read straight from the court response.
# A missing municipality or ko list is represented by a single NA row so the
# court still appears in the result instead of aborting the run.
# Preallocate the per-court result slots instead of growing the list.
metadata_i <- vector("list", nrow(courts))
for (i in seq_along(courts$court_id)) {
  print(i)
  p <- metadata_post(p, attributes(p)$viewState, courts$court_id[i])
  muni <- name_value_pairs(p$response$content, css = '[id="form:municipality_input"]', cnames = c("muni_id", "muni"))
  if (nrow(muni) > 1) {
    muni_ko <- vector("list", nrow(muni))
    for (j in seq_along(muni$muni_id)) {
      p <- muni_post(p, attributes(p)$viewState, courts$court_id[i], muni$muni_id[j])
      ko <- name_value_pairs(p$response$content, css = '[id="form:mpart_input"]', cnames = c("ko_id", "ko"))
      if (nrow(ko) == 0) {
        ko <- data.frame(ko_id = NA, ko = NA, stringsAsFactors = FALSE)
      }
      muni_ko[[j]] <- cbind.data.frame(muni[j, ], ko, stringsAsFactors = FALSE)
    }
    metadata_i[[i]] <- cbind.data.frame(courts[i, ], do.call(rbind, muni_ko), stringsAsFactors = FALSE)
  } else {
    ko <- name_value_pairs(p$response$content, css = '[id="form:mpart_input"]', cnames = c("ko_id", "ko"))
    # cbind-ing a one-row frame with a zero-row frame errors; use the same NA
    # placeholder the multi-municipality branch uses.
    if (nrow(muni) == 0) {
      muni <- data.frame(muni_id = NA, muni = NA, stringsAsFactors = FALSE)
    }
    if (nrow(ko) == 0) {
      ko <- data.frame(ko_id = NA, ko = NA, stringsAsFactors = FALSE)
    }
    meta <- cbind.data.frame(courts[i, ], muni, stringsAsFactors = FALSE)
    metadata_i[[i]] <- cbind.data.frame(meta, ko, stringsAsFactors = FALSE)
  }
}
# One row per (court, municipality, ko) combination.
metadata <- do.call(rbind, metadata_i)
# POST the filled-in search form to home.jsf.
#
# Note: this definition replaces the earlier three-argument metadata_post()
# once it has been evaluated.
#
# session_zk - session object to send the request through.
# view_state - value sent as javax.faces.ViewState.
# recaptcha  - value sent as g-recaptcha-response.
# court      - value sent as form:court_input.
# date       - value sent as form:cuTransferLast; defaults to four days before
#              today, formatted dd.mm.yyyy.
# muni       - value sent as form:municipality_input.
# ko         - value sent as form:mpart_input.
# zk         - value sent as form:folder.
#
# Returns the session object produced by the POST.
metadata_post <- function(session_zk, view_state, recaptcha, court,
                          date = as.character(format.Date(Sys.Date() - 4, "%d.%m.%Y")),
                          muni, ko, zk) {
  form_fields <- list(
    "form" = "form",
    "g-recaptcha-response" = recaptcha,
    "form:court_focus" = "",
    "form:court_input" = court,
    "form:cuTransferLast" = date,
    "form:municipality_focus" = "",
    "form:municipality_input" = muni,
    "form:mpart_focus" = "",
    "form:mpart_input" = ko,
    "form:folder" = zk,
    "form:parcel" = "",
    "form:parcelSub" = "",
    "form:j_idt61" = "",
    "javax.faces.ViewState" = view_state
  )

  rvest:::request_POST(
    session_zk,
    "http://www.e-grunt.ba/home.jsf",
    add_headers(Referer = "http://www.e-grunt.ba"),
    body = form_fields,
    encode = "form"
  )
}
# example
# Submit one search form using a solved reCAPTCHA token.
# NOTE(review): break_captcha() is not defined in this section - presumably
# supplied elsewhere; confirm before running.
result <- break_captcha()
# NOTE(review): `i` and `j` hold whatever values the metadata loop left behind.
# `i` iterated over rows of `courts` but indexes `metadata` here, and `j` is
# sent as the folder number (zk) - confirm these are the intended values.
p <- metadata_post(session_zk = p, view_state = attributes(p)$viewState,
recaptcha = result, court = metadata$court_id[i],
muni = metadata$muni_id[i], ko = metadata$ko_id[i], zk = j)
recaptcha-token
参数。但它会随着每个请求而改变,所以我无法设置固定的密钥。 - Mislav