使用R语言中的httr包进行POST请求

4

我想使用httr从以下网址获取POST请求的输出:

http://www.e-grunt.ba

当您单击“ZK Ulošci”时,您可以看到提交表单。
在那里,我想发送POST请求并获取输出。例如,您可以从下拉列表中任选一项,在“Broj Uloška”字段中输入1,然后单击“Traži”。
这是我的尝试:
library(httr)
library(tidyverse)
library(rvest)

    # Submit the search form as a plain form-encoded POST and keep the response.
    # NOTE(review): no `javax.faces.ViewState` field is sent here, whereas the
    # requests in the answer further down all include one -- presumably that is
    # why only the home page comes back; confirm.
    output <- httr::POST(
      url = "http://www.e-grunt.ba/home.jsf",
      httr::add_headers(Referer = "http://www.e-grunt.ba/"),
      httr::verbose(),
      encode = "form",
      body = list(
        `form:court_focus` = "440",
        `form:cuTransferLast` = "17.07.2019",
        `form:municipality_input` = "4400000001",
        `form:mpart_focus` = "44000087",
        `form:folder` = 1,
        # Placeholder value; per the comments below the real token changes
        # with every request.
        `recaptcha-token` = "some token",
        submit = "form:j_idt61"
      )
    )

但是这只返回主页的内容。

我知道使用(R)Selenium更容易,但如果可能的话,我想使用 httr 和 POST 完成它。


1
是的,有验证码。我在POST请求体中有一个recaptcha-token参数。但它会随着每个请求而改变,所以我无法设置固定的密钥。 - Mislav
验证码的目的可能是为了防止网站被爬取。您是否检查过该网站的条款和条件? - QHarr
2
他们没有条款和条件,所以我认为爬取该网站是可以的。其次,它是一个公开的登记簿。第三,有法律规定,政府公开发布的信息属于公共信息。也许他们设置验证码只是为了防止同时出现过多请求。 - Mislav
2
你在这个网站上不会得到太多帮助。首先,验证码清楚地表明该网站不允许爬取数据。其次,如果你声称数据是公共记录,那么你应该直接从数据源请求所需的完整数据集,而不是试图规避他们的网站保护措施。 - Adam Sampson
1
正如之前所说,您将无法通过验证码。如果您有兴趣,我也可以向您解释您的方法中的其他问题,但这只是为了学习,我不会给您提供解决方案。即使我能够提供解决方案,我也不会这样做,因为如果提供者实施了这样的安全系统,那么正是为了防止您正在尝试的行为。 - Chelmy88
显示剩余2条评论
1个回答

0

我已经找到了一种方法来抓取这个ASP.net网站。如果有人需要类似的代码,我会提供给他们:

# Open a browsing session on the site, then send the partial (AJAX) request of
# the `j_idt8:j_idt15` component asking for the "content" region to be rendered.
# NOTE(review): presumably this is the request the page issues when the
# "ZK Ulošci" entry is clicked -- confirm against the browser's network log.
#
# Returns the session produced by the POST, with the ViewState value read from
# the landing page attached as the "viewState" attribute.
start_session <- function() {
  landing <- html_session(
    "http://www.e-grunt.ba",
    user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36")
  )

  # NOTE(review): the ViewState is taken from the *second* <input> on the
  # page; this is positional, so verify it if the page markup changes.
  page_inputs <- html_nodes(landing, "input")
  viewState <- html_attr(page_inputs[[2]], "value")

  ajax_fields <- list(
    "javax.faces.partial.ajax" = "true",
    "javax.faces.source" = "j_idt8:j_idt15",
    "javax.faces.partial.execute" = "@all",
    "javax.faces.partial.render" = "content",
    "j_idt8:j_idt15" = "j_idt8:j_idt15",
    "j_idt8" = "j_idt8",
    "javax.faces.ViewState" = viewState
  )

  session <- rvest:::request_POST(
    landing,
    "http://www.e-grunt.ba/home.jsf",
    add_headers(Referer = "http://www.e-grunt.ba"),
    body = ajax_fields,
    encode = "form"
  )

  # Carry the token along with the session so later requests can reuse it.
  attr(session, "viewState") <- viewState
  session
}

# EXTRACT METADATA --------------------------------------------------------

# Open the session; `p` is the session object passed to the requests that
# follow and is re-assigned with each response.
p <- start_session()
# Read the children of the node(s) matched by `css` into a two-column data
# frame: the child's `value` attribute and its text.
#
# Args:
#   html:   anything `read_html()` accepts (callers pass raw response content).
#   css:    CSS selector of the parent node(s) whose children are read.
#   cnames: character vector of length 2 giving the output column names.
#
# Returns a data frame with one row per child, excluding children whose value
# is "-1" (presumably the dropdown's placeholder entry).
name_value_pairs <- function(html, css, cnames) {
  # Parse once and reuse the same node set for both columns.
  entries <- html_children(html_nodes(read_html(html), css))
  ids <- html_attr(entries, "value")
  labels <- html_text(entries)

  df <- data.frame(ids, labels, stringsAsFactors = FALSE)

  # `html_attr()` yields NA for a child without a `value` attribute. A bare
  # `ids != -1` mask is then NA for that child, and `[` turns an NA mask entry
  # into an all-NA row, discarding the child's text. Keep such rows intact and
  # drop only the "-1" entries.
  keep <- is.na(ids) | ids != "-1"
  df <- df[keep, , drop = FALSE]

  colnames(df) <- cnames
  df
}
# Court dropdown entries (id + label) read from the session's response body.
courts <- name_value_pairs(
  p$response$content,
  css = '[id="form:court_input"]',
  cnames = c("court_id", "court")
)

# Send the "change" AJAX event of the `form:court` component for court `id`,
# asking the server to re-render the municipality, mpart and cuTransferLast
# fields (plus the message areas).
#
# Args:
#   session_zk: session object the request is sent through.
#   view_state: value for the `javax.faces.ViewState` field.
#   id:         value for the `form:court_input` field.
#
# Returns the result of `rvest:::request_POST()`; callers read
# `$response$content` from it.
#
# NOTE(review): a second function named `metadata_post` is defined further
# down in this file and replaces this one once it has been evaluated.
metadata_post <- function(session_zk, view_state, id) {
  change_event <- list(
    "javax.faces.partial.ajax" = "true",
    "javax.faces.source" = "form:court",
    "javax.faces.partial.execute" = "form:court",
    "javax.faces.partial.render" = "msgs msgsBottom form:municipality form:mpart form:cuTransferLast",
    "javax.faces.behavior.event" = "change",
    "javax.faces.partial.event" = "change",
    "form" = "form",
    "g-recaptcha-response" = "",
    "form:court_focus" = "",
    "form:court_input" = id,
    "form:cuTransferLast" = "",
    "form:municipality_focus" = "",
    "form:mpart_focus" = "",
    "form:folder" = "",
    "form:parcel" = "",
    "form:parcelSub" = "",
    "javax.faces.ViewState" = view_state
  )

  rvest:::request_POST(
    session_zk,
    "http://www.e-grunt.ba/home.jsf",
    add_headers(Referer = "http://www.e-grunt.ba"),
    body = change_event,
    encode = "form"
  )
}

# Send the "change" AJAX event of the `form:municipality` component for
# municipality `muni_id` within court `id`, asking the server to re-render the
# mpart field (plus the message areas).
#
# Args:
#   session_zk: session object the request is sent through.
#   view_state: value for the `javax.faces.ViewState` field.
#   id:         value for the `form:court_input` field.
#   muni_id:    value for the `form:municipality_input` field.
#
# Returns the result of `rvest:::request_POST()`; callers read
# `$response$content` from it.
muni_post <- function(session_zk, view_state, id, muni_id) {
  change_event <- list(
    "javax.faces.partial.ajax" = "true",
    "javax.faces.source" = "form:municipality",
    "javax.faces.partial.execute" = "form:municipality",
    "javax.faces.partial.render" = "msgs msgsBottom form:mpart",
    "javax.faces.behavior.event" = "change",
    "javax.faces.partial.event" = "change",
    "form" = "form",
    "g-recaptcha-response" = "",
    "form:court_focus" = "",
    "form:court_input" = id,
    "form:cuTransferLast" = "",
    "form:municipality_focus" = "",
    "form:municipality_input" = muni_id,
    "form:mpart_focus" = "",
    "form:folder" = "",
    "form:parcel" = "",
    "form:parcelSub" = "",
    "javax.faces.ViewState" = view_state
  )

  rvest:::request_POST(
    session_zk,
    "http://www.e-grunt.ba/home.jsf",
    add_headers(Referer = "http://www.e-grunt.ba"),
    body = change_event,
    encode = "form"
  )
}


# Build the court -> municipality -> mpart ("ko") lookup table by replaying
# the form's dependent-dropdown requests, one court at a time.

# Return `df` unchanged, or a single all-NA row with the same columns when
# `df` has no rows. `cbind.data.frame()` stops with "differing number of rows"
# when a one-row frame is combined with a zero-row one, so every empty
# dropdown is normalised through this helper before binding.
na_row_if_empty <- function(df) {
  if (nrow(df) > 0) {
    return(df)
  }
  filler <- as.list(rep(NA_character_, ncol(df)))
  names(filler) <- colnames(df)
  as.data.frame(filler, stringsAsFactors = FALSE)
}

# One slot per court, allocated up front instead of growing the list.
metadata_i <- vector("list", nrow(courts))
for (i in seq_along(courts$court_id)) {
  print(i)
  p <- metadata_post(p, attributes(p)$viewState, courts$court_id[i])
  muni <- name_value_pairs(
    p$response$content,
    css = '[id="form:municipality_input"]',
    cnames = c("muni_id", "muni")
  )

  if (nrow(muni) > 1) {
    # Several municipalities: query the mpart list of each one separately.
    muni_ko <- vector("list", nrow(muni))
    for (j in seq_along(muni$muni_id)) {
      p <- muni_post(
        p, attributes(p)$viewState, courts$court_id[i], muni$muni_id[j]
      )
      ko <- na_row_if_empty(name_value_pairs(
        p$response$content,
        css = '[id="form:mpart_input"]',
        cnames = c("ko_id", "ko")
      ))
      muni_ko[[j]] <- cbind.data.frame(
        muni[j, ], ko,
        stringsAsFactors = FALSE
      )
    }
    metadata_i[[i]] <- cbind.data.frame(
      courts[i, ], do.call(rbind, muni_ko),
      stringsAsFactors = FALSE
    )
  } else {
    # Zero or one municipality: the mpart list is read from the court-change
    # response itself. Either table may be empty, so both are padded.
    ko <- na_row_if_empty(name_value_pairs(
      p$response$content,
      css = '[id="form:mpart_input"]',
      cnames = c("ko_id", "ko")
    ))
    metadata_i[[i]] <- cbind.data.frame(
      courts[i, ], na_row_if_empty(muni), ko,
      stringsAsFactors = FALSE
    )
  }
}
metadata <- do.call(rbind, metadata_i)

# Submit the search form itself (a full, non-AJAX POST that includes the
# `form:j_idt61` field).
#
# NOTE(review): this re-uses the name `metadata_post`, replacing the earlier
# three-argument function of the same name once evaluated.
#
# Args:
#   session_zk: session object the request is sent through.
#   view_state: value for the `javax.faces.ViewState` field.
#   recaptcha:  value for the `g-recaptcha-response` field.
#   court:      value for the `form:court_input` field.
#   date:       value for the `form:cuTransferLast` field; defaults to the date
#               four days before today, formatted as dd.mm.yyyy.
#   muni:       value for the `form:municipality_input` field.
#   ko:         value for the `form:mpart_input` field.
#   zk:         value for the `form:folder` field.
#
# Returns the result of `rvest:::request_POST()`.
metadata_post <- function(session_zk, view_state, recaptcha, court,
                          date = as.character(format.Date(Sys.Date() - 4, "%d.%m.%Y")),
                          muni, ko, zk
) {
  search_fields <- list(
    "form" = "form",
    "g-recaptcha-response" = recaptcha,
    "form:court_focus" = "",
    "form:court_input" = court,
    "form:cuTransferLast" = date,
    "form:municipality_focus" = "",
    "form:municipality_input" = muni,
    "form:mpart_focus" = "",
    "form:mpart_input" = ko,
    "form:folder" = zk,
    "form:parcel" = "",
    "form:parcelSub" = "",
    "form:j_idt61" = "",
    "javax.faces.ViewState" = view_state
  )

  rvest:::request_POST(
    session_zk,
    "http://www.e-grunt.ba/home.jsf",
    add_headers(Referer = "http://www.e-grunt.ba"),
    body = search_fields,
    encode = "form"
  )
}

# example
# NOTE(review): `break_captcha()` is not defined anywhere in this file; its
# result is sent as the `g-recaptcha-response` field of the search request.
#
# Choose the lookup row and the folder number explicitly rather than reading
# the `i`/`j` indices left behind by the metadata loops: `i` counted courts
# (not rows of `metadata`), and `j` only exists if some court had more than
# one municipality.
row_idx <- 1L
folder_no <- 1L

result <- break_captcha()
p <- metadata_post(
  session_zk = p,
  view_state = attributes(p)$viewState,
  recaptcha = result,
  court = metadata$court_id[row_idx],
  muni = metadata$muni_id[row_idx],
  ko = metadata$ko_id[row_idx],
  zk = folder_no
)

result <- break_captcha() 怎么样? - BBB

网页内容由stack overflow 提供, 点击上面的
可以查看英文原文,
原文链接