在R中使用2个不同的ID进行Web抓取

时间:2017-09-09 14:54:39

标签: r xml loops web-scraping

我可以成功地一次抓取一个id。作为R的新手,也许把它们分别抓取再合并起来更简单,但我真的想知道能否写一个循环让它自动完成。各个id对应的网页格式相同,只是每个id返回的数据长度不同(但在这种情况下这不是问题)。

如果我只抓取一个id,这是我的代码(S1的结构在代码之后给出):

library(XML)
library(rvest)
library(plyr)
library(dplyr)
library(httr)
library(data.table)
library(pipeR)
library(xml2)

# Scrape the schedule for a single id ("S1") and flatten the XML response
# into a data frame with one row per <schedule> entry.
url <- "XXXXXXXXXXX"
session <- html_session(url)
# Fill in the first form found on the page.
form <- html_form(read_html(url))[[1]]
filled_form <- set_values(form,
                          "id" = "S1",
                          "start" = "2017-01-17",
                          "end" = "2017-02-03",
                          "Password" = "lll")
s <- submit_form(session, filled_form)
z <- read_xml(s$response)
z1 <- as_list(z)
z2 <- z1[which(names(z1) == "scheduleList")]

# Element 1 of the schedule list is the <teacher> node; every later element
# is one <schedule> entry.
schedule_list <- z2[[1]]
teacher_node <- schedule_list[[1]]

# seq_along(x)[-1] is empty when there are no schedule entries, whereas
# 2:length(x) would count backwards (2, 1) and index a missing element.
entry_idx <- seq_along(schedule_list)[-1]

# Build every row first and bind once, instead of growing `result` with
# rbind() on each iteration.
rows <- lapply(entry_idx, function(i) {
  schedule_node <- schedule_list[[i]]
  score_node <- schedule_node[[1]]
  class_node <- score_node[[1]]
  department_node <- class_node[[3]]
  cbind(
    teacher = teacher_node[[1]][[1]],
    t_id = attr(teacher_node, "id"),
    Date = attr(schedule_node, "date"),
    class = class_node[[1]][[1]],
    c_id = attr(class_node, "id"),
    c_status = attr(class_node, "status"),
    score = attr(score_node, "id"),
    People = class_node[[2]][[1]],
    department = department_node[[1]][[1]],
    d_id = attr(department_node, "id")
  )
})

# Keep the original "empty data frame" result when nothing was returned.
result <- if (length(rows) > 0) {
  as.data.frame(do.call(rbind, rows), stringsAsFactors = FALSE)
} else {
  data.frame()
}

S1的结构如下(这是解析后得到的R列表;网站返回的原始XML在其后给出):

structure(list(
scheduleList = structure(list(
teacher = structure(list(name = list("Mary")), .Names = "name", id = "S1"), 
schedule = structure(list(
score = structure(list(
class = structure(list(name = list("312c"), people = list("129"), 
department = structure(list(name = list("English")), .Names = "name", id = "302f")), 
.Names = c("name", "people", "department"), id = "312", status = "-4")), 
.Names = "class", id = "1")), 
.Names = "score", date = "2017-01-18"), 
schedule = structure(list(
score = structure(list(
class = structure(list(name = list("316c"), people = list("87"), 
department = structure(list(name = list("English")), .Names = "name", id = "302f")), 
.Names = c("name", "people", "department"), id = "316", status = "-2")), 
.Names = "class", id = "2")), 
.Names = "score", date = "2017-01-30")), 
.Names = c("teacher", "schedule", "schedule"), from = "2017-01-17", to = "2017-02-03")), 
.Names = "scheduleList")

来自网站的S1 XML格式(我尝试编写的循环在其后给出):

<result status="success">
  <code>1</code>
  <note>success</note>
  <scheduleList from="2017-01-17" to="2017-02-03">
    <teacher id="S1">
      <name>Mary</name>
    </teacher>
    <schedule date="2017-01-18">
      <score id="1">
        <class id="312" status="-4">
          <name>312C</name>
          <people>129</people>
          <department id="302f">
            <name>English</name>
          </department>
        </class>
      </score>
    </schedule>
    <schedule date="2017-01-30">
      <score id="2">
        <class id="316" status="-2">
          <name>316c</name>
          <people>87</people>
          <department id="302f">
            <name>English</name>
          </department>
        </class>
      </score>
    </schedule>
  </scheduleList>
</result>

当我用低效的方式(手动对每个id各做一次)时,它完美地工作了。下面是我试图编写的循环,以及它产生的错误:

    # Attempted loop that submits the form once per id. Within this snippet
    # `filled_form` and `s` are assigned by index inside the loop without being
    # created beforehand, which matches the "object 'filled_form' not found"
    # error reported below.
    url <- "XXXXXXXXXXX"
        session <-html_session(url)
        form  <-html_form(read_html(url))[[1]]
        for (i in 1:2){
          # ids to request; this vector is rebuilt on every iteration
          d=c("S1","S2")
          # indexed assignment into an object that does not exist yet
          filled_form[i] <- set_values(form,
                                       "id" = d[i],
                                       "start" = "2017-01-17",
                                       "end" = "2017-02-03",
                                       "Password" = "lll")
          # `filled_form[i]` is single-bracket subsetting, i.e. a sub-list
          # rather than the stored element itself
          s[i] <- submit_form(session,filled_form[i])
        }

    Error in filled_form[i] <- set_values(form, id = d[i],  : 
      object 'filled_form' not found

1 个答案:

答案 0 :(得分:2)

在向对象中存入值之前,您需要先在代码中创建 `filled_form` 和 `s` 这两个对象。您在原始(单个id的)代码中创建了它们,但在循环版本中没有。

# Submit the same form once per id and keep every filled form and response.
url <- "XXXXXXXXXXX"
session <- html_session(url)
# Use the first form found on the page as the template.
form <- html_form(read_html(url))[[1]]

# Ids to request; defined once, outside the loop.
d <- c("S1", "S2")

# Create the containers before the loop so that indexed assignment has an
# existing object to write into.
filled_form <- vector("list", length(d))
s <- vector("list", length(d))

for (i in seq_along(d)) {
  filled_form[[i]] <- set_values(form,
                                 "id" = d[i],
                                 "start" = "2017-01-17",
                                 "end" = "2017-02-03",
                                 "Password" = "lll")
  # `[[i]]` extracts the form object itself; `[i]` would hand submit_form()
  # a one-element list wrapping the form.
  s[[i]] <- submit_form(session, filled_form[[i]])
}