如何过滤掉从HTML文件加载的R中的表中的特定行?

时间:2017-09-28 20:15:54

标签: html r

So I downloaded this webpage, using filtering for Spring 2017 and for College of Business。我将它保存到计算机,然后使用下面的代码将其读入R.这是输出的样子I saved it to computer, then read it into R using code below. Here is what the output looks like目前看起来很糟糕。我想过滤掉V1,V2,V3,V4行,也不想要术语教师名称,课程行。我想过滤掉所有这些。我甚至不会试图询问如何使数据看起来像在网页上那样。只是想先把它们过滤掉。

以下是我的代码:

fle = "C:/Users/joey/Downloads/CourseEvaluationsCOB.HTML"

readHTMLTable(fle)
fle

This resource right here on stack shows a way to filter specific rows, but I haven't the foggiest idea how to apply this to my own data.它使用逻辑索引“d< -d [!(d $ A ==”B“& d $ E == 0),]”会喜欢做这样的事情。我编写此代码的尝试包含在下面:

d<-fle[!(fle$V1=="Term:", fle$V2=="Department:", fle$V1=="Course:" & fle$V2=="Section:"]

2 个答案:

答案 0 :(得分:2)

让我们更多地帮助你。

我正在使用这些套餐:

library(rvest)
library(httr)
library(stringi)
library(hrbrthemes)
library(tidyverse)

我们将使用此功能清理列名称:

mcga <- function(tbl) {

  x <- colnames(tbl)
  x <- tolower(x)
  x <- gsub("[[:punct:][:space:]]+", "_", x)
  x <- gsub("_+", "_", x)
  x <- gsub("(^_|_$)", "", x)
  x <- make.unique(x, sep = "_")

  colnames(tbl) <- x

  tbl

}

由于您可能需要/需要为其他表单组合执行此操作,我们将从主表单页面开始:

eval_pg <- read_html("https://opir.fiu.edu/instructor_eval.asp")

我们最终会抓取表单提交生成的实际数据,但我们需要填写表格&#34;使用选项值,让我们得到它们。

这些是Term的有效参数:

term_nodes <- html_nodes(eval_pg, "select[name='Term'] > option")
data_frame(
  name = html_text(term_nodes),
  id = html_attr(term_nodes, "value")
) -> Terms

Terms
## # A tibble: 42 x 2
##           name    id
##          <chr> <chr>
##  1 Summer 2017  1175
##  2 Spring 2017  1171
##  3   Fall 2016  1168
##  4 Summer 2016  1165
##  5 Spring 2016  1161
##  6   Fall 2015  1158
##  7 Summer 2015  1155
##  8 Spring 2015  1151
##  9   Fall 2014  1148
## 10 Summer 2014  1145
# ... with 32 more rows

这些是Coll的有效参数:

college_nodes <- html_nodes(eval_pg, "select[name='Coll'] > option")
data_frame(
  name = html_text(college_nodes),
  id = html_attr(college_nodes, "value")
) -> Coll

Coll
## # A tibble: 12 x 2
##                                      name    id
##                                     <chr> <chr>
##  1                                    All     %
##  2 Communication, Architecture & the Arts CARTA
##  3            Arts, Sciences & Education   CASE
##  4                               Business CBADM
##  5                Engineering & Computing CENGR
##  6                         Honors College HONOR
##  7       Hospitality & Tourism Management SHMGT
##  8                                    Law  CLAW
##  9              Nursing & Health Sciences  CNHS
## 10            Public Health & Social Work CPHSW
## 11         International & Public Affairs  SIPA
## 12                Undergraduate Education UGRED

像浏览器一样发出请求。该表单使用查询参数创建HTTP GET请求,从而打开一个新的浏览器选项卡/窗口。我们将使用获得的值^^:

以编程方式发出相同的请求
GET("https://opir.fiu.edu/instructor_evals/instr_eval_result.asp",
    query = list(
      Term = "1171",
      Coll = "CBADM",
      Dept = "",
      RefNum = "",
      Crse = "",
      Instr = ""
    )) -> res

report <- content(res, as="parsed", encoding="UTF-8")

report变量包含已解析的HTML / XML文档,其中包含您需要的所有数据。现在,我们将提取&amp;迭代每张桌子,然后立即将它们全部拉出来。这将让我们将元数据与每个表相关联。

我们将使用此辅助向量自动获取元数据字段:

fields <- c("Term:", "Instructor Name:", "Course:", "Department:", "Section:",
            "Ref#:", "Title:", "Completed Forms:")

找到所有表格:

tables_found <- html_nodes(report, xpath=".//table[contains(., 'Term')]")

这会设置一个进度条(操作需要1-2米):

pb <- progress_estimated(length(tables_found))

现在,我们遍历我们找到的每个表。

map(tables_found, ~{

  pb$tick()$print() # increment progress

  tab <- .x # this is just for naming sanity convenience

  # Extract the fields
  # - Iterate over each field string
  # - Find that table cell
  # - Extract the text
  # - Remove the field string
  # - Clean up whitespace

  map(fields, ~{
    html_nodes(tab, xpath=sprintf(".//td[contains(., '%s')]", .x)) %>% 
      html_text(trim = TRUE) %>% 
      stri_replace_first_regex(.x, "") %>% 
      stri_trim_both() %>% 
      as.list() %>% 
      set_names(.x)
  }) %>% 
    flatten() %>% 
    as_data_frame() %>% 
    mcga() -> table_meta

  # Extract the actual table
  # Remove cruft and just get the rows with header and data, turn it back into a table and
  # then make a data frame out of it

  html_nodes(tab, xpath=".//tr[contains(@class, 'question') or contains(@class, 'tableback')]")  %>% 
    as.character() %>% 
    paste0(collapse="") %>% 
    sprintf("<table>%s</table>", .) %>% 
    read_html() %>% 
    html_table(header=TRUE) %>% 
    .[[1]] %>% 
    mcga() -> table_vals

  # you may want to clean up % columns here

  # Associate the table values with the table metadata
  table_meta$values <-  list(table_vals)

  # return the combined table
  table_meta

}) %>% 
  bind_rows() -> scraped_tables # bind them all together

我们现在有一个漂亮,紧凑的嵌套数据框:

glimpse(scraped_tables)
## Observations: 595
## Variables: 9
## $ term            <chr> "1171 - Spring 2017", "1171 - Spring 2017", "1171 - Spring 2017", "1171 - Spring 2017", "1171...
## $ instructor_name <chr> "Elias, Desiree", "Sueiro, Alexander", "Kim, Myung Sub", "Islam, Mohammad Nazrul", "Ling, Ran...
## $ course          <chr> "ACG   2021", "ACG   2021", "ACG   2021", "ACG   2021", "ACG   2021", "ACG   2021", "ACG   20...
## $ department      <chr> "SCHACCOUNT", "SCHACCOUNT", "SCHACCOUNT", "SCHACCOUNT", "SCHACCOUNT", "SCHACCOUNT", "SCHACCOU...
## $ section         <chr> "RVC -1", "U01 -1", "U02 -1", "U03 -1", "U04 -1", "U05 -1", "U06 -1", "U07 -1", "RVC -1", "P8...
## $ ref             <chr> "15164 -1", "15393 -1", "15163 -1", "15345 -1", "15346 -1", "17299 -1", "17300 -1", "33841 -1...
## $ title           <chr> "ACC Decisions", "ACC Decisions", "ACC Decisions", "ACC Decisions", "ACC Decisions", "ACC Dec...
## $ completed_forms <chr> "57", "47", "48", "43", "21", "12", "48", "31", "44", "8", "82", "43", "20", "13", "59", "12"...
## $ values          <list> [<c("Description of course objectives and assignments", "Communication of ideas and informat...

我们可以&#34;不需要&#34;一个&#34;表&#34;一次:

unnest(scraped_tables[1,])
## # A tibble: 8 x 15
##                 term instructor_name     course department section      ref         title completed_forms
##                <chr>           <chr>      <chr>      <chr>   <chr>    <chr>         <chr>           <chr>
## 1 1171 - Spring 2017  Elias, Desiree ACG   2021 SCHACCOUNT  RVC -1 15164 -1 ACC Decisions              57
## 2 1171 - Spring 2017  Elias, Desiree ACG   2021 SCHACCOUNT  RVC -1 15164 -1 ACC Decisions              57
## 3 1171 - Spring 2017  Elias, Desiree ACG   2021 SCHACCOUNT  RVC -1 15164 -1 ACC Decisions              57
## 4 1171 - Spring 2017  Elias, Desiree ACG   2021 SCHACCOUNT  RVC -1 15164 -1 ACC Decisions              57
## 5 1171 - Spring 2017  Elias, Desiree ACG   2021 SCHACCOUNT  RVC -1 15164 -1 ACC Decisions              57
## 6 1171 - Spring 2017  Elias, Desiree ACG   2021 SCHACCOUNT  RVC -1 15164 -1 ACC Decisions              57
## 7 1171 - Spring 2017  Elias, Desiree ACG   2021 SCHACCOUNT  RVC -1 15164 -1 ACC Decisions              57
## 8 1171 - Spring 2017  Elias, Desiree ACG   2021 SCHACCOUNT  RVC -1 15164 -1 ACC Decisions              57
## # ... with 7 more variables: question <chr>, no_response <chr>, excellent <chr>, very_good <chr>, good <chr>, fair <chr>,
## #   poor <chr>

专注于&#34;只是&#34;问题数据:

unnest(scraped_tables[1,]) %>% 
  select(-c(1:8))
## # A tibble: 8 x 7
##                                                   question no_response excellent very_good  good  fair  poor
##                                                      <chr>       <chr>     <chr>     <chr> <chr> <chr> <chr>
## 1         Description of course objectives and assignments        0.0%     64.9%     14.0% 14.0%  3.5%  3.5%
## 2                   Communication of ideas and information        0.0%     56.1%     17.5% 15.8%  5.3%  5.3%
## 3 Expression of expectations for performance in this class        0.0%     63.2%     12.3% 14.0%  8.8%  1.8%
## 4       Availability to assist students in or out of class        3.5%     50.9%     21.1% 10.5% 14.0%  0.0%
## 5                         Respect and concern for students        1.8%     59.6%     10.5% 14.0% 10.5%  3.5%
## 6                        Stimulation of interest in course        1.8%     52.6%     12.3% 17.5%  7.0%  8.8%
## 7                                 Facilitation of learning        0.0%     52.6%     19.3% 10.5% 10.5%  7.0%
## 8                         Overall assessment of instructor        0.0%     54.4%     15.8% 12.3% 14.0%  3.5%

或者,&#34;不需要&#34;他们都:

glimpse(unnest(scraped_tables))
## Observations: 4,760
## Variables: 15
## $ term            <chr> "1171 - Spring 2017", "1171 - Spring 2017", "1171 - Spring 2017", "1171 - Spring 2017", "1171...
## $ instructor_name <chr> "Elias, Desiree", "Elias, Desiree", "Elias, Desiree", "Elias, Desiree", "Elias, Desiree", "El...
## $ course          <chr> "ACG   2021", "ACG   2021", "ACG   2021", "ACG   2021", "ACG   2021", "ACG   2021", "ACG   20...
## $ department      <chr> "SCHACCOUNT", "SCHACCOUNT", "SCHACCOUNT", "SCHACCOUNT", "SCHACCOUNT", "SCHACCOUNT", "SCHACCOU...
## $ section         <chr> "RVC -1", "RVC -1", "RVC -1", "RVC -1", "RVC -1", "RVC -1", "RVC -1", "RVC -1", "U01 -1", "U0...
## $ ref             <chr> "15164 -1", "15164 -1", "15164 -1", "15164 -1", "15164 -1", "15164 -1", "15164 -1", "15164 -1...
## $ title           <chr> "ACC Decisions", "ACC Decisions", "ACC Decisions", "ACC Decisions", "ACC Decisions", "ACC Dec...
## $ completed_forms <chr> "57", "57", "57", "57", "57", "57", "57", "57", "47", "47", "47", "47", "47", "47", "47", "47...
## $ question        <chr> "Description of course objectives and assignments", "Communication of ideas and information",...
## $ no_response     <chr> "0.0%", "0.0%", "0.0%", "3.5%", "1.8%", "1.8%", "0.0%", "0.0%", "0.0%", "0.0%", "0.0%", "2.1%...
## $ excellent       <chr> "64.9%", "56.1%", "63.2%", "50.9%", "59.6%", "52.6%", "52.6%", "54.4%", "66.0%", "59.6%", "66...
## $ very_good       <chr> "14.0%", "17.5%", "12.3%", "21.1%", "10.5%", "12.3%", "19.3%", "15.8%", "23.4%", "23.4%", "23...
## $ good            <chr> "14.0%", "15.8%", "14.0%", "10.5%", "14.0%", "17.5%", "10.5%", "12.3%", "8.5%", "8.5%", "8.5%...
## $ fair            <chr> "3.5%", "5.3%", "8.8%", "14.0%", "10.5%", "7.0%", "10.5%", "14.0%", "0.0%", "6.4%", "2.1%", "...
## $ poor            <chr> "3.5%", "5.3%", "1.8%", "0.0%", "3.5%", "8.8%", "7.0%", "3.5%", "2.1%", "2.1%", "0.0%", "0.0%...

我们也可以在这里处理%

unnest(scraped_tables) %>% 
  mutate_all(~{gsub("%", "", .x)}) %>%
  type_convert() %>% 
  select(-c(1:8))
## # A tibble: 4,760 x 7
##                                                    question no_response excellent very_good  good  fair  poor
##                                                       <chr>       <dbl>     <dbl>     <dbl> <dbl> <dbl> <dbl>
##  1         Description of course objectives and assignments         0.0      64.9      14.0  14.0   3.5   3.5
##  2                   Communication of ideas and information         0.0      56.1      17.5  15.8   5.3   5.3
##  3 Expression of expectations for performance in this class         0.0      63.2      12.3  14.0   8.8   1.8
##  4       Availability to assist students in or out of class         3.5      50.9      21.1  10.5  14.0   0.0
##  5                         Respect and concern for students         1.8      59.6      10.5  14.0  10.5   3.5
##  6                        Stimulation of interest in course         1.8      52.6      12.3  17.5   7.0   8.8
##  7                                 Facilitation of learning         0.0      52.6      19.3  10.5  10.5   7.0
##  8                         Overall assessment of instructor         0.0      54.4      15.8  12.3  14.0   3.5
##  9         Description of course objectives and assignments         0.0      66.0      23.4   8.5   0.0   2.1
## 10                   Communication of ideas and information         0.0      59.6      23.4   8.5   6.4   2.1
## # ... with 4,750 more rows

然后,你可以做一些有趣的事情:

unnest(scraped_tables) %>% 
  mutate_all(~{gsub("%", "", .x)}) %>%
  type_convert() -> scraped_tables

group_by(scraped_tables, course) %>% 
  filter(question == "Description of course objectives and assignments") %>% 
  gather(resp_cat, resp_val, no_response, excellent, very_good, good, fair, poor) %>% 
  mutate(resp_val = resp_val/100) %>%
  mutate(resp_cat = factor(resp_cat, levels=unique(resp_cat))) %>% 
  filter(resp_val > 0) %>%
  ungroup() -> description_df

ggplot(description_df, aes(resp_val)) +
  geom_density(aes(y=..count..), fill="#b2abd288", color="#2d004b") +
  scale_x_percent() +
  scale_y_comma() +
  facet_wrap(~resp_cat) +
  labs(title="Description of course objectives and assignments distribution across courses") +
  theme_ipsum_rc(grid="XY")

enter image description here

<强>更新

magrittr管道起初可能令人生畏。这是一个没有它们的版本:

library(rvest)
library(httr)
library(stringi)
library(hrbrthemes)
library(tidyverse)

mcga <- function(tbl) {

  x <- colnames(tbl)
  x <- tolower(x)
  x <- gsub("[[:punct:][:space:]]+", "_", x)
  x <- gsub("_+", "_", x)
  x <- gsub("(^_|_$)", "", x)
  x <- make.unique(x, sep = "_")

  colnames(tbl) <- x

  tbl

}

eval_pg <- read_html("https://opir.fiu.edu/instructor_eval.asp")

term_nodes <- html_nodes(eval_pg, "select[name='Term'] > option")
data_frame(
  name = html_text(term_nodes),
  id = html_attr(term_nodes, "value")
) -> Terms

Terms

college_nodes <- html_nodes(eval_pg, "select[name='Coll'] > option")
data_frame(
  name = html_text(college_nodes),
  id = html_attr(college_nodes, "value")
) -> Coll

Coll

GET("https://opir.fiu.edu/instructor_evals/instr_eval_result.asp",
    query = list(
      Term = "1171",
      Coll = "CBADM",
      Dept = "",
      RefNum = "",
      Crse = "",
      Instr = ""
    )) -> res

report <- content(res, as="parsed", encoding="UTF-8")

fields <- c("Term:", "Instructor Name:", "Course:", "Department:", "Section:",
            "Ref#:", "Title:", "Completed Forms:")

tables_found <- html_nodes(report, xpath=".//table[contains(., 'Term')]")

pb <- progress_estimated(length(tables_found))
map(tables_found, function(.x) {

  pb$tick()$print()

  tab <- .x

  map(fields, function(.x) {

   tmp_field <- html_nodes(tab, xpath=sprintf(".//td[contains(., '%s')]", .x))
   tmp_field <- html_text(tmp_field, trim = TRUE) 
   tmp_field <- stri_replace_first_regex(tmp_field, .x, "") 
   tmp_field <- stri_trim_both(tmp_field)
   tmp_field <- as.list(tmp_field)
   tmp_field <- set_names(tmp_field, .x)
   tmp_field

  }) -> tmp_meta

  tmp_meta <- flatten(tmp_meta)
  tmp_meta <- as_data_frame(tmp_meta) 

  table_meta <- mcga(tmp_meta) 

  tmp_vals <- html_nodes(tab, xpath=".//tr[contains(@class, 'question') or contains(@class, 'tableback')]")
  tmp_vals <- as.character(tmp_vals)
  tmp_vals <- paste0(tmp_vals, collapse="") 
  tmp_vals <- sprintf("<table>%s</table>", tmp_vals) 
  tmp_vals <- read_html(tmp_vals) 
  tmp_vals <- html_table(tmp_vals, header=TRUE)[[1]]

  table_vals <- mcga(tmp_vals)

  table_meta$values <-  list(table_vals)

  table_meta

}) -> list_of_tables

scraped_tables <- bind_rows(list_of_tables)

glimpse(scraped_tables)

unnest(scraped_tables[1,])

tmp_df <- unnest(scraped_tables[1,])
select(tmp_df, -c(1:8))

glimpse(unnest(scraped_tables))

tmp_df <- unnest(scraped_tables)
tmp_df <- mutate_all(tmp_df, function(.x) { gsub("%", "", .x) })
scraped_tables <- type_convert(tmp_df)

(我删除了评论,因为它们的部分仍然相同)

答案 1 :(得分:1)

不确定您的预期结果是什么,但看起来您需要进行一些模式匹配,因为这些列中的字符串包含的文本不是您在示例中使用的文本。类似的东西:

library(stringr)
d <- fle[!(str_detect(fle$V1, "Term:") | str_detect(fle$V2, "Department:") |
str_detect(fle$V1, "Course:") | str_detect(fle$V2, "Section:")), ]