So I downloaded this webpage, using filtering for Spring 2017 and for College of Business。我将它保存到计算机,然后使用下面的代码将其读入R.这是输出的样子目前看起来很糟糕。我想过滤掉V1,V2,V3,V4行,也不想要术语教师名称,课程行。我想过滤掉所有这些。我甚至不会试图询问如何使数据看起来像在网页上那样。只是想先把它们过滤掉。
以下是我的代码:
fle = "C:/Users/joey/Downloads/CourseEvaluationsCOB.HTML"
readHTMLTable(fle)
fle
This resource right here on stack shows a way to filter specific rows, but I haven't the foggiest idea how to apply this to my own data.它使用逻辑索引“d< -d [!(d $ A ==”B“& d $ E == 0),]”会喜欢做这样的事情。我编写此代码的尝试包含在下面:
d<-fle[!(fle$V1=="Term:", fle$V2=="Department:", fle$V1=="Course:" & fle$V2=="Section:"]
答案 0 :(得分:2)
让我们更多地帮助你。
我正在使用这些套餐:
library(rvest)
library(httr)
library(stringi)
library(hrbrthemes)
library(tidyverse)
我们将使用此功能清理列名称:
mcga <- function(tbl) {
x <- colnames(tbl)
x <- tolower(x)
x <- gsub("[[:punct:][:space:]]+", "_", x)
x <- gsub("_+", "_", x)
x <- gsub("(^_|_$)", "", x)
x <- make.unique(x, sep = "_")
colnames(tbl) <- x
tbl
}
由于您可能需要/需要为其他表单组合执行此操作,我们将从主表单页面开始:
eval_pg <- read_html("https://opir.fiu.edu/instructor_eval.asp")
我们最终会抓取表单提交生成的实际数据,但我们需要填写表格&#34;使用选项值,让我们得到它们。
这些是Term
的有效参数:
term_nodes <- html_nodes(eval_pg, "select[name='Term'] > option")
data_frame(
name = html_text(term_nodes),
id = html_attr(term_nodes, "value")
) -> Terms
Terms
## # A tibble: 42 x 2
## name id
## <chr> <chr>
## 1 Summer 2017 1175
## 2 Spring 2017 1171
## 3 Fall 2016 1168
## 4 Summer 2016 1165
## 5 Spring 2016 1161
## 6 Fall 2015 1158
## 7 Summer 2015 1155
## 8 Spring 2015 1151
## 9 Fall 2014 1148
## 10 Summer 2014 1145
# ... with 32 more rows
这些是Coll
的有效参数:
college_nodes <- html_nodes(eval_pg, "select[name='Coll'] > option")
data_frame(
name = html_text(college_nodes),
id = html_attr(college_nodes, "value")
) -> Coll
Coll
## # A tibble: 12 x 2
## name id
## <chr> <chr>
## 1 All %
## 2 Communication, Architecture & the Arts CARTA
## 3 Arts, Sciences & Education CASE
## 4 Business CBADM
## 5 Engineering & Computing CENGR
## 6 Honors College HONOR
## 7 Hospitality & Tourism Management SHMGT
## 8 Law CLAW
## 9 Nursing & Health Sciences CNHS
## 10 Public Health & Social Work CPHSW
## 11 International & Public Affairs SIPA
## 12 Undergraduate Education UGRED
像浏览器一样发出请求。该表单使用查询参数创建HTTP GET
请求,从而打开一个新的浏览器选项卡/窗口。我们将使用获得的值^^:
GET("https://opir.fiu.edu/instructor_evals/instr_eval_result.asp",
query = list(
Term = "1171",
Coll = "CBADM",
Dept = "",
RefNum = "",
Crse = "",
Instr = ""
)) -> res
report <- content(res, as="parsed", encoding="UTF-8")
report
变量包含已解析的HTML / XML文档,其中包含您需要的所有数据。现在,我们将提取&amp;迭代每张桌子,然后立即将它们全部拉出来。这将让我们将元数据与每个表相关联。
我们将使用此辅助向量自动获取元数据字段:
fields <- c("Term:", "Instructor Name:", "Course:", "Department:", "Section:",
"Ref#:", "Title:", "Completed Forms:")
找到所有表格:
tables_found <- html_nodes(report, xpath=".//table[contains(., 'Term')]")
这会设置一个进度条(操作需要1-2米):
pb <- progress_estimated(length(tables_found))
现在,我们遍历我们找到的每个表。
map(tables_found, ~{
pb$tick()$print() # increment progress
tab <- .x # this is just for naming sanity convenience
# Extract the fields
# - Iterate over each field string
# - Find that table cell
# - Extract the text
# - Remove the field string
# - Clean up whitespace
map(fields, ~{
html_nodes(tab, xpath=sprintf(".//td[contains(., '%s')]", .x)) %>%
html_text(trim = TRUE) %>%
stri_replace_first_regex(.x, "") %>%
stri_trim_both() %>%
as.list() %>%
set_names(.x)
}) %>%
flatten() %>%
as_data_frame() %>%
mcga() -> table_meta
# Extract the actual table
# Remove cruft and just get the rows with header and data, turn it back into a table and
# then make a data frame out of it
html_nodes(tab, xpath=".//tr[contains(@class, 'question') or contains(@class, 'tableback')]") %>%
as.character() %>%
paste0(collapse="") %>%
sprintf("<table>%s</table>", .) %>%
read_html() %>%
html_table(header=TRUE) %>%
.[[1]] %>%
mcga() -> table_vals
# you may want to clean up % columns here
# Associate the table values with the table metadata
table_meta$values <- list(table_vals)
# return the combined table
table_meta
}) %>%
bind_rows() -> scraped_tables # bind them all together
我们现在有一个漂亮,紧凑的嵌套数据框:
glimpse(scraped_tables)
## Observations: 595
## Variables: 9
## $ term <chr> "1171 - Spring 2017", "1171 - Spring 2017", "1171 - Spring 2017", "1171 - Spring 2017", "1171...
## $ instructor_name <chr> "Elias, Desiree", "Sueiro, Alexander", "Kim, Myung Sub", "Islam, Mohammad Nazrul", "Ling, Ran...
## $ course <chr> "ACG 2021", "ACG 2021", "ACG 2021", "ACG 2021", "ACG 2021", "ACG 2021", "ACG 20...
## $ department <chr> "SCHACCOUNT", "SCHACCOUNT", "SCHACCOUNT", "SCHACCOUNT", "SCHACCOUNT", "SCHACCOUNT", "SCHACCOU...
## $ section <chr> "RVC -1", "U01 -1", "U02 -1", "U03 -1", "U04 -1", "U05 -1", "U06 -1", "U07 -1", "RVC -1", "P8...
## $ ref <chr> "15164 -1", "15393 -1", "15163 -1", "15345 -1", "15346 -1", "17299 -1", "17300 -1", "33841 -1...
## $ title <chr> "ACC Decisions", "ACC Decisions", "ACC Decisions", "ACC Decisions", "ACC Decisions", "ACC Dec...
## $ completed_forms <chr> "57", "47", "48", "43", "21", "12", "48", "31", "44", "8", "82", "43", "20", "13", "59", "12"...
## $ values <list> [<c("Description of course objectives and assignments", "Communication of ideas and informat...
我们可以&#34;不需要&#34;一个&#34;表&#34;一次:
unnest(scraped_tables[1,])
## # A tibble: 8 x 15
## term instructor_name course department section ref title completed_forms
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 1171 - Spring 2017 Elias, Desiree ACG 2021 SCHACCOUNT RVC -1 15164 -1 ACC Decisions 57
## 2 1171 - Spring 2017 Elias, Desiree ACG 2021 SCHACCOUNT RVC -1 15164 -1 ACC Decisions 57
## 3 1171 - Spring 2017 Elias, Desiree ACG 2021 SCHACCOUNT RVC -1 15164 -1 ACC Decisions 57
## 4 1171 - Spring 2017 Elias, Desiree ACG 2021 SCHACCOUNT RVC -1 15164 -1 ACC Decisions 57
## 5 1171 - Spring 2017 Elias, Desiree ACG 2021 SCHACCOUNT RVC -1 15164 -1 ACC Decisions 57
## 6 1171 - Spring 2017 Elias, Desiree ACG 2021 SCHACCOUNT RVC -1 15164 -1 ACC Decisions 57
## 7 1171 - Spring 2017 Elias, Desiree ACG 2021 SCHACCOUNT RVC -1 15164 -1 ACC Decisions 57
## 8 1171 - Spring 2017 Elias, Desiree ACG 2021 SCHACCOUNT RVC -1 15164 -1 ACC Decisions 57
## # ... with 7 more variables: question <chr>, no_response <chr>, excellent <chr>, very_good <chr>, good <chr>, fair <chr>,
## # poor <chr>
专注于&#34;只是&#34;问题数据:
unnest(scraped_tables[1,]) %>%
select(-c(1:8))
## # A tibble: 8 x 7
## question no_response excellent very_good good fair poor
## <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 Description of course objectives and assignments 0.0% 64.9% 14.0% 14.0% 3.5% 3.5%
## 2 Communication of ideas and information 0.0% 56.1% 17.5% 15.8% 5.3% 5.3%
## 3 Expression of expectations for performance in this class 0.0% 63.2% 12.3% 14.0% 8.8% 1.8%
## 4 Availability to assist students in or out of class 3.5% 50.9% 21.1% 10.5% 14.0% 0.0%
## 5 Respect and concern for students 1.8% 59.6% 10.5% 14.0% 10.5% 3.5%
## 6 Stimulation of interest in course 1.8% 52.6% 12.3% 17.5% 7.0% 8.8%
## 7 Facilitation of learning 0.0% 52.6% 19.3% 10.5% 10.5% 7.0%
## 8 Overall assessment of instructor 0.0% 54.4% 15.8% 12.3% 14.0% 3.5%
或者,&#34;不需要&#34;他们都:
glimpse(unnest(scraped_tables))
## Observations: 4,760
## Variables: 15
## $ term <chr> "1171 - Spring 2017", "1171 - Spring 2017", "1171 - Spring 2017", "1171 - Spring 2017", "1171...
## $ instructor_name <chr> "Elias, Desiree", "Elias, Desiree", "Elias, Desiree", "Elias, Desiree", "Elias, Desiree", "El...
## $ course <chr> "ACG 2021", "ACG 2021", "ACG 2021", "ACG 2021", "ACG 2021", "ACG 2021", "ACG 20...
## $ department <chr> "SCHACCOUNT", "SCHACCOUNT", "SCHACCOUNT", "SCHACCOUNT", "SCHACCOUNT", "SCHACCOUNT", "SCHACCOU...
## $ section <chr> "RVC -1", "RVC -1", "RVC -1", "RVC -1", "RVC -1", "RVC -1", "RVC -1", "RVC -1", "U01 -1", "U0...
## $ ref <chr> "15164 -1", "15164 -1", "15164 -1", "15164 -1", "15164 -1", "15164 -1", "15164 -1", "15164 -1...
## $ title <chr> "ACC Decisions", "ACC Decisions", "ACC Decisions", "ACC Decisions", "ACC Decisions", "ACC Dec...
## $ completed_forms <chr> "57", "57", "57", "57", "57", "57", "57", "57", "47", "47", "47", "47", "47", "47", "47", "47...
## $ question <chr> "Description of course objectives and assignments", "Communication of ideas and information",...
## $ no_response <chr> "0.0%", "0.0%", "0.0%", "3.5%", "1.8%", "1.8%", "0.0%", "0.0%", "0.0%", "0.0%", "0.0%", "2.1%...
## $ excellent <chr> "64.9%", "56.1%", "63.2%", "50.9%", "59.6%", "52.6%", "52.6%", "54.4%", "66.0%", "59.6%", "66...
## $ very_good <chr> "14.0%", "17.5%", "12.3%", "21.1%", "10.5%", "12.3%", "19.3%", "15.8%", "23.4%", "23.4%", "23...
## $ good <chr> "14.0%", "15.8%", "14.0%", "10.5%", "14.0%", "17.5%", "10.5%", "12.3%", "8.5%", "8.5%", "8.5%...
## $ fair <chr> "3.5%", "5.3%", "8.8%", "14.0%", "10.5%", "7.0%", "10.5%", "14.0%", "0.0%", "6.4%", "2.1%", "...
## $ poor <chr> "3.5%", "5.3%", "1.8%", "0.0%", "3.5%", "8.8%", "7.0%", "3.5%", "2.1%", "2.1%", "0.0%", "0.0%...
我们也可以在这里处理%
:
unnest(scraped_tables) %>%
mutate_all(~{gsub("%", "", .x)}) %>%
type_convert() %>%
select(-c(1:8))
## # A tibble: 4,760 x 7
## question no_response excellent very_good good fair poor
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Description of course objectives and assignments 0.0 64.9 14.0 14.0 3.5 3.5
## 2 Communication of ideas and information 0.0 56.1 17.5 15.8 5.3 5.3
## 3 Expression of expectations for performance in this class 0.0 63.2 12.3 14.0 8.8 1.8
## 4 Availability to assist students in or out of class 3.5 50.9 21.1 10.5 14.0 0.0
## 5 Respect and concern for students 1.8 59.6 10.5 14.0 10.5 3.5
## 6 Stimulation of interest in course 1.8 52.6 12.3 17.5 7.0 8.8
## 7 Facilitation of learning 0.0 52.6 19.3 10.5 10.5 7.0
## 8 Overall assessment of instructor 0.0 54.4 15.8 12.3 14.0 3.5
## 9 Description of course objectives and assignments 0.0 66.0 23.4 8.5 0.0 2.1
## 10 Communication of ideas and information 0.0 59.6 23.4 8.5 6.4 2.1
## # ... with 4,750 more rows
然后,你可以做一些有趣的事情:
unnest(scraped_tables) %>%
mutate_all(~{gsub("%", "", .x)}) %>%
type_convert() -> scraped_tables
group_by(scraped_tables, course) %>%
filter(question == "Description of course objectives and assignments") %>%
gather(resp_cat, resp_val, no_response, excellent, very_good, good, fair, poor) %>%
mutate(resp_val = resp_val/100) %>%
mutate(resp_cat = factor(resp_cat, levels=unique(resp_cat))) %>%
filter(resp_val > 0) %>%
ungroup() -> description_df
ggplot(description_df, aes(resp_val)) +
geom_density(aes(y=..count..), fill="#b2abd288", color="#2d004b") +
scale_x_percent() +
scale_y_comma() +
facet_wrap(~resp_cat) +
labs(title="Description of course objectives and assignments distribution across courses") +
theme_ipsum_rc(grid="XY")
<强>更新强>
magrittr
管道起初可能令人生畏。这是一个没有它们的版本:
library(rvest)
library(httr)
library(stringi)
library(hrbrthemes)
library(tidyverse)
mcga <- function(tbl) {
x <- colnames(tbl)
x <- tolower(x)
x <- gsub("[[:punct:][:space:]]+", "_", x)
x <- gsub("_+", "_", x)
x <- gsub("(^_|_$)", "", x)
x <- make.unique(x, sep = "_")
colnames(tbl) <- x
tbl
}
eval_pg <- read_html("https://opir.fiu.edu/instructor_eval.asp")
term_nodes <- html_nodes(eval_pg, "select[name='Term'] > option")
data_frame(
name = html_text(term_nodes),
id = html_attr(term_nodes, "value")
) -> Terms
Terms
college_nodes <- html_nodes(eval_pg, "select[name='Coll'] > option")
data_frame(
name = html_text(college_nodes),
id = html_attr(college_nodes, "value")
) -> Coll
Coll
GET("https://opir.fiu.edu/instructor_evals/instr_eval_result.asp",
query = list(
Term = "1171",
Coll = "CBADM",
Dept = "",
RefNum = "",
Crse = "",
Instr = ""
)) -> res
report <- content(res, as="parsed", encoding="UTF-8")
fields <- c("Term:", "Instructor Name:", "Course:", "Department:", "Section:",
"Ref#:", "Title:", "Completed Forms:")
tables_found <- html_nodes(report, xpath=".//table[contains(., 'Term')]")
pb <- progress_estimated(length(tables_found))
map(tables_found, function(.x) {
pb$tick()$print()
tab <- .x
map(fields, function(.x) {
tmp_field <- html_nodes(tab, xpath=sprintf(".//td[contains(., '%s')]", .x))
tmp_field <- html_text(tmp_field, trim = TRUE)
tmp_field <- stri_replace_first_regex(tmp_field, .x, "")
tmp_field <- stri_trim_both(tmp_field)
tmp_field <- as.list(tmp_field)
tmp_field <- set_names(tmp_field, .x)
tmp_field
}) -> tmp_meta
tmp_meta <- flatten(tmp_meta)
tmp_meta <- as_data_frame(tmp_meta)
table_meta <- mcga(tmp_meta)
tmp_vals <- html_nodes(tab, xpath=".//tr[contains(@class, 'question') or contains(@class, 'tableback')]")
tmp_vals <- as.character(tmp_vals)
tmp_vals <- paste0(tmp_vals, collapse="")
tmp_vals <- sprintf("<table>%s</table>", tmp_vals)
tmp_vals <- read_html(tmp_vals)
tmp_vals <- html_table(tmp_vals, header=TRUE)[[1]]
table_vals <- mcga(tmp_vals)
table_meta$values <- list(table_vals)
table_meta
}) -> list_of_tables
scraped_tables <- bind_rows(list_of_tables)
glimpse(scraped_tables)
unnest(scraped_tables[1,])
tmp_df <- unnest(scraped_tables[1,])
select(tmp_df, -c(1:8))
glimpse(unnest(scraped_tables))
tmp_df <- unnest(scraped_tables)
tmp_df <- mutate_all(tmp_df, function(.x) { gsub("%", "", .x) })
scraped_tables <- type_convert(tmp_df)
(我删除了评论,因为它们的部分仍然相同)
答案 1 :(得分:1)
不确定您的预期结果是什么,但看起来您需要进行一些模式匹配,因为这些列中的字符串包含的文本不是您在示例中使用的文本。类似的东西:
library(stringr)
d <- fle[!(str_detect(fle$V1, "Term:") | str_detect(fle$V2, "Department:") |
str_detect(fle$V1, "Course:") | str_detect(fle$V2, "Section:")), ]