其次，我需要提取每门课程的代码、名称、上课日期和时间，并标注所属学期。这一步我已经完成了（可能做得很糟糕，但不管怎样总算做到了！）：
# Scrape a single schedule page (one department, one semester) and reduce it
# to five columns: course code, name, days, hours and a semester tag.
# Download and parse the schedule page for semester 2017/2018-3, department ATA.
reg <- read_html("http://registration.boun.edu.tr/scripts/sch.asp?donem=2017/2018-3&kisaadi=ATA&bolum=ATATURK+INSTITUTE+FOR+MODERN+TURKISH+HISTORY")
# Turn every table found in the page into a data frame; the result is a list.
regtable <- reg %>% html_table(fill = TRUE)
# Tag every row with the semester: copy cell [1, 2] of the first table into a
# 15th column of the fourth table.
# NOTE(review): presumably table 1 holds the semester label and table 4 is the
# course listing with 14 columns -- confirm against the live page.
regtable[[4]][ ,15] <- regtable[[1]][1,2]
# Row 1 of table 4 is promoted to the header further down, so the new column's
# header text has to be written into that row.
regtable[[4]][1,15] <- "Semester"
# TODO: convert Days and Hours into something usable (format still undecided).
# Open question: how to parse the days (the "T" vs "Th" problem).
# Open question: how to parse the hours (the "10th hour" problem).
# Keep only table 4 and pick the wanted columns by their auto-generated names.
# NOTE(review): the name V15 for the added column depends on how the installed
# R/rvest versions name it -- verify before relying on it.
regtable <- regtable %>% .[4] %>% as.data.frame() %>% select( . , X1 , X3 , X8 , X9 , V15)
# Use the first row as the column names, then drop that row from the data.
names(regtable) <- regtable[1,]
regtable <- regtable[-1,]
我想使用 XML 包应该可以做得更好，但我不知道该如何使用它。
感谢您的帮助, Utku
答案 0 :(得分:2)
# Create a Department list
# Download the department selection page.
dep_list <- read_html("http://registration.boun.edu.tr/scripts/schdepsel.asp")
# Identify all nodes of class "menu2" and extract their href attribute, which
# carries the final part of each department's schedule url.
# (The html_attr() step was missing, leaving the pipe dangling into the next
# assignment.)
dep_list <- dep_list %>%
  html_nodes(xpath = '//*[@class="menu2"]') %>%
  html_attr("href")
# html_attr() yields NA for a node without an href; such entries cannot be
# turned into a url, so drop them.
dep_list <- dep_list[!is.na(dep_list)]
# Remove the fixed script prefix so that only the query-string tail remains.
# NOTE(review): if the hrefs embed a semester value right after "donem=", that
# value survives here and would be duplicated once a semester is prepended
# when the url is rebuilt -- verify against the live hrefs.
department_list <- gsub("/scripts/sch.asp?donem=", "", dep_list, fixed = TRUE)
# Build the vector of available semesters
# Fetch the schedule landing page and convert all of its tables in one go
sem_list <- html_table(
  read_html("http://registration.boun.edu.tr/schedule.htm"),
  fill = TRUE
)
# The table needed is the second one in the list
semester_df <- sem_list[[2]]
# The dropdown values all sit in the second cell of column X2 as one string;
# split that string on runs of whitespace to get one element per semester
semester_list <- strsplit(semester_df$X2[2], "\\s+")[[1]]
# Loop through the list of departments and within each department loop through
# the list of semesters, scraping one schedule page per combination.
# Each result is stored in a variable named "table_<semester>_<department>".
for (dep in department_list) {
  for (sem in semester_list) {
    # Rebuild the page url from the fixed prefix, the semester and the
    # department's query-string tail.
    url <- paste0(
      "http://registration.boun.edu.tr/scripts/sch.asp?donem=", sem, dep
    )
    reg <- read_html(url)
    # make the html a list of tables
    regtable <- reg %>% html_table(fill = TRUE)
    # The data we want is in the 4th portion of the created list so extract
    # that
    regtable <- regtable[[4]]
    # Rename the column headers to the values in the first row and remove the
    # first row. The header is converted to a plain character vector first so
    # the renaming does not rely on implicit coercion of a one-row table.
    header_row <- vapply(
      regtable[1, ], as.character, character(1), USE.NAMES = FALSE
    )
    regtable <- setNames(regtable[-1, ], header_row)
    # Create semester column and select the variables we want
    regtable <- regtable %>%
      mutate(Semester = sem) %>%
      select(Code.Sec, Name, Days, Hours, Semester)
    # Assign the created table to a dataframe
    # Could also save the file here instead
    assign(paste("table", sem, gsub(" ", "_", dep), sep = "_"), regtable)
  }
}
答案 1 :(得分:1)
# Create a Department list
# Download the department selection page and convert its tables to a list.
dep_list <- read_html("http://registration.boun.edu.tr/scripts/schdepsel.asp")
dep_list <- dep_list %>% html_table(fill = TRUE)
# Select the table from the html that contains the data we want
department_df <- dep_list[[2]]
# Rename the columns with the value of the first row and remove that row.
# The header is converted to a plain character vector first so the renaming
# does not rely on implicit coercion of a one-row table.
header_row <- vapply(
  department_df[1, ], as.character, character(1), USE.NAMES = FALSE
)
department_df <- setNames(department_df[-1, ], header_row)
# Combine the two columns into one vector.
# `[[` always extracts the column itself; `[, 1]` only does so on a base
# data.frame and would keep a one-column table when html_table() returns
# tibbles, which breaks the positional subsetting below.
department_list <- c(department_df[[1]], department_df[[2]])
# Edit the department list
# We can choose accordingly.
# NOTE(review): these positions must stay in the same order as the entries of
# shortname_list, because the two vectors are paired by index later on.
department_list <- department_list[c(7, 8, 16, 20, 26, 33, 36, 37, 38, 39)]
# Build the vector of available semesters
# Fetch the schedule landing page and convert all of its tables in one go
sem_list <- html_table(
  read_html("http://registration.boun.edu.tr/schedule.htm"),
  fill = TRUE
)
# The table needed is the second one in the list
semester_df <- sem_list[[2]]
# The dropdown values all sit in the second cell of column X2 as one string;
# split that string on runs of whitespace to get one element per semester
semester_list <- strsplit(semester_df$X2[2], "\\s+")[[1]]
# Shortnames string
# We can add whichever we want.
# Each short name is paired by position with the entry of department_list at
# the same index.
shortname_list <- c(
  "FLED", "HIST", "PSY", "LL", "PA", "PHIL", "YADYOK", "SOC", "TR", "TKL"
)
# Length
L <- length(department_list)
# The two vectors are used in parallel, so a length mismatch would silently
# pair departments with the wrong (or a missing) short name; flag it early.
if (length(shortname_list) != L) {
  warning(
    "shortname_list and department_list differ in length",
    call. = FALSE
  )
}
# the function to get the schedule for the selected departments
for( i in 1:L){
for(sem in semester_list){tryCatch({
dep <- department_list[i]
sn <- shortname_list[i]
url_second_part <- interaction("&kisaadi=" , sn, "&bolum=", gsub(" ", "+", (gsub("&" , "%26", dep))), sep = "", lex.order = TRUE)
url <- paste("http://registration.boun.edu.tr/scripts/sch.asp?donem=", sem, url_second_part, sep = "")
reg <- read_html(url)
# make the html a list of tables
regtable <- reg %>% html_table(fill = TRUE)
# The data we want is in the 4th portion of the created list so extract that
regtable <- regtable[[4]]
# Rename the column headers to the values in the first row and remove the
# first row
regtable <- setNames(regtable[-1, ], regtable[1, ])
# Create semester column and select the variables we want
regtable <- regtable %>%
mutate(Semester = sem) %>%
select(Code.Sec, Name, Days, Hours, Semester)
# Assign the created table to a dataframe
# Could also save the file here instead
assign(paste("table", sem, gsub(" ", "_", dep), sep = "_"), regtable)
}, error = function(e){cat("ERROR : No information on this" , url , "\n" )})
### Maybe make Errors another dataset or list too.