#!/usr/bin/ruby
# Fetches all Virginia Tech classes from the timetable and spits them out into a nice JSON object
# Can be run with option of which file to save output to or will save to classes.json by default
require 'rubygems'
require 'mechanize'
require 'nokogiri'
require 'json'
#Create Mechanize Browser and Class Data hash to load data into
agent = Mechanize.new
classData = Hash.new
#Get Subjects from Timetable page
page = agent.get("https://banweb.banner.vt.edu/ssb/prod/HZSKVTSC.P_ProcRequest")
subjects = page.forms.first.field_with(:name => 'subj_code').options
#Loop subjects
subjects.each do |subject|
#Get the Timetable Request page & Form
timetableSearch = agent.get("https://banweb.banner.vt.edu/ssb/prod/HZSKVTSC.P_ProcRequest")
searchDetails = page.forms.first
#Submit with specific subject
searchDetails.set_fields({
:SUBJ_CODE => subject,
:TERMYEAR => '201401',
:CAMPUS => 0
})
#Submit the form and store results into courseListings
courseListings = Nokogiri::HTML(
searchDetails.submit(searchDetails.buttons[0]).body
)
#Create Array in Hash to store all classes for subjects
classData[subject] = []
#For every Class
courseListings.css('table.dataentrytable/tr').collect do |course|
subjectClassesDetails = Hash.new
#Map Table Cells for each course to appropriate values
[
[ :crn, 'td[1]/p/a/b/text()'],
[ :course, 'td[2]/font/text()'],
[ :title, 'td[3]/text()'],
[ :type, 'td[4]/p/text()'],
[ :hrs, 'td[5]/p/text()'],
[ :seats, 'td[6]/text()'],
[ :instructor, 'td[7]/text()'],
[ :days, 'td[8]/text()'],
[ :begin, 'td[9]/text()'],
[ :end, 'td[10]/text()'],
[ :location, 'td[11]/text()'],
# [ :exam, 'td[12]/text()']
].collect do |name, xpath|
#Not an additional time session (2nd row)
if (course.at_xpath('td[1]/p/a/b/text()').to_s.strip.length > 2)
subjectClassesDetails[name] = course.at_xpath(xpath).to_s.strip
end
end
#Add class to Array for Subject!
classData[subject].push(subjectClassesDetails)
end
end
#Write Data to JSON file
open(ARGV[0] || "classes.json", 'w') do |file|
file.print JSON.pretty_generate(classData)
end
以上代码应该从https://banweb.banner.vt.edu/ssb/prod/HZSKVTSC.P_ProcRequest检索数据 但如果我打印subjects.length是打印0所以它显然没有得到正确的数据。给定的术语代码“201401”绝对是正确的。
我注意到当我手动输入浏览器的链接时,主题字段不允许您选择一个选项,直到选择一个术语,但是当我查看页面源时,数据显然已经存在。如何检索此数据?
答案 0 :(得分:0)
我正在查看该vtech页面,我可以看到您需要在subj_code
下拉填充之前首先选择一个TERMYEAR,以便您获得选项。不幸的是,function dropdownlist(listindex)
中的javascript会发生这种情况。 Mechanize不处理javascript,所以这个脚本注定要失败。
您可以选择运行Watir或Selenium等浏览器自动播放器:在此处讨论:How do I use Mechanize to process JavaScript?
或者阅读该页面的来源并解析这些行的值:
document.ttform.subj_code.options[0]=new Option("All Subjects","%",false, false);
document.ttform.subj_code.options[1]=new Option("AAEC - Agricultural and Applied Economics","AAEC",false, false);
document.ttform.subj_code.options[2]=new Option("ACIS - Accounting and Information Systems","ACIS",false, false);
获取选项。您只需使用open-uri
:
require 'open-uri'
page = open("https://banweb.banner.vt.edu/ssb/prod/HZSKVTSC.P_ProcRequest")
page_source = page.read
现在您可以使用正则表达式扫描所有选项:
page_source.scan /document\.ttform.+;/
这将为您提供一个包含所有包含选项的javascript代码的行的数组。更好地制作你的正则表达式,你可以从中提取选项文本。我会看看能否为此提出一些建议,我会回复。希望这会让你朝着正确的方向前进。
我回来了。我能用这个正则表达式解析出所有的subj_code选项:
subjects = page_source.scan(/Option\("(.*?)"/).uniq # remove duplicates
subjects.shift # get rid of the first option because it's just "All Subjects"
subjects.size == 137
希望有所帮助。