Question

我编写了一些代码来获取大学课程的数据，用于构建一个交互式排课程序。以下是我获取数据的代码：

from selenium import webdriver
import os
import pwd
import shlex
import re
import time


# Look up the login name of the user running the script; it is used to build
# the path to the chromedriver binary below.
usr = pwd.getpwuid(os.getuid()).pw_name
Path = ('/Users/%s/Downloads/chromedriver') %usr # chromedriver must already be downloaded to this location
# Create a new instance of the Chrome driver
options = webdriver.ChromeOptions()
options.binary_location = '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome'
options.add_argument('headless')  # Headless so no window is opened
options.add_argument('window-size=1200x600')
driver = webdriver.Chrome(Path, chrome_options=options) 


driver.get('https://web.stevens.edu/scheduler/core/2017F/2017F.xml') # Load the course database page into the browser


# Module-level store filled by Database(): maps each entry's "Section" value
# to the dictionary of fields parsed for that entry.
classes = {}

def Database(AllSelectedCourseInfo):
    """Parse one course's ``name=value`` tokens and register it in ``classes``.

    Every token is split at its *first* ``=`` into a field name and a value,
    so a value that itself contains ``=`` is kept whole.  Tokens without
    ``=`` and fields whose value is empty are ignored.  The resulting
    dictionary is stored in the module-level ``classes`` dict under the
    value of its ``"Section"`` field.

    Args:
        AllSelectedCourseInfo: iterable of ``name=value`` strings.

    Raises:
        KeyError: if no token supplied a non-empty ``Section`` field.
    """
    ClassDict = {}

    for item in AllSelectedCourseInfo:
        name, separator, value = item.partition("=")
        if not separator:
            # Not a name=value pair (e.g. a bare tag name); nothing to store.
            continue

        # Values containing a digit get their "Z" characters removed; the
        # intent is to drop the Z that trails numeric values.
        # NOTE(review): this removes every "Z" in such a value, not only a
        # trailing one - kept as in the original behaviour.
        if any(char.isdigit() for char in value):
            value = value.replace("Z", "")

        if not value:
            continue

        try:
            ClassDict[str(name)] = str(value)
        except UnicodeError:
            # Under Python 2, str() of non-ASCII unicode text raises; such a
            # field is skipped, as it was before, instead of aborting.
            continue

    classes[str(ClassDict["Section"])] = ClassDict  # Add to dictionary


def makeDatabase(section):
    """Find ``section`` in the loaded page and pass each hit to ``Database``.

    Elements are located by exact text match.  The text of the first hit's
    parent decides how the hits are handled:

    * parent text contains ``"Title"``: every matching element is
      processed, taking the text of the node five levels above it
      (presumably the case where a course title was given - confirm);
    * otherwise: only the first matching element is processed, taking the
      text of the node four levels above it (presumably a section code).

    In both cases ``/>`` and ``>`` are stripped from the text, the rest is
    tokenised with ``shlex.split`` and handed to ``Database``.

    Args:
        section: text to search for in the page.
    """
    xpath = "//*[text()='%s']" % section
    first_hit = driver.find_element_by_xpath(xpath)
    parent_text = first_hit.find_element_by_xpath("..").text

    if "Title" not in parent_text:
        node = driver.find_element_by_xpath(xpath)
        raw_text = node.find_element_by_xpath("../../../..").text
        Database(shlex.split(raw_text.replace("/>", "").replace(">", "")))
        return

    for node in driver.find_elements_by_xpath(xpath):
        raw_text = node.find_element_by_xpath("../../../../..").text
        Database(shlex.split(raw_text.replace("/>", "").replace(">", "")))


def printDic():
    """Print every entry of ``classes``.

    For each entry a banner line carrying its key is printed, followed by
    one ``field : value`` line per stored field.
    """
    for section_key, fields in classes.items():
        print("\n-------------%s------------" % section_key)
        for field_name, field_value in fields.items():
            print("%s : %s" % (field_name, field_value))

# Time the two lookups together with the printing of the results.
# The driver is shut down in a ``finally`` clause so that the headless Chrome
# process is not left running when one of the lookups raises (for example
# because no element matches the requested text).
start = time.time()
try:
    makeDatabase("Differential Calculus")
    makeDatabase("MA 124B")
    printDic()
    end = time.time()

    print(end - start)
finally:
    driver.quit()

提取一门课程和一个课程班次（section）的数据大约需要 20 秒。如果真要投入使用，至少要处理 7 门课程，那么光是创建字典就要花一分钟以上。有没有人知道如何让它运行得更快？

Answer 1

我尝试将 lxml 和 requests 集成到我的代码中，但它们并不能满足我的需要。用 lxml 折腾了几天仍然没有成功之后，我决定改用 beautifulsoup4 配合 urllib。效果比我预期的还要好：

from bs4 import BeautifulSoup
from HTMLParser import HTMLParser
import urllib
import shlex
import re
import time

# Parser instance used by makeDatabase() only for its unescape() method.
h = HTMLParser()
page = urllib.urlopen('https://web.stevens.edu/scheduler/core/2017F/2017F.xml').read() # Download the whole course database document
# NOTE(review): no parser is named here, so BeautifulSoup chooses one itself;
# the index chain below presumably depends on that choice - confirm before
# changing either line.
soup = BeautifulSoup(page)

# Hard-coded path through the parsed tree to the list of per-class nodes.
# NOTE(review): presumably tied to the layout of this particular document - verify.
RawClassData = soup.contents[10].contents[0].contents[0].contents

# Filled by makeDatabase(): maps each entry's "section" value to the
# dictionary of fields parsed for that entry.
classes = {}
# Not referenced by any other code visible in this script.
backupClasses = {}

def makeDatabase():
    """Fill the module-level ``classes`` dict from ``RawClassData``.

    Each raw entry is converted to text, has ``>`` replaced by a space, is
    HTML-unescaped and tokenised with ``shlex.split``.  Every token is split
    at its *first* ``=`` into a field name and a value, so a value that
    itself contains ``=`` is kept whole.  Tokens without ``=`` and fields
    with an empty value are ignored.  The parsed fields are stored in
    ``classes`` under the value of the entry's ``"section"`` field.

    Parsing is best-effort: an entry that cannot be tokenised or that has no
    ``section`` field is skipped and the remaining entries are still
    processed.
    """
    for raw_entry in RawClassData:  # Parse through each class
        try:
            # Tokenise the markup, keeping quoted attribute values together.
            tokens = shlex.split(h.unescape(str(raw_entry).replace(">", " ")))
            ClassDict = {}

            for item in tokens:
                name, separator, value = item.partition("=")
                if not separator:
                    # Not a name=value pair (e.g. the tag name itself).
                    continue

                # Values containing a digit get their "Z" characters
                # removed; the intent is to drop the Z trailing numbers.
                # NOTE(review): this removes every "Z" in such a value, not
                # only a trailing one - kept as in the original behaviour.
                if any(char.isdigit() for char in value):
                    value = value.replace("Z", "")

                if not value:
                    continue

                try:
                    ClassDict[str(name)] = str(value)
                except UnicodeError:
                    # Skip only this field, as before, when the text cannot
                    # be converted with str() (Python 2, non-ASCII text).
                    continue

            classes[str(ClassDict["section"])] = ClassDict
        except Exception:
            # Deliberate best-effort: malformed entries (unbalanced quotes,
            # no "section" field, ...) are skipped.  Unlike a bare
            # ``except:``, this no longer swallows KeyboardInterrupt or
            # SystemExit.
            continue


def printDic():
    """Write every entry of ``classes`` to a file named ``Classes``.

    The file is created (or overwritten) in the current working directory.
    Each entry is written as a banner line carrying its key, one
    ``field : value`` line per stored field, and a closing newline.
    """
    with open("Classes", "w") as out:
        for section_key, fields in classes.items():
            pieces = ["\n-------------%s------------" % section_key]
            pieces.extend("\n%s : %s" % pair for pair in fields.items())
            pieces.append("\n")
            out.write("".join(pieces))

def printSection(selection):
    """Print the entry stored in ``classes`` under the key ``selection``.

    A banner line carrying the key is printed first, followed by one
    ``field : value`` line per stored field.

    Raises:
        KeyError: if ``selection`` is not a key of ``classes`` (the banner
            has already been printed at that point).
    """
    print("\n-------------%s------------" % selection)
    for classkey in classes[selection]:
        print("%s : %s" % (classkey, classes[selection][classkey]))


def printClass(selection):
    """Print every entry that ``selection`` identifies.

    ``selection`` may be a title, in which case every entry whose
    ``"title"`` field equals it is printed, or a key of ``classes``, in
    which case that entry is printed.  Each entry is printed once, in the
    format used by ``printSection``.

    The previous implementation used ``try``/``finally``, so the lookup
    ``classes[selection]`` also ran after a successful title search and
    raised ``KeyError`` for every title that was not itself a key.

    Raises:
        KeyError: if ``selection`` matches neither a title nor a key.
    """
    # Entries without a "title" field simply do not match.
    matches = [key for key in classes if classes[key].get("title") == selection]
    if selection in classes and selection not in matches:
        matches.append(selection)

    if not matches:
        raise KeyError(selection)

    for key in matches:
        printSection(key)

# Only the construction of the class library is timed; the report below is
# produced after the clock has been stopped.
start = time.time()
makeDatabase()
end = time.time()

# Print the wanted courses, in this order.
for wanted_title in (
    "Circuits and Systems",
    "Differential Equations",
    "Writing & Communications Collqm",
    "Mechanics of Solids",
    "Electricity & Magnetism",
    "Engineering Design III",
    "Freshman Quiz",
):
    printClass(wanted_title)

# Dump the complete library to disk, then report the build time in seconds.
printDic()

print(end - start)

这段新代码会创建一个包含所有课程的库，然后在 2 秒内打印出所需的课程。而 selenium 代码需要 89 秒才能为所需的课程构建库并将其打印出来，我会说这算是略有改进了……感谢大家提出的完美建议！

优化 Selenium 代码

1 个答案: