我正在用 Python 和 Selenium 编写一个网络爬虫:它打开页面,抓取页面的源代码,点击 JavaScript 按钮跳转到下一页,然后重复这一过程。运行代码时,它会启动 Firefox 并遍历所有页面,但在遍历结束后、尝试对页面内容做任何处理时,会抛出如下错误:
Traceback (most recent call last):
  File "C:\Users\...\crawler.py", line 24, in test_pull
    print(contents)
ValueError: I/O operation on closed file.
我尝试执行的代码是:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException
import unittest, time, re
import sys,io
from Parser import HTMLParser
class TestPull(unittest.TestCase):
    """Crawl a paginated site with Firefox and feed every page to the parser.

    The test opens ``base_url``, collects the HTML source of each page, clicks
    the "next page" button until it is no longer present, and finally hands
    the combined source of all pages to ``HTMLParser``.
    """

    def setUp(self):
        """Start a Firefox session and initialise the per-test state."""
        self.driver = webdriver.Firefox()
        # Element lookups wait up to 30 seconds before giving up.
        self.driver.implicitly_wait(30)
        self.base_url = "some_url"
        self.verificationErrors = []
        self.accept_next_alert = True

    def test_pull(self):
        """Collect the source of every page, then parse it in one batch."""
        driver = self.driver
        driver.get(self.base_url)

        # Gather the pages in a list and join once at the end instead of
        # growing one string with repeated ``+=``.
        pages = []
        while True:
            pages.append(driver.page_source)
            # A single lookup both detects the button and returns it, so the
            # element is not searched for twice per page.
            try:
                next_button = driver.find_element(
                    By.ID, "contentright_3_next_page")
            except NoSuchElementException:
                # No "next" button: this was the last page.
                break
            next_button.click()

        parser = HTMLParser()
        parser.feed("".join(pages))

    def is_element_present(self, how, what):
        """Return True if an element matching locator ``how``/``what`` exists."""
        try:
            self.driver.find_element(by=how, value=what)
        except NoSuchElementException:
            # Only "not found" means absent; other errors must propagate.
            return False
        return True

    def is_alert_present(self):
        """Return True if switching to an alert succeeds."""
        # Imported locally so this block does not depend on a change to the
        # import section at the top of the file.
        from selenium.common.exceptions import NoAlertPresentException

        try:
            # ``switch_to.alert`` replaces the deprecated ``switch_to_alert()``.
            self.driver.switch_to.alert
        except NoAlertPresentException:
            return False
        return True

    def close_alert_and_get_its_text(self):
        """Accept or dismiss the current alert and return its text.

        The alert is accepted when ``accept_next_alert`` is true and dismissed
        otherwise. The flag is always reset to True afterwards, even when the
        alert handling raises.
        """
        try:
            alert = self.driver.switch_to.alert
            alert_text = alert.text
            if self.accept_next_alert:
                alert.accept()
            else:
                alert.dismiss()
            return alert_text
        finally:
            self.accept_next_alert = True

    def tearDown(self):
        """Close the browser and fail if any verification errors were logged."""
        self.driver.quit()
        self.assertEqual([], self.verificationErrors)


if __name__ == "__main__":
    unittest.main()
如果我删除 print 语句,仍然会报错,输出如下:
Traceback (most recent call last):
  File "C:\Users\...\crawler.py", line 36, in test_pull
    parser.feed(contents)
  File "C:\Python34\lib\html\parser.py", line 165, in feed
    self.goahead(0)
  File "C:\Python34\lib\html\parser.py", line 222, in goahead
    k = self.parse_starttag(i)
  File "C:\Python34\lib\html\parser.py", line 413, in parse_starttag
    self.handle_starttag(tag, attrs)
  File "C:\Users\...\crawler.py", line 20, in handle_starttag
    print(attrs[1][1])
ValueError: I/O operation on closed file.
我的解析器是:
from urllib.request import urlopen
from html.parser import HTMLParser
import sys,io
# Output file shared by every parser instance; opened when the module loads.
text_file = open("output.txt", "w")


class HTMLParser(HTMLParser):
    """Parser that reports a value from ``<a>`` tags whose first attribute is ``title``.

    The class keeps the name of the ``html.parser`` base class it extends.
    """

    def handle_starttag(self, tag, attrs):
        """Print and record the second attribute's value of a matching ``<a>`` tag."""
        # Guard clauses: anything that is not <a title=...> is ignored.
        if tag != 'a':
            return
        if attrs[0][0] != 'title':
            return
        value = attrs[1][1]
        print(value)
        text_file.write(str(value) + '\n')

    def handle_endtag(self, tag):
        """Ignore end tags."""

    def handle_data(self, data):
        """Ignore text data."""


# NOTE(review): as laid out here, this close() runs as soon as the module is
# loaded, i.e. before any handle_starttag() call can write to the file.
# That is presumably related to the reported "I/O operation on closed file".
text_file.close()
我搜索过这个错误的解决方案,找到的每个问题都是在 for/while 循环中打开又关闭了某个文件,之后又试图读取或写入该文件。我已经修改了代码,改为把内容累加到单个字符串中(而不是写入文本文件),再把整批页面源代码交给解析器处理,但仍然出现这个错误。
我认为这里并没有需要关闭的文件,contents 字符串也不应该引发这种冲突(不过我刚接触 Python 不久,还不了解这门语言的所有细节)。有人能解释一下发生了什么,以及如何解决这个问题吗?
答案 0 :(得分:0)
#!/usr/bin/env python
from urllib.request import urlopen
from html.parser import HTMLParser
import sys,io
class HTMLParser(HTMLParser):
    """Parser that records a value from each qualifying ``<a>`` start tag.

    For every ``<a>`` tag whose first attribute is ``title``, the value of the
    tag's second attribute is printed and written as one line to
    ``output.txt`` in the current working directory.

    The class keeps the name ``HTMLParser`` (shadowing the ``html.parser``
    base class it extends) so that code importing it by that name keeps
    working.
    """

    # Flipped to True on an instance after its first write. The first write
    # truncates output.txt and later writes append, so each parser starts a
    # fresh file but no longer overwrites its own earlier lines.
    _output_started = False

    def handle_starttag(self, tag, attrs):
        """Print and store the second attribute's value of ``<a title=...>`` tags.

        Args:
            tag: Lower-cased tag name supplied by the base parser.
            attrs: List of ``(name, value)`` attribute pairs for the tag.
        """
        if tag != 'a':
            return
        # Tags with fewer than two attributes have nothing to record; checking
        # the length avoids an IndexError on e.g. a bare <a>.
        if len(attrs) < 2 or attrs[0][0] != 'title':
            return
        value = attrs[1][1]
        print(value)
        mode = 'a' if self._output_started else 'w'
        # The file is opened and closed around each write, so there is no
        # module-level handle that could be closed too early.
        # UTF-8 is set explicitly so non-ASCII text does not depend on the
        # platform's default encoding.
        with open('output.txt', mode, encoding='utf-8') as text_file:
            text_file.write(str(value) + '\n')
        self._output_started = True

    def handle_endtag(self, tag):
        """Ignore end tags."""

    def handle_data(self, data):
        """Ignore text data."""