我过去2到3周一直在研究Scrapy,具有以下依赖关系:
现在,我已将依赖项升级到
更新我的依赖项后,我的蜘蛛卡住了。 在产生请求后它会卡住,也不会启动回调函数,蜘蛛会进入无限等待状态。
ScrapperRequestCode :
def start_requests(self):
driver = webdriver.Chrome()
driver.get(**)
urls = driver.find_elements_by_xpath("//div[contains(@class, 'media__item')]//a")
time_list = driver.find_elements_by_xpath("//span[contains(@class, 'timestamp--time')]")
index = 0
for url in urls:
url_href = str(url.get_attribute('href')).replace('http:', 'https:')
media_url = url.find_element_by_xpath("./*").get_attribute('src')
text = url.find_element_by_xpath("./*").get_attribute('alt')
try:
time = time_list[index].get_attribute('title')
time = str(datetime.datetime.strptime(str(time).lstrip(), '%b %d, %Y %I:%M%p'))
index = index + 1
except IndexError:
time = ''
pass
if url_href != "None":
count = News.objects.filter(url__icontains=url_href).count()
if count == 0:
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64) '
'AppleWebKit/537.36 (KHTML, like Gecko)'
' Chrome/60.0.3112.101 Safari/537.36'}
request = Request(url_href, callback=self.parse, headers=headers, dont_filter=True)
request.meta['url'] = url_href
request.meta['text'] = text
request.meta['media_url'] = media_url
request.meta['time'] = time
yield request
sleep(5)
driver.close()
解析功能代码:
def parse(self, response):
print(response, "RESPONSE===========================")
summary_list = []
story_url = response.meta['url']
story_title = response.meta['text']
media_url = response.meta['media_url']
time = response.meta['time']
try:
dummay_news = News.objects.get(url=story_url)
return
except News.DoesNotExist:
dummay_news = News()
story_p_tags = response.xpath("//div[contains(@class,'story__content')]/p/text()")
for p in story_p_tags:
summary_list.append(p.extract())
dummay_news.source = 'xyz'
dummay_news.time = time
dummay_news.title = story_title
dummay_news.url = story_url
dummay_news.set_summary(summary_list)
if media_url:
dummay_news.media_url = media_url
dummay_news.save()
yield dummay_news
此请求后Scrapper卡住了:
2018-05-02 13:47:18 [selenium.webdriver.remote.remote_connection] DEBUG: Finished Request
2018-05-02 13:47:18 [selenium.webdriver.remote.remote_connection] DEBUG: GET http://127.0.0.1:35091/session/657a553b268a671e8117dba401ceeee8/element/0.8764352324704219-426/attribute/alt {"id": "0.8764352324704219-426", "name": "alt", "sessionId": "657a553b268a671e8117dba401ceeee8"}
2018-05-02 13:47:18 [selenium.webdriver.remote.remote_connection] DEBUG: Finished Request
2018-05-02 13:47:18 [selenium.webdriver.remote.remote_connection] DEBUG: GET http://127.0.0.1:35091/session/657a553b268a671e8117dba401ceeee8/element/0.8764352324704219-226/attribute/title {"id": "0.8764352324704219-226", "name": "title", "sessionId": "657a553b268a671e8117dba401ceeee8"}
2018-05-02 13:47:18 [selenium.webdriver.remote.remote_connection] DEBUG: Finished Request
此外,没有回调函数被启动,并且在发生此错误后它会永远停止。任何帮助都非常感谢,提前谢谢。
答案 0 :(得分:0)
你在start_requests()方法中正在做driver.close()
。但是,我认为你不应该过早关闭硒驱动因素。删除此行并尝试。
答案 1 :(得分:0)
请勿在{{1}}中关闭您的驱动程序。 你可以像这样关闭并退出
start_requests
您的def __del__(self):
self.driver.close()
self.driver.quit()
应该在yield dummay_news
except