I am building a web crawler/scraper with Python, Scrapy, Selenium and PhantomJS (for dynamically loaded content). The program searches a website (http://www.pcworld.com/) for some user-supplied input, then collects the links to all articles in the search results and fires one request per article. For each article the code extracts the title, the article text, the URL and the publication date, and saves them to a MySQL DB (dbname=pcworld, username=testuser, passwd=test123). The problem: after fetching a few articles the program simply stops and does nothing, with no further response on the command line.
Here is the spider:
#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
Usage: scrapy crawl pcworld_spider -a external_input="EXAMPLEINPUT"
'''
import scrapy
from scrapy.http.request import Request
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.ui import WebDriverWait
import time
import lxml.html
import json
import MySQLdb
from Thesis.items import ThesisItem
class PcworldSpider(scrapy.Spider):
name = "pcworld_spider"
custom_settings = {
'ITEM_PIPELINES': {
'Thesis.pipelines.PcworldPipeline': 50
}
}
allowed_domains = ['pcworld.com']
start_urls = ['http://www.pcworld.com/search?query=']
global_next_page = ''
def __init__(self, external_input):
self.external_input = external_input
        super(PcworldSpider, self).__init__()  # called so that external_input gets passed on to the pipeline
self.start_urls[0] = self.start_urls[0] + external_input
self.global_next_page = self.start_urls[0]
# Open database connection
self.db = MySQLdb.connect("localhost","testuser","test123","pcworld" )
# prepare a cursor object using cursor() method
self.cursor = self.db.cursor()
        # Drop the table if it already exists using execute(); whitespace is stripped because of MySQL table naming
self.cursor.execute("DROP TABLE IF EXISTS %s" %external_input.replace(" ", ""))
        # Create the table as required; whitespace stripped because of MySQL
sql = """CREATE TABLE """+external_input.replace(" ", "")+""" (
Ueberschrift varchar(1500),
Article text,
Datum date,
Original_URL varchar(1500))"""
try:
# Execute the SQL command
self.cursor.execute(sql)
# Commit your changes in the database
self.db.commit()
except:
print(" ")
print("IM ROLLING BACK in PARSE")
print(" ")
# Rollback in case there is any error
self.db.rollback()
self.driver = webdriver.PhantomJS()
self.driver.set_window_size(1120, 550)
#self.driver = webdriver.Chrome("C:\Users\Daniel\Desktop\Sonstiges\chromedriver.exe")
        self.driver.wait = WebDriverWait(self.driver, 4)  # waits up to 4 seconds
def parse(self, response):
print("\n1\n")
self.driver.get(self.global_next_page)
print("\n2\n")
        # waits up to 4 seconds (configured in __init__()) for the condition to occur, then raises a TimeoutException
try:
self.driver.wait.until(EC.presence_of_element_located(
(By.CLASS_NAME, "excerpt-text")))
print("Found : excerpt-text")
except TimeoutException:
            #self.driver.close()
print(" excerpt-text NOT FOUND IN PCWORLD !!!")
print("\n3\n")
        # Crawl the JavaScript-generated content with Selenium
ahref = self.driver.find_elements(By.XPATH,'//div[@class="excerpt-text"]/h3/a')
print("\n4\n")
hreflist = []
        # Collect the links to the individual articles
for elem in ahref :
hreflist.append(elem.get_attribute("href"))
print("\n5\n")
for elem in hreflist :
print(elem)
#self.driver.implicitly_wait(2)
yield scrapy.Request(url=elem , callback=self.parse_content)
print("\n6\n")
        # Get the link for the next page
try:
if(self.driver.find_elements(By.XPATH,"//a[@rel='next']")):
#print("es gibt next")
next = self.driver.find_element(By.XPATH,"//a[@rel='next']")
self.global_next_page = next.get_attribute("href")
                yield scrapy.Request(url=self.global_next_page, callback=self.parse, dont_filter=True)  # this is the real request!
print(" ")
else:
print("next gibt es nicht!")
except TimeoutException:
print("TIMEOUTEXCEPTION WHILE SEARCHING FOR NEXT")
#self.driver.close()
print("\n7\n")
def parse_content(self, response):
print("\n8\n")
self.driver.get(response.url)
print("\n9\n")
title = self.driver.find_element(By.XPATH,"//h1[@itemprop='headline']")
titletext = title.get_attribute("innerHTML")
        titletext = titletext.replace('\n', ' ').replace('\r', '')  # strip newlines/carriage returns, otherwise lxml throws an error
titletext = lxml.html.fromstring(titletext).text_content()
date = self.driver.find_element(By.XPATH,"//meta[@name='date']")
date_text = date.get_attribute("content")
article = self.driver.find_elements(By.XPATH,"//div[contains(@itemprop, 'articleBody')]//p")
article_list = []
print("\n10\n")
for elem in article:
print(" ")
elem_text = elem.get_attribute("innerHTML")
            elem_text = elem_text.replace('\n', ' ').replace('\r', '')  # strip newlines/carriage returns, otherwise lxml throws an error
#print(elem_text.encode("utf-8"))
article_list.append(elem_text)
        article_text = ' '.join(article_list)  # join the article list into one string
article_text = lxml.html.fromstring(article_text).text_content()
pcworld_data = ThesisItem()
pcworld_data['Ueberschrift'] = titletext
pcworld_data['Article'] = article_text
pcworld_data['Datum'] = date_text
pcworld_data['Original_URL'] = response.url
print("\n11\n")
        # return the item -> the next thing called is PcworldPipeline.process_item()
return pcworld_data
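Note that both parse and parse_content go through the same self.driver, i.e. a single PhantomJS instance. To at least make a hanging page load visible instead of silent, PhantomJS can be given an explicit page-load timeout so that driver.get() raises rather than blocking forever. A minimal standalone sketch (my addition for illustration, not part of the spider above; the article URL is just an example):

from selenium import webdriver
from selenium.common.exceptions import TimeoutException

driver = webdriver.PhantomJS()
driver.set_window_size(1120, 550)
driver.set_page_load_timeout(30)  # raise TimeoutException instead of blocking forever

try:
    driver.get("http://www.pcworld.com/article/2142565/"
               "report-nsa-secretly-exploited-devastating-heartbleed-bug-for-years.html")
    print(driver.title)
except TimeoutException:
    print("page load timed out instead of blocking")
finally:
    driver.quit()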
The pipeline code:
# -*- coding: utf-8 -*-
import sys
import MySQLdb
from twisted.enterprise import adbapi
from twisted.internet import reactor
import hashlib
from scrapy.exceptions import DropItem
from scrapy.http import Request
class PcworldPipeline(object):
global_external_input = ''
    # called when the spider opens, to take the argument over from the spider
def open_spider(self, spider):
        self.global_external_input = spider.external_input.replace(" ", "")  # strip whitespace because of MySQL
def __init__(self):
self.db = MySQLdb.connect("localhost","testuser","test123","pcworld" )
self.cursor = self.db.cursor()
def process_item(self, item, spider):
data = item
data["Ueberschrift"] = data["Ueberschrift"].replace("'", '"') # replace ' with " because of sql
data["Article"] = data["Article"].replace("'", '') # replace ' with nothing because of sql
#data['Article'] = data['Article'].replace(u"\u2018", "'").replace(u"\u2019", "'")
# Prepare SQL query to INSERT a record into the database.
sql = """INSERT INTO {0} (Ueberschrift, Article, Datum, Original_URL)
VALUES ('{1}', '{2}', '{3}', '{4}')
""".format(self.global_external_input , data['Ueberschrift'].encode('utf-8'), data['Article'].encode('utf-8'),
data['Datum'].encode('utf-8'), data['Original_URL'])
#print(sql.encode("utf-8"))
#print(sql)
try:
# Execute the SQL command
self.cursor.execute(sql)
# Commit your changes in the database
self.db.commit()
except Exception as e:
print(" ")
print("ROLLBACK IN PROCESS_ITEM")
print(e)
# Rollback in case there is any error
self.db.rollback()
return item
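As an aside, the replace() calls above only exist so that quotes in the scraped text do not break the hand-built SQL string. A parameterized query lets MySQLdb do the escaping itself; here is a sketch under the same table layout as the CREATE TABLE in the spider (insert_item and the sample values are made up for illustration). I don't think this is related to the blocking, though, since the blocking also happens with all the DB code removed:

import MySQLdb

def insert_item(db, table, item):
    # The table name cannot be a query parameter, so it is still string-
    # interpolated (it is the whitespace-stripped user input from above);
    # the values are passed as parameters and escaped by MySQLdb.
    sql = ("INSERT INTO " + table +
           " (Ueberschrift, Article, Datum, Original_URL)"
           " VALUES (%s, %s, %s, %s)")
    cursor = db.cursor()
    cursor.execute(sql, (item['Ueberschrift'], item['Article'],
                         item['Datum'], item['Original_URL']))
    db.commit()

db = MySQLdb.connect("localhost", "testuser", "test123", "pcworld")
insert_item(db, "heartbleed", {
    'Ueberschrift': u"A headline with 'quotes' in it",
    'Article': u"Some article body text",
    'Datum': "2017-07-25",
    'Original_URL': "http://www.pcworld.com/article/2142565/example.html",
})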
My items:
import scrapy
class ThesisItem(scrapy.Item):
Ueberschrift = scrapy.Field()
Article = scrapy.Field()
Datum = scrapy.Field()
Original_URL = scrapy.Field()
What I changed in the settings:
ROBOTSTXT_OBEY = False
CONCURRENT_REQUESTS = 200
CONCURRENT_ITEMS = 400
DOWNLOAD_DELAY = 4
COOKIES_ENABLED = False
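CONCURRENT_REQUESTS = 200 lets Scrapy schedule many parse_content callbacks even though there is only one PhantomJS instance. A throttled settings sketch for debugging (these exact values are my guess, not something I have verified) that would rule the concurrency in or out:

# settings.py (debug sketch): serialize everything so that only one
# request, and therefore only one driver.get(), is in flight at a time
ROBOTSTXT_OBEY = False
CONCURRENT_REQUESTS = 1
CONCURRENT_ITEMS = 1
DOWNLOAD_DELAY = 4
COOKIES_ENABLED = False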
What I have observed:
The program always stops after the call to self.driver.get(response.url) in parse_content (see the standalone sketch after these observations).
The database has nothing to do with the blocking (I tried removing all the DB code and piping the items into a JSON file instead).
I have similar code for other websites; some of them work perfectly, but this one blocks.
Any suggestions as to why the code stops after a few (4-10) article requests?
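This is the standalone sketch referred to above: to isolate Selenium from Scrapy, push several article pages through a single PhantomJS instance in a plain loop. If this also stalls after a few pages, the problem lies with PhantomJS itself rather than with Scrapy's scheduling (the URL list is just an example; fetching the same article repeatedly stands in for several different articles):

import time
from selenium import webdriver

urls = ["http://www.pcworld.com/article/2142565/"
        "report-nsa-secretly-exploited-devastating-heartbleed-bug-for-years.html"] * 10

driver = webdriver.PhantomJS()
driver.set_window_size(1120, 550)
for i, url in enumerate(urls):
    start = time.time()
    driver.get(url)  # the same call that blocks inside parse_content
    print("%d: loaded in %.1fs" % (i, time.time() - start))
driver.quit()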
Output (the last lines before it blocks):
6
2017-07-25 09:52:57 [selenium.webdriver.remote.remote_connection] DEBUG: POST http://127.0.0.1:60768/wd/hub/session/31152680-710e-11e7-bb53-730b5237f77b/elements {"using": "xpath", "sessionId": "31152680-710e-11e7-bb53-730b5237f77b", "value": "//a[@rel='next']"}
2017-07-25 09:52:57 [selenium.webdriver.remote.remote_connection] DEBUG: Finished Request
2017-07-25 09:52:57 [selenium.webdriver.remote.remote_connection] DEBUG: POST http://127.0.0.1:60768/wd/hub/session/31152680-710e-11e7-bb53-730b5237f77b/element {"using": "xpath", "sessionId": "31152680-710e-11e7-bb53-730b5237f77b", "value": "//a[@rel='next']"}
2017-07-25 09:52:57 [selenium.webdriver.remote.remote_connection] DEBUG: Finished Request
2017-07-25 09:52:57 [selenium.webdriver.remote.remote_connection] DEBUG: GET http://127.0.0.1:60768/wd/hub/session/31152680-710e-11e7-bb53-730b5237f77b/element/:wdc:1500969177413/attribute/href {"sessionId": "31152680-710e-11e7-bb53-730b5237f77b", "name": "href", "id": ":wdc:1500969177413"}
2017-07-25 09:52:57 [selenium.webdriver.remote.remote_connection] DEBUG: Finished Request
7
2017-07-25 09:53:01 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://www.pcworld.com/article/2142565/report-nsa-secretly-exploited-devastating-heartbleed-bug-for-years.html> (referer: http://www.pcworld.com/search?query=heartbleed&start=10)
8
2017-07-25 09:53:01 [selenium.webdriver.remote.remote_connection] DEBUG: POST http://127.0.0.1:60768/wd/hub/session/31152680-710e-11e7-bb53-730b5237f77b/url {"url": "http://www.pcworld.com/article/2142565/report-nsa-secretly-exploited-devastating-heartbleed-bug-for-years.html", "sessionId": "31152680-710e-11e7-bb53-730b5237f77b"}
I ran another test, output: https://pastebin.com/XAky7YJP (here it stops after the first request).
I am using Python 2.7 and Scrapy 1.4.0.