Question

在我的工作中，我搭建了一只抓抓蜘蛛的蜘蛛，可以快速检查200-500个网站的目标网页，以找出网页无法正常工作的线索（除了400种样式的错误）。（例如，检查第“页面上是否有缺货”。）在我的权限范围内，有30个不同的网站，它们都使用相同的页面结构。

每天工作4个月都很好。

然后，突然，并且没有更改代码，大约4周前，我开始出现无法预测的错误：

url_title = response.css（“ title :: text”）。extract_first（） AttributeError：“响应”对象没有属性“ css”

如果运行此蜘蛛，则发生此错误，例如... 400页中的3页。然后，如果立即再次运行Spider，则可以正确地刮擦相同的3页而没有错误，并且4个完全不同的页面将返回相同的错误。

此外，如果我运行如下所示的 EXACT 相同的蜘蛛程序，但仅用这7个错误的着陆页替换了映射，它们就会被完全清除。

我的代码中有一些不太正确的东西吗？

我将附加整个代码-提前对不起！！ -我只是担心我认为多余的事情实际上可能是原因。因此，这就是全部，但是敏感数据替换为####。

我已经检查了所有受影响的页面，并且css当然有效，并且标题始终存在。

我已经在运行scrapy的服务器上完成了sudo apt-get update和sudo apt-get dist-upgrade，希望这会有所帮助。没有运气。

import scrapy
from scrapy import signals
from sqlalchemy.orm import sessionmaker
from datetime import date, datetime, timedelta
from scrapy.http.request import Request
from w3lib.url import safe_download_url
from sqlalchemy import and_, or_, not_


import smtplib
from email.MIMEMultipart import MIMEMultipart
from email.MIMEText import MIMEText
from sqlalchemy.engine import create_engine
engine = create_engine('mysql://######:######@localhost/LandingPages', pool_recycle=3600, echo=False)
#conn = engine.connect()

from LandingPageVerifier.models import LandingPagesFacebook, LandingPagesGoogle, LandingPagesSimplifi, LandingPagesScrapeLog, LandingPagesScrapeResults

Session = sessionmaker(bind=engine)
session = Session()

# today = datetime.now().strftime("%Y-%m-%d")

# thisyear = datetime.now().strftime("%Y")
# thismonth = datetime.now().strftime("%m")
# thisday = datetime.now().strftime("%d")
# start = date(year=2019,month=04,day=09)

todays_datetime = datetime(datetime.today().year, datetime.today().month, datetime.today().day)
print todays_datetime

landingpages_today_fb = session.query(LandingPagesFacebook).filter(LandingPagesFacebook.created_on >= todays_datetime).all()
landingpages_today_google = session.query(LandingPagesGoogle).filter(LandingPagesGoogle.created_on >= todays_datetime).all()
landingpages_today_simplifi = session.query(LandingPagesSimplifi).filter(LandingPagesSimplifi.created_on >= todays_datetime).all()

session.close()
#Mix 'em together!
landingpages_today = landingpages_today_fb + landingpages_today_google + landingpages_today_simplifi
#landingpages_today = landingpages_today_fb

#Do some iterating and formatting work
landingpages_today = [(u.ad_url_full, u.client_id) for u in landingpages_today]
#print landingpages_today

landingpages_today = list(set(landingpages_today))

#print 'Unique pages: '
#print landingpages_today
# unique_landingpages = [(u[0]) for u in landingpages_today]
# unique_landingpage_client = [(u[1]) for u in landingpages_today]
# print 'Pages----->', len(unique_landingpages)

class LandingPage004Spider(scrapy.Spider):
    name='LandingPage004Spider'

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(LandingPage004Spider, cls).from_crawler(crawler, *args, **kwargs)
        #crawler.signals.connect(spider.spider_opened, signals.spider_opened)
        crawler.signals.connect(spider.spider_closed, signals.spider_closed)
        return spider

    def spider_closed(self, spider):
        #stats = spider.crawler.stats.get_stats() 
        stats = spider.crawler.stats.get_value('item_scraped_count'),
        Session = sessionmaker(bind=engine)
        session = Session()
        logitem = LandingPagesScrapeLog(scrape_count = spider.crawler.stats.get_value('item_scraped_count'),
                                        is200 = spider.crawler.stats.get_value('downloader/response_status_count/200'),
                                        is400 = spider.crawler.stats.get_value('downloader/response_status_count/400'),
                                        is403 = spider.crawler.stats.get_value('downloader/response_status_count/403'),
                                        is404 = spider.crawler.stats.get_value('downloader/response_status_count/404'),
                                        is500 = spider.crawler.stats.get_value('downloader/response_status_count/500'),
                                        scrapy_errors = spider.crawler.stats.get_value('log_count/ERROR'),
                                        scrapy_criticals = spider.crawler.stats.get_value('log_count/CRITICAL'),
                                        )
        session.add(logitem)
        session.commit()
        session.close()



    #mapping = landingpages_today
    handle_httpstatus_list = [200, 302, 404, 400, 500]

    start_urls = []

    def start_requests(self):
        for url, client_id in self.mapping:
            yield Request(url, callback=self.parse, meta={'client_id': client_id})


    def parse(self, response):

        ##DEBUG - return all scraped data
        #wholepage = response.body.lower()

        url = response.url
        if 'redirect_urls' in response.request.meta:
            redirecturl = response.request.meta['redirect_urls'][0]
            if 'utm.pag.ca' in redirecturl:
                url_shortener = response.request.meta['redirect_urls'][0]
            else:
                url_shortener = 'None'
        else:
            url_shortener = 'None'

        client_id = response.meta['client_id']
        url_title = response.css("title::text").extract_first()
        # pagesize = len(response.xpath('//*[not(descendant-or-self::script)]'))
        pagesize = len(response.body)
        HTTP_code = response.status

        ####ERROR CHECK: Small page size
        if 'instapage' in response.body.lower():
            if pagesize <= 20000:
                err_small = 1
            else:
                err_small = 0
        else:
            if pagesize <= 35000:
                err_small = 1
            else:
                err_small = 0

        ####ERROR CHECK: Page contains the phrase 'not found'
        if 'not found' in response.xpath('//*[not(descendant-or-self::script)]').extract_first().lower():
            #their sites are full of HTML errors, making scrapy unable to notice what is and is not inside a script element
            if 'dealerinspire' in response.body.lower():
                err_has_not_found = 0
            else:
                err_has_not_found = 1
        else:
            err_has_not_found = 0

        ####ERROR CHECK: Page cotains the phrase 'can't be found'
        if "can't be found" in response.xpath('//*[not(self::script)]').extract_first().lower():
            err_has_cantbefound = 1
        else:
            err_has_cantbefound = 0

        ####ERROR CHECK: Page contains the phrase 'unable to locate'
        if 'unable to locate' in response.body.lower():
            err_has_unabletolocate = 1
        else:
            err_has_unabletolocate = 0

        ####ERROR CHECK: Page contains phrase 'no longer available'
        if 'no longer available' in response.body.lower():
            err_has_nolongeravailable = 1
        else:
            err_has_nolongeravailable = 0

        ####ERROR CHECK: Page contains phrase 'no service specials'
        if 'no service specials' in response.body.lower():
            err_has_noservicespecials = 1
        else:
            err_has_noservicespecials = 0

        ####ERROR CHECK: Page contains phrase 'Sorry, no' to match zero inventory for a search, which normally says "Sorry, no items matching your request were found."
        if 'sorry, no ' in response.body.lower():
            err_has_sorryno = 1
        else:
            err_has_sorryno = 0

        yield {'client_id': client_id, 'url': url, 'url_shortener': url_shortener, 'url_title': url_title, "pagesize": pagesize, "HTTP_code": HTTP_code, "err_small": err_small, 'err_has_not_found': err_has_not_found, 'err_has_cantbefound': err_has_cantbefound, 'err_has_unabletolocate': err_has_unabletolocate, 'err_has_nolongeravailable': err_has_nolongeravailable, 'err_has_noservicespecials': err_has_noservicespecials, 'err_has_sorryno': err_has_sorryno}



#E-mail settings

def sendmail(recipients,subject,body):

            fromaddr = "#######"
            toaddr = recipients
            msg = MIMEMultipart()
            msg['From'] = fromaddr
            msg['Subject'] = subject 

            body = body
            msg.attach(MIMEText(body, 'html'))

            server = smtplib.SMTP('########)
            server.starttls()
            server.login(fromaddr, "##########")
            text = msg.as_string()
            server.sendmail(fromaddr, recipients, text)
            server.quit()
`

期望的结果是完美的尝试，没有错误。实际结果是无法预料的AttributeErrors，声称在某些页面上找不到属性'css'。但是，如果我使用相同的脚本分别刮擦这些页面，它们就刮得很好。

Answer 1

有时Scrapy由于标记错误而无法解析HTML，这就是为什么您不能调用response.css()的原因。您可以在代码中捕获这些事件并分析损坏的HTML：

def parse(self, response):

    try:
     ....
     your code
     .....
    except:
        with open("Error.htm", "w") as f:
            f.write(response.body)

更新，您可以尝试检查是否为空：

def parse(self, response):
    if not response.body:
        yield scrapy.Request(url=response.url, callback=self.parse, meta={'client_id': response.meta["client_id"]})

    # your original code

为什么匆匆忙忙地突然给我一个** Unpredictable * AttributeError，没有说明属性'css'

1 个答案: