How to configure the user agent in Scrapy 1.2.1

Time: 2016-12-02 18:35:11

Tags: python scrapy scrapy-spider

I wrote a spider to crawl a Chinese website, but I've run into a problem: when I run the project, nothing happens, and I don't know why. Here is the code.

rennes@L:~/L/crawlAll$ tree
.
├── crawlAll
│   ├── __init__.py
│   ├── items.py
│   ├── pipelines.py
│   ├── settings.py
│   ├── spiders
│   │   ├── __init__.py
│   │   └── TouTiao.py
│   └── useragent.py
├── LICENSE
├── README.md
└── scrapy.cfg

File: useragent.py

# -*- coding: utf-8 -*-
import random

from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware


class MyUserAgentMiddleware(UserAgentMiddleware):
    def __init__(self, user_agent='Scrapy'):
        super(MyUserAgentMiddleware, self).__init__()
        self.user_agent = user_agent

    def process_request(self, request, spider):
        # Pick a random user agent from the pool for each outgoing request
        ua = random.choice(self.user_agent_list)
        if ua:
            print("******Current User Agent: %s***********" % ua)
            request.headers.setdefault('User-Agent', ua)

    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
        "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
        "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
        "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
    ]
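For comparison, the pool doesn't have to be hard-coded in the middleware: Scrapy components can receive their configuration through from_crawler. Below is a minimal sketch of that pattern, assuming a custom USER_AGENT_LIST setting defined in settings.py; the class name and the setting name are illustrative, not part of this project.

import random

from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware


class SettingsUserAgentMiddleware(UserAgentMiddleware):
    # Illustrative variant: reads its pool from a custom setting

    def __init__(self, user_agent_list):
        super(SettingsUserAgentMiddleware, self).__init__()
        self.user_agent_list = user_agent_list

    @classmethod
    def from_crawler(cls, crawler):
        # USER_AGENT_LIST is a custom setting, not built into Scrapy;
        # getlist() returns an empty list when the setting is missing
        return cls(crawler.settings.getlist('USER_AGENT_LIST'))

    def process_request(self, request, spider):
        if self.user_agent_list:
            request.headers.setdefault(
                'User-Agent', random.choice(self.user_agent_list))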

File: TouTiao.py

# -*- coding: utf-8 -*-
import scrapy
import json
import time
from crawlAll.items import NewsSpiderItem

class TouTiaoSpider(scrapy.Spider):
    name = "toutiao"
    allowed_domains = ["toutiao.com"]
    start_urls = ['http://www.toutiao.com/articles_news_society/p1/']
    base_cat_url = 'http://www.toutiao.com/articles_news_society'
    base_url = 'http://www.toutiao.com'

    maxpage = 1
    category = [
        'articles_news_society',
    ]

    def parse(self, response):
        for ctg in self.category:
            for page in range(1, self.maxpage):
                url = self.base_url + '/' + ctg + '/p' + page
            yield scrapy.Request(url, callback=self.parseNewsHref)

    def parseNewsHref(self, response):
        urls = response.xpath("//div[@class='info']//a/@href").extract()
        for url in urls:
            new_url = self.base_url + url
            yield scrapy.Request(new_url, callback=self.parseNews)

    def parseNews(self, response):
        articles = response.xpath("//div[@id='article-main']")
        item = NewsSpiderItem()
        title = articles.xpath("//h1/text()").extract()[0]
        tm = articles.xpath("//span[@class='time']/text()").extract()[0]
        content = articles.xpath("//div[@class='article-content']//p/text()").extract()

        if len(title) != 0 and len(tm) != 0 and len(content) != 0:
            item['title'] = title
            item['time'] = int(time.mktime(time.strptime(tm, '%Y-%m-%d %H:%M')))
            item['url'] = response.url
            cc = ''
            for c in content:
                cc = cc + c + '\n'
            # Assign and yield once, after the whole article body is built
            item['content'] = cc
            yield item

File: settings.py

BOT_NAME = 'crawlAll'
SPIDER_MODULES = ['crawlAll.spiders']
NEWSPIDER_MODULE = 'crawlAll.spiders'
ROBOTSTXT_OBEY = False

DOWNLOAD_DELAY = 3
COOKIES_ENABLED = False

DOWNLOADER_MIDDLEWARES = {
    'crawlAll.useragent.MyUserAgentMiddleware': 400,
    'crawlAll.middlewares.MyCustomDownloaderMiddleware': None,
}
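For reference, mapping a middleware to None in this dict disables it. Note also that the built-in scrapy.downloadermiddlewares.useragent.UserAgentMiddleware is enabled at order 400 by default and calls setdefault('User-Agent', ...), so it can fill in the header before a custom middleware gets a chance. A sketch of a registration that avoids that clash, assuming Scrapy 1.x's default middleware order:

DOWNLOADER_MIDDLEWARES = {
    # Disable the stock middleware so its setdefault('User-Agent', ...)
    # does not claim the header before the custom one runs
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'crawlAll.useragent.MyUserAgentMiddleware': 400,
}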

But it doesn't work. Can anyone help me figure out the problem? Many thanks!

1 Answer:

Answer 0 (score: 0):

In your parse function you never fetch any URL because the inner loop never runs: maxpage is 1, so range(1, self.maxpage) is empty and url is never assigned. Remove the for loop or increase maxpage.

def parse(self, response):
    for ctg in self.category:
        # With the page loop removed, request the first page directly
        url = self.base_url + '/' + ctg + '/p1'
        yield scrapy.Request(url, callback=self.parseNewsHref)
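Alternatively, the loop can stay if maxpage is raised. Two details matter: range(1, maxpage) stops before maxpage, and page is an int, so it must go through str() before concatenation; the yield also has to sit inside the inner loop so every page is requested. A sketch along those lines (the value 3 is just an example):

def parse(self, response):
    maxpage = 3  # range(1, 3) requests pages 1 and 2

    for ctg in self.category:
        for page in range(1, maxpage):
            url = self.base_url + '/' + ctg + '/p' + str(page)
            # Yield inside the inner loop so every page is requested,
            # not only the last URL built
            yield scrapy.Request(url, callback=self.parseNewsHref)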