I wrote a Scrapy project to crawl a Chinese website, but I've run into a problem: when I run the project, nothing happens, and I don't know why. Here is the code.
(3) rennes@L:~/L/crawlAll$ tree
.
├── crawlAll
│   ├── __init__.py
│   ├── items.py
│   ├── pipelines.py
│   ├── settings.py
│   ├── spiders
│   │   ├── __init__.py
│   │   └── TouTiao.py
│   └── useragent.py
├── LICENSE
├── README.md
└── scrapy.cfg
File: useragent.py
# -*- coding: utf-8 -*-
#import logging
import random

from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware


class MyUserAgentMiddleware(UserAgentMiddleware):
    def __init__(self, user_agent='Scrapy'):
        super(MyUserAgentMiddleware, self).__init__()
        self.user_agent = user_agent

    def process_request(self, request, spider):
        ua = random.choice(self.user_agent_list)
        if ua:
            #logger = logging.getLogger('')
            print("******Current User Agent :%s***********"),ua
            #logging.warning("Current User Agent:" + ua , logging.INFO)
            request.headers.setdefault('User-Agent', ua)

    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
        "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
        "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
        "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
    ]
File: TouTiao.py
# -*- coding: utf-8 -*-
import scrapy
import json
import time

from crawlAll.items import NewsSpiderItem


class TouTiaoSpider(scrapy.Spider):
    name = "toutiao"
    allowed_domains = ["toutiao.com"]
    start_urls = ['http://www.toutiao.com/articles_news_society/p1/']
    base_cat_url = 'http://www.toutiao.com/articles_news_society'
    base_url = 'http://www.toutiao.com'
    maxpage = 1
    category = [
        'articles_news_society',
    ]

    def parse(self, response):
        for ctg in self.category:
            for page in range(1, self.maxpage):
                url = self.base_url + '/' + ctg + '/p' + page
                yield scrapy.Request(url, callback=self.parseNewsHref)

    def parseNewsHref(self, response):
        urls = response.xpath("//div[@class='info']//a/@href").extract()
        for url in urls:
            new_url = self.base_url + url
            yield scrapy.Request(new_url, callback=self.parseNews)

    def parseNews(self, response):
        articles = response.xpath("//div[@id='article-main']")
        item = NewsSpiderItem()
        title = articles.xpath("//h1/text()").extract()[0]
        tm = articles.xpath("//span[@class='time']/text()").extract()[0]
        content = articles.xpath("//div[@class='article-content']//p/text()").extract()
        if len(title) != 0 and len(tm) != 0 and len(content) != 0:
            item['title'] = title
            item['time'] = int(time.mktime(time.strptime(tm, '%Y-%m-%d %H:%M')))
            item['url'] = response.url
            cc = ''
            if len(content) != 0:
                for c in content:
                    cc = cc + c + '\n'
            item['content'] = cc
            yield item
File: settings.py
BOT_NAME = 'crawlAll'

SPIDER_MODULES = ['crawlAll.spiders']
NEWSPIDER_MODULE = 'crawlAll.spiders'

ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 3
COOKIES_ENABLED = False

DOWNLOADER_MIDDLEWARES = {
    'crawlAll.useragent.MyUserAgentMiddleware': 400,
    'crawlAll.middlewares.MyCustomDownloaderMiddleware': None,
}
But it doesn't work. Can anyone spot the problem? Thanks a lot!
Answer 0 (score: 0)
In your parse method you never fetch any URLs because the loop body never runs: with maxpage = 1, range(1, self.maxpage) is empty, so no requests are ever scheduled. Remove the page loop or increase maxpage.
def parse(self, response):
    for ctg in self.category:
        url = self.base_url + '/' + ctg + '/p1'
        yield scrapy.Request(url, callback=self.parseNewsHref)
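Note that increasing maxpage alone is not enough: self.base_url + '/' + ctg + '/p' + page concatenates a str with an int and raises a TypeError as soon as the loop body runs. Here is a minimal sketch that keeps the loop, assuming you want pages 1 through maxpage for each category:

def parse(self, response):
    for ctg in self.category:
        # range(1, maxpage + 1) includes maxpage, so page 1 is
        # generated even when maxpage == 1.
        for page in range(1, self.maxpage + 1):
            # str(page) avoids the str + int TypeError.
            url = self.base_url + '/' + ctg + '/p' + str(page)
            yield scrapy.Request(url, callback=self.parseNewsHref)

You can then run scrapy crawl toutiao from the project root and watch the log to confirm that requests are actually being scheduled.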