我在 Scrapy 中创建了一个爬虫(spider)。items.py:
from scrapy.item import Item, Field
class dns_shopItem(Item):
    """Scrapy item declaring the two fields the spider fills: ``id`` and ``idd``."""

    # Both fields are plain ``Field()`` declarations with no extra metadata.
    # The spider populates each of them from an <h1> text XPath.
    id = Field()
    idd = Field()
dns_shop_spider.py:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.loader.processor import TakeFirst
from scrapy.contrib.loader import XPathItemLoader
from scrapy.selector import HtmlXPathSelector
from dns_shop.items import dns_shopItem
class dns_shopLoader(XPathItemLoader):
    """XPath item loader that uses ``TakeFirst`` as its default output processor."""

    # With TakeFirst, each loaded field ends up holding a single value
    # (the first non-empty one collected) instead of a list of matches.
    default_output_processor = TakeFirst()
class dns_shopSpider(CrawlSpider):
    """Crawl the Clear Sky file section of www.playground.ru.

    The crawl starts at the section index, extracts links whose URL matches
    ``/files/s_t_a_l_k_e_r_chistoe_nebo`` and is meant to build one
    ``dns_shopItem`` per matching page via ``parse_item``.
    """

    name = "dns_shop_spider"
    allowed_domains = ["www.playground.ru"]
    start_urls = ["http://www.playground.ru/files/stalker_clear_sky/"]

    # ``allow`` is a regular expression matched against each extracted URL,
    # so it has to be written exactly as the path appears in the URL.
    #
    # NOTE(review): both rules use the same link extractor. CrawlSpider
    # applies only the FIRST rule that matches a given link, so every link is
    # handled by the ``follow=True`` rule and the second rule -- the only one
    # with a callback -- is never used. ``parse_item`` therefore never runs
    # and no items reach the output file. Merging the two into a single
    # ``Rule(..., callback='parse_item', follow=True)`` avoids this.
    rules = (
        Rule(SgmlLinkExtractor(allow='/files/s_t_a_l_k_e_r_chistoe_nebo'), follow=True),
        Rule(SgmlLinkExtractor(allow='/files/s_t_a_l_k_e_r_chistoe_nebo'), callback='parse_item'),
    )

    def parse_item(self, response):
        """Load a ``dns_shopItem`` from the page's ``<h1>`` text.

        Args:
            response: The downloaded page handed over by Scrapy.

        Returns:
            The item built by ``dns_shopLoader``. Both ``id`` and ``idd`` are
            read from the same ``<h1>`` location (absolute path vs. ``//``
            path).
        """
        hxs = HtmlXPathSelector(response)
        loader = dns_shopLoader(dns_shopItem(), hxs)
        # NOTE(review): these absolute paths include ``tbody`` steps. Browsers
        # commonly insert <tbody> elements that are absent from the raw HTML
        # Scrapy receives -- confirm the paths against the page source.
        loader.add_xpath(
            'id',
            "/html/body/table[2]/tbody/tr[5]/td[2]/table/tbody/tr/td/div[6]/h1/text()",
        )
        loader.add_xpath(
            'idd',
            "//html/body/table[2]/tbody/tr[5]/td[2]/table/tbody/tr/td/div[6]/h1/text()",
        )
        return loader.load_item()
运行以下命令:
scrapy crawl dns_shop_spider -o scarped_data_utf8.csv -t csv
日志显示 Scrapy 访问了所有需要的 URL,但爬虫运行后却没有向指定的文件写入任何内容。可能是什么问题?
答案 0 :(得分:2)
假设您想跟进(follow)页面 http://www.playground.ru/files/stalker_clear_sky/ 上的所有链接,并获取每个页面的标题、网址和下载链接:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.loader.processor import TakeFirst
from scrapy.contrib.loader import XPathItemLoader
from scrapy.selector import HtmlXPathSelector
from scrapy.item import Item, Field
class PlayGroundItem(Item):
    """Scrapy item holding what the spider collects for one file page."""

    # Text of the page's <h1> heading.
    title = Field()
    # URL of the page the item was scraped from.
    url = Field()
    # ``href`` of the download link found on the page.
    download_url = Field()
class PlayGroundLoader(XPathItemLoader):
    """XPath item loader that uses ``TakeFirst`` as its default output processor."""

    # Each field ends up as a single value (the first non-empty match)
    # rather than a list of every match.
    default_output_processor = TakeFirst()
class PlayGroundSpider(CrawlSpider):
    """Crawl the Clear Sky file pages on www.playground.ru and emit one item per page.

    Links matching ``/files/s_t_a_l_k_e_r_chistoe_nebo`` are both followed
    and passed to ``parse_item``.
    """

    name = "playground_spider"
    allowed_domains = ["www.playground.ru"]
    start_urls = ["http://www.playground.ru/files/stalker_clear_sky/"]

    # One rule does both jobs: it keeps following matching links and hands
    # every fetched page to the callback.
    rules = (
        Rule(
            SgmlLinkExtractor(allow='/files/s_t_a_l_k_e_r_chistoe_nebo'),
            callback='parse_item',
            follow=True,
        ),
    )

    def parse_item(self, response):
        """Build a ``PlayGroundItem`` from a downloaded file page.

        Args:
            response: The downloaded page handed over by Scrapy.

        Returns:
            The loaded item with ``url``, ``title`` and ``download_url``.
        """
        selector = HtmlXPathSelector(response)
        loader = PlayGroundLoader(PlayGroundItem(), selector)

        # The page address comes straight from the response, not from the markup.
        loader.add_value('url', response.url)
        loader.add_xpath(
            'title',
            "//div[@class='downloads-container clearfix']/h1/text()",
        )
        loader.add_xpath(
            'download_url',
            "//div[@class='files-download-holder']/div/a/@href",
        )
        return loader.load_item()
将其保存为 test_scrapy.py,
并通过以下方式运行:
scrapy runspider test_scrapy.py -o output.json
然后检查 output.json 的内容。
希望有所帮助。