I am trying to crawl a site with Scrapy and collect all the links from its pages.
When I run it from the terminal as scrapy crawl crawl1 -o items.csv -t csv,
I can see from the log that it does crawl and pick up links, as shown below, but nothing is written to the specified output file.
2016-12-05 16:17:33 [scrapy] DEBUG: Crawled (200) <GET http://www.abof.com/men/new-in/footwear> (referer: http://www.abof.com/)
2016-12-05 16:17:33 [scrapy] DEBUG: Crawled (200) <GET http://www.abof.com/> (referer: http://www.abof.com/)
2016-12-05 16:17:33 [scrapy] DEBUG: Crawled (200) <GET http://www.abof.com/skult> (referer: http://www.abof.com/)
I have also tried the suggestions from Scrapy does not write data to a file, with no luck. Here is my spider:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from crawl.items import CrawlItem
from scrapy.contrib.loader import XPathItemLoader
from scrapy.contrib.loader.processor import TakeFirst


class CrawlLoader(XPathItemLoader):
    default_output_processor = TakeFirst()


class MySpider(CrawlSpider):
    name = "crawl1"
    allowed_domains = ["www.abof.com"]
    start_urls = ["http://www.abof.com/"]

    # follow=True
    rules = (Rule(SgmlLinkExtractor(allow=()), callback="parse_items"),)

    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        titles = hxs.xpath('//span[@class="pl"]')
        items = []
        l = CrawlLoader(CrawlItem(), hxs)
        for title in titles:
            item = CrawlItem()
            # l.add_value("url", response.url)
            # l.add_xpath("title", title.xpath("a/text()").extract())
            # l.add_xpath("link", title.xpath("a/@href").extract())
            item["title"] = title.xpath("a/text()").extract()
            item["url"] = title.xpath("a/@href").extract()
            items.append(item)
        return items
        # return l.load_item()
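If the CSV comes out empty even though pages are being crawled, the likeliest cause is that //span[@class="pl"] matches nothing on abof.com, so parse_items returns an empty list for every response and the feed exporter has no items to write. Below is a minimal sketch of the same spider against the non-deprecated module paths (scrapy.spiders and scrapy.linkextractors replaced scrapy.contrib in Scrapy 1.0), yielding one item per link instead of relying on that site-specific span class; the //a[@href] selector is an assumption for illustration, not taken from the original code:

# Sketch only: assumes Scrapy 1.x+ and the CrawlItem from items.py below.
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

from crawl.items import CrawlItem


class MySpider(CrawlSpider):
    name = "crawl1"
    allowed_domains = ["www.abof.com"]
    start_urls = ["http://www.abof.com/"]

    # follow=True keeps the crawl going from every page it visits.
    rules = (Rule(LinkExtractor(allow=()), callback="parse_items", follow=True),)

    def parse_items(self, response):
        # Yield one item per anchor so every link becomes a CSV row.
        for anchor in response.xpath("//a[@href]"):
            item = CrawlItem()
            item["title"] = anchor.xpath("text()").extract_first()
            item["url"] = anchor.xpath("@href").extract_first()
            yield item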
items.py
import scrapy


class CrawlItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    url = scrapy.Field()
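The commented-out loader calls in the spider above hint at the intended pattern. Here is a hedged sketch of the same idea with the modern ItemLoader (the XPathItemLoader replacement), assuming Scrapy 1.x import paths; TakeFirst() makes each field a single value rather than a list, which exports more cleanly to CSV:

# Sketch: loader-based variant, assuming Scrapy 1.x import paths.
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst  # itemloaders.processors in Scrapy 2.x

from crawl.items import CrawlItem


class CrawlLoader(ItemLoader):
    default_output_processor = TakeFirst()


# This belongs inside MySpider as its parse_items method.
def parse_items(self, response):
    for span in response.xpath('//span[@class="pl"]'):
        loader = CrawlLoader(item=CrawlItem(), selector=span)
        loader.add_xpath("title", "a/text()")
        loader.add_xpath("url", "a/@href")
        yield loader.load_item()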
Any help is appreciated.
Answer 0 (score: 0)
Got it working by changing the parse_items function as shown below. Now I am trying to parse images and other data rendered by JavaScript.
class CrawlLoader(XPathItemLoader):
    default_output_processor = TakeFirst()


class MySpider(CrawlSpider):
    name = "crawl1"
    allowed_domains = ["www.abof.com"]
    start_urls = ["http://www.abof.com/"]

    # follow=True makes the spider keep crawling links from every page it visits.
    rules = (Rule(SgmlLinkExtractor(allow=()), callback="parse_items", follow=True),)

    def parse_items(self, response):
        # One item per crawled page: the page <title> and its URL.
        href = CrawlItem()
        href["url"] = response.url
        href["title"] = response.xpath("//title/text()").extract()
        return href