我从csv文件中删除了超过5,000个链接。
程序将每个元素放入输出csv中的单独行。
如何在每个网站上获得一行,同时包含同一行网站的所有信息?
from scrapy.spider import BaseSpider
from scrapy.contrib.linkextractors import LinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.utils.markup import remove_tags
from scrapy.selector import Selector
from scrapy.selector import HtmlXPathSelector
from bkstr.items import BkstrItem
from scrapy.http import Request
from scrapy.loader.processors import Join
import csv
import sys
class MySpider(CrawlSpider):
name = "book"
my_file = open("callnumberedited3.csv", "rb")
reader = csv.reader(my_file)
start_urls = my_file
custom_settings = {'REDIRECT_ENABLED': False}
handle_httpstatus_list = [301]
download_delay = 0.25
rules = [Rule(LinkExtractor(), follow=True, callback='parse_item')]
def parse_item (self, response):
items = []
varx = response.xpath('//*[@id="fldset-course_1_1"]/div[2]/h2/text()')
for sel in response.xpath('//*[@id="fldset-course_1_1"]/div[2]/h2/text()'):
item = BkstrItem()
item["course"] = varx.extract()[0]
yield item
var1 = response.xpath('//*[@id="fldset-crsmaterialgrp_1"]/div[1]/h3/text()')
for sel in response.xpath('//*[@id="fldset-crsmaterialgrp_1"]/div[1]/h3/text()'):
item = BkstrItem()
item["title1"] = var1.extract()[0]
yield item
var11 = response.xpath('//*[@id="materialAuthor"]/text()')
for sel in response.xpath('//*[@id="materialAuthor"]/text()'):
item = BkstrItem()
item["author1"] = var11.extract()
yield item
var12 = response.xpath('//*[@id="materialISBN"]/text()')
for sel in response.xpath('//*[@id="materialISBN"]/text()'):
item = BkstrItem()
item["isbn1"] = var12.extract()
yield item
var15 = response.xpath('//*[@id="materialCopyrightYear"]/text()')
for sel in response.xpath('//*[@id="materialCopyrightYear"]/text()'):
item = BkstrItem()
item["year1"] = var15.extract()
yield item
var16 = response.xpath('//*[@id="materialPublisher"]/text()')
for sel in response.xpath('//*[@id="materialPublisher"]/text()'):
item = BkstrItem()
item["publisher1"] = var16.extract()
yield item
var2 = response.xpath('//*[@id="fldset-crsmaterialgrp_2"]/div[1]/h3/text()')
for sel in response.xpath('//*[@id="fldset-crsmaterialgrp_2"]/div[1]/h3/text()'):
item = BkstrItem()
item["title2"] = var2.extract()[0]
yield item
var3 = response.xpath('//*[@id="fldset-crsmaterialgrp_3"]/div[1]/h3/text()')
for sel in response.xpath('//*[@id="fldset-crsmaterialgrp_3"]/div[1]/h3/text()'):
item = BkstrItem()
item["title3"] = var3.extract()[0]
yield item
print items