我是使用Scrapy和XPath的新手。我用XPath编写了一个蜘蛛。它正在检索数据,但数据全部搞砸了。代码有什么问题?大多数字段似乎返回了一些内容,但数据位于错误的列中。一切都搞砸了。标题不会出现在“标题”中。专栏等。
将数据导出到csv文件。
import scrapy
class suumotest(scrapy.Spider):
name="newselector"
start_urls = [
'https://suumo.jp/jj/chintai/ichiran/FR301FC005/?tc=0401303&tc=0401304&ar=020&bs=040&pn=1'
]
def parse(self, response):
#for following property link
for href in response.xpath('//a[@class="js-cassetLinkHref"]//@href').extract():
yield scrapy.Request(response.urljoin(href),
callback=self.parse_info)
#following pagination links
#defining parser to extract data
def parse_info(self, response):
def extract_with_xpath(query):
return response.xpath(query).extract(' ').strip()
yield {
'Title':extract_with_xpath('/html/body/div[4]/div[3]/h1/text()'),
'Rent':extract_with_xpath('/html/body/div[4]/div[3]/div[1]/div[2]/div[1]/table/tbody/tr/td[1]/div/div[1]/span/text() | /html/body/div[4]/div[3]/div[1]/div[2]/div[1]/table/tbody/tr/td[1]/div/div[2]/span/text()'),
'Floor Details':extract_with_xpath('/html/body/div[4]/div[3]/div[1]/div[2]/div[1]/table/tbody/tr/td[3]/div/div[1]/text()|/html/body/div[4]/div[3]/div[1]/div[2]/div[1]/table/tbody/tr/td[3]/div/div[2]/text()'),
'Prop Address':extract_with_xpath('/html/body/div[4]/div[3]/div[1]/div[2]/div[1]/table/tbody/tr/td[5]/div/div[1]/text()'),
#Beginning Of Property Description
'Property_ Floor Details':extract_with_xpath('/html/body/div[4]/div[4]/div[5]/table/tbody/tr[1]/td[1]/text()'),
'Property_ Construction':extract_with_xpath('/html/body/div[4]/div[4]/div[5]/table/tbody/tr[1]/td[2]/text()'),
'Property_ Story':extract_with_xpath('/html/body/div[4]/div[4]/div[5]/table/tbody/tr[2]/td[1]/text()'),
'Property_ Construction Date':extract_with_xpath('/html/body/div[4]/div[4]/div[5]/table/tbody/tr[2]/td[2]/text()'),
'Property_ Insurance':extract_with_xpath('/html/body/div[4]/div[4]/div[5]/table/tbody/tr[3]/td[1]/text()'),
'Property_ Parking Lot':extract_with_xpath('/html/body/div[4]/div[4]/div[5]/table/tbody/tr[3]/td[2]/text()'),
'Property_ Moving In':extract_with_xpath('/html/body/div[4]/div[4]/div[5]/table/tbody/tr[4]/td[1]/text()'),
'Property_ Transaction Type':extract_with_xpath('/html/body/div[4]/div[4]/div[5]/table/tbody/tr[4]/td[2]/text()'),
'Property_ Conditions':extract_with_xpath('/html/body/div[4]/div[4]/div[5]/table/tbody/tr[5]/td[1]'),
'Property_ Handling Store Property Code':extract_with_xpath('/html/body/div[4]/div[4]/div[5]/table/tbody/tr[5]/td[2]/text()'),
'Property_ Sumo Property Code':extract_with_xpath('/html/body/div[4]/div[4]/div[5]/table/tbody/tr[6]/td[1]/text()'),
'Property_ Total Number Of Houses':extract_with_xpath('/html/body/div[4]/div[4]/div[5]/table/tbody/tr[6]/td[2]/text()'),
'Property_ Acting As A Guarantor':extract_with_xpath('/html/body/div[4]/div[4]/div[5]/table/tbody/tr[7]/td/ul/li/text()'),
'Property_ Area Information':extract_with_xpath('/html/body/div[4]/div[4]/div[5]/table/tbody/tr[9]/td[1]/ul/text()'),
#Property Description End
#Handling Store Information
'Handling Store_ Name':extract_with_xpath('/html/body/div[4]/div[4]/div[7]/div/div/p[1]/a/img/text()'),
'Handling Store_ Contact':extract_with_xpath('/html/body/div[4]/div[4]/div[7]/div/div/p[2]/span[1]/text()'),
'Handling Store_ Address':extract_with_xpath('/html/body/div[4]/div[4]/div[7]/div/div/div/ul/li[1]/text()'),
'Handling Store_ Opening Hours':extract_with_xpath('/html/body/div[4]/div[4]/div[7]/div/div/div/ul/li[2]/text()'),
'Handling Store_ Closed':extract_with_xpath('/html/body/div[4]/div[4]/div[7]/div/div/div/ul/li[3]/text()'),
'Handling Store_ License Number':extract_with_xpath('/html/body/div[4]/div[4]/div[7]/div/div/div/ul/li[4]/text()')
}