我对this link使用scrapy。我想从网站imdb.com抓取信息电影。
当我使用代码XPath
时//td[@class="overview-top"]
我收到了电影的列表信息。
这是我的代码。
import scrapy
import sys
from imbd.items import ImbdItem
class ImbdSpiderSpider(scrapy.Spider):
name = "imbd_spider"
allowed_domains = ["imdb.com"]
start_urls = ()
def parse(self, response):
print response.url
title_movie= response.xpath('//td[@class="overview-top"]/h4/a/text()').extract()
length_title = len(title_movie)
if(length_title == 0):
return
# $x()
# read block
#print block[0].xpath('//*[@id="main"]/div/div[2]/div[4]/table/tbody/tr[1]/td[2]/h4/a').extract()
#print block[1]
#print block[1].xpath('//td[@class="overview-top"]/h4/a/text()').extract()
tree = response.xpath('//td[@class="overview-top"]')
i = 0
for block in tree:
#print table
title = block.xpath('//h4[@itemprop="name"]/a/text()').extract()
# author = block.xpath('//span[@itemprop="director"]/span/a/text()')[i].extract()
# rate = block.xpath('//div[@class="metascore no_ratings"]/strong/text()')[i].extract()
# time = block.xpath('//time[@itemprop="duration"]/text()')[i].extract()
# tag = block.xpath('//span[@itemprop="genre"]/text()').extract()
# des = block.xpath('//div[@class="outline"]/text()')[i].extract()
print title
# print author
# print rate
# print time
# print tag
# print des
#i = i + 1
# page = response.xpath('//div[@class="sort"]/a')
# page2 = page.xpath('//a[text()="Next"]/@href')
# nextpage = "http://www.imdb.com" + page2[0].extract() # /movie-coming-soon..
# yield scrapy.Request(nextpage, self.parse)
def start_requests(self):
start = "http://www.imdb.com/movies-coming-soon/2017-12/"
yield self.make_requests_from_url(start)
我希望终端显示一个标题,但这会显示列表中的所有标题。
我的代码:https://github.com/Takehashi/Scrapy-imbd.com/tree/master
答案 0 :(得分:1)
只需添加"。"在第一个避免重复!
item['title'] = block.xpath('.//h4[@itemprop="name"]/a/text()').extract()
item['author'] = block.xpath('.//span[@itemprop="director"]/span/a/text()').extract()
item['rate'] = block.xpath('.//div[@class="metascore no_ratings"]/strong/text()').extract()
item['time'] = block.xpath('.//time[@itemprop="duration"]/text()').extract()
item['tag'] = block.xpath('.//span[@itemprop="genre"]/text()').extract()
item['des'] = block.xpath('.//div[@class="outline"]/text()').extract()