我想从 allocine.fr 的页面中提取以下字段:电影名称、导演姓名和演员姓名。
这将帮助我制作一个模板,以便之后抓取更多内容。
这是我目前效果不佳的代码(位于 spiders 目录内):
from scrapy.contrib.spiders import CrawlSpider, Rule
from cinefil.items import Article
#from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor ==> deprecated
from scrapy.linkextractors import LinkExtractor
from scrapy import log
class CinefilSpider(CrawlSpider):
    """Spider that prints the "meta-body-item" blocks of allocine.fr film pages.

    Crawling starts from a single film page. Links matching the ``/film/``
    pattern in ``rules`` are sent to :meth:`parse_item`; the rule is declared
    with ``follow=False``.
    """

    name = "cinefil"
    allowed_domains = ["allocine.fr"]
    start_urls = ["http://www.allocine.fr/film/fichefilm_gen_cfilm=29007.html"]

    rules = [
        Rule(
            LinkExtractor(allow=('(/film/)((?!:).)*$')),
            callback="parse_item",
            follow=False,
        ),
    ]

    def parse_item(self, response):
        """Print each ``<div class="meta-body-item">`` and return a new Article.

        The returned item has no fields assigned; the method only dumps the
        matching HTML fragments to standard output.
        """
        article = Article()
        meta_blocks = response.xpath('//div[@class="meta-body-item"]').extract()
        for html_fragment in meta_blocks:
            # Separator line printed before every fragment.
            print("\n----- ------ ------ -------- ---------")
            print(html_fragment)
        return article
答案 0 :(得分:1)
电影名称
# The title is the text node of <div class="titlebar-title titlebar-title-lg">;
# extract_first() gives back a single string, as the output below shows.
>>> movie=response.xpath('//div[@class="titlebar-title titlebar-title-lg"]/text()').extract_first()
>>> movie
u'Spider-Man'
导演姓名
#start from
#<span itemprop="director">
#<a>
#<span itemprop="name">
>>> director=response.xpath('//span[@itemprop="director"]/a/span[@itemprop="name"]/text()').extract()
>>> director
u'Sam Raimi'
演员姓名
# Use the <span> whose text contains "Avec" as a landmark and collect the text
# of the <span> elements that follow it as siblings.
>>> movie_stars=response.xpath('//span[contains(text(),"Avec")]/following-sibling::span/text()').extract()
>>> movie_stars
[u'Tobey Maguire', u'Willem Dafoe', u'Kirsten Dunst', u' plus ']
# The trailing u' plus ' entry is not an actor name; pop() removes the last
# element of the list, which drops it here.
>>> movie_stars.pop()
u' plus '
>>> movie_stars
[u'Tobey Maguire', u'Willem Dafoe', u'Kirsten Dunst']
items.py 应声明为:
import scrapy
class Movie(scrapy.Item):
    """Scrapy item holding the fields scraped for one film."""

    # Film title.
    name = scrapy.Field()
    # Director name.
    director = scrapy.Field()
    # Actor names.
    actors = scrapy.Field()