我正在使用scrapy从tripadvisor中搜集航空公司的评论数据。首先,我提取所有id和相应的航空公司名称。我使用以下代码仅抓取包含'Airline_Review'的链接。
import scrapy
from scrapy.selector import HtmlXPathSelector
from scrapy.spider import BaseSpider
from scrapy.http import Request
from scrapy.contrib.spiders import Rule
from scrapy.contrib.linkextractors import LinkExtractor
URL = 'http://www.tripadvisor.com'
class QuotesSpider(scrapy.Spider):
name = "quotes"
def start_requests(self):
urls = [
'https://www.tripadvisor.com/Airlines'
]
rules = (
Rule(
LinkExtractor(allow = '.*Airline_Review.*'),
callback = 'parse',
follow= True
)
)
for url in urls:
yield scrapy.Request(url=url, callback=self.parse)
def parse(self, response):
hxs = HtmlXPathSelector(response)
for url in hxs.select('//a/@href').extract():
if not ( url.startswith('http://') or url.startswith('https://') ):
url= URL + url
url = url.strip()
with open('op.txt', 'a') as f:
f.write(url)
f.write('\n')
yield Request(url, callback=self.parse)
但我仍在我的op.txt文件中获取不包含'Airline_Review'的链接。我的代码中有什么不对?