我正在实施一个Spider,该蜘蛛应该从以下网站提取所有内容链接:https://www.accenture.com/us-en/internet-of-things-index 并跟随他们。 x路径表达式很好,但是我没有输出。 请帮忙。
我的蜘蛛:
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.loader import ItemLoader
from accenture.items import AccentureItem
class AccentureSpiderSpider(scrapy.Spider):
name = 'accenture_spider'
start_urls = ['https://www.accenture.com/us-en/internet-of-things-index']
rules = (
Rule(LinkExtractor(restrict_xpaths='.//div[@id="thefutureofartificialintelligence"]//a[@href]|.//div[@id="intelligentautomation"]//a[@href]|//div[@id="cognitiverobotics"]//a[@href]'), callback='parse_item', follow=True),
)
def parse_item(self, response):
loader = ItemLoader(item=AccentureItem(), response=response)
url = response.url
print(url)
content = response.text
loader.add_value('url', url)
loader.add_value('content',content)
yield loader.load_item()
项目:
import scrapy
from scrapy.loader.processors import MapCompose, TakeFirst, Join
from bs4 import BeautifulSoup
def cleanhtml(raw_html):
cleantext = BeautifulSoup(raw_html, "lxml").text
return cleantext
class AccentureItem(scrapy.Item):
url = scrapy.Field()
content = scrapy.Field(
input_processor=MapCompose(cleanhtml),
)