Below is the spider I created to fetch all the links on NecToday.com:
import socket
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector

class PropertiesItem(scrapy.Item):
    # Primary fields
    title = scrapy.Field()
    url = scrapy.Field()

class NecSpider(CrawlSpider):
    name = "NecSpider"
    #allowed_domains = ["nectoday.com"]
    start_urls = ["http://nectoday.com"]
    rules = (
        Rule(SgmlLinkExtractor(allow=(), restrict_xpaths=('//a',)),
             callback="parse_items", follow=True),
    )

    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        print(response.url)
        item = PropertiesItem()
        item["title"] = response.xpath("//title/text()").extract()
        item["url"] = response.url
        return item
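(Side note: `scrapy.contrib` and `SgmlLinkExtractor` were deprecated in Scrapy 1.0 and removed in later releases; on a recent Scrapy the equivalent imports would be roughly the two lines below, and `HtmlXPathSelector` is unnecessary because `response.xpath()` is available directly.)

# Rough modern-Scrapy equivalents (assumption: Scrapy >= 1.0)
from scrapy.spiders import CrawlSpider, Rule     # replaces scrapy.contrib.spiders
from scrapy.linkextractors import LinkExtractor  # replaces SgmlLinkExtractor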
This code starts fetching all the links on the site. Some pages also contain YouTube links. The problem is that after crawling the first YouTube link, it starts crawling the other YouTube links referenced from that first one.
I want to crawl the first YouTube link but none of the others. YouTube is just an example; tomorrow it could be another site. How can I achieve this?
Answer 0 (score: 2)
Why not try something like this:
start_urls = ["http://nectoday.com"]

def parse(self, response):
    # parse whatever you need
    for url in response.selector.xpath('//@href').extract():
        url = response.urljoin(url)  # hrefs may be relative
        if 'youtube.com' in url:
            yield scrapy.Request(url, callback=self.parse_no_follow)
        else:
            yield scrapy.Request(url, callback=self.parse)

def parse_no_follow(self, response):
    # parse whatever you want and do not follow any more links
    pass
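Since YouTube is only an example here, a more general variant (my sketch, not part of the original answer; the class name and domain set are made up) keeps a configurable set of "one hop only" domains and stops following links from any page on those domains:

from urllib.parse import urlparse  # Python 3

import scrapy

class OneHopSpider(scrapy.Spider):
    # Follow links normally, but pages whose host is in NO_FOLLOW_DOMAINS
    # are scraped once and their outgoing links are ignored.
    name = "one_hop"
    start_urls = ["http://nectoday.com"]

    # Domains to visit exactly one hop deep; extend as needed.
    NO_FOLLOW_DOMAINS = {"youtube.com", "www.youtube.com"}

    def parse(self, response):
        # ... extract whatever items you need here ...
        for href in response.xpath('//a/@href').extract():
            url = response.urljoin(href)
            if urlparse(url).netloc in self.NO_FOLLOW_DOMAINS:
                yield scrapy.Request(url, callback=self.parse_no_follow)
            else:
                yield scrapy.Request(url, callback=self.parse)

    def parse_no_follow(self, response):
        # Scrape this page, but do not yield any further requests.
        pass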
Answer 1 (score: 0)
This will only scrape pages from the domains you allow:
import html2text  # third-party: pip install html2text
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

class QuotesSpider(CrawlSpider):
    name = "your app name"
    n = 0
    allowed_domains = ['domain']
    start_urls = ['anywebpage']
    rules = (
        Rule(LinkExtractor(), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        QuotesSpider.n = QuotesSpider.n + 1
        if len(response.body) > 100:
            h = html2text.HTML2Text()
            h.ignore_links = True
            h.ignore_images = True
            h.body_width = 0
            dd = response.body.decode("utf-8")
            init = dd.find("<p>")
            while init > 0:
                end = dd.find("</p>", init)
                if end > 0:
                    o = h.handle(dd[init:end + 4] + "\n")
                    supersentences = o.split('\n')
                    # ... process supersentences here ...
                    init = dd.find("<p>", end)  # advance to the next <p>
                else:
                    break
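A couple of usage notes on this answer (my additions, assuming the standard Scrapy API): html2text must be installed and imported as shown above, and a spider like this can be run without a full Scrapy project via CrawlerProcess:

# Minimal runner sketch (assumes the QuotesSpider above is defined in scope).
from scrapy.crawler import CrawlerProcess

process = CrawlerProcess(settings={"DEPTH_LIMIT": 2})  # optional: cap crawl depth
process.crawl(QuotesSpider)
process.start()  # blocks until the crawl finishes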