我想无限期地抓取这些网站，但我的爬虫不会跟踪外部链接。我该怎么做？
from scrapy.spiders import CrawlSpider, Rule
from dirbot.settings import *
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
from scrapy.item import Item, Field
from urlparse import urlparse
class MyItem(Item):
    """Scrapy item holding the URLs extracted from a crawled page."""

    # List of normalized 'scheme://netloc/' roots found on the page.
    url = Field()
class someSpider(CrawlSpider):
    """Broad crawler that follows every link it encounters, including
    links to external domains, indefinitely.

    Key point: ``allowed_domains`` must NOT be set at all. Scrapy's
    OffsiteMiddleware does not understand wildcards -- ``['*']`` is
    treated as the literal domain ``*``, which matches nothing and so
    blocks every request. Omitting the attribute disables the offsite
    filter entirely, letting the spider follow external links.
    """

    name = 'infinit'
    start_urls = ['https://www.example.com']
    # allow=() places no restriction on extracted links; follow=True keeps
    # the crawl going from every page that this rule matches.
    rules = (Rule(LxmlLinkExtractor(allow=()), callback='parse_obj', follow=True),)

    def parse_obj(self, response):
        """Extract the 'scheme://netloc/' root of every link on the page,
        store each root via insert_table(), and yield them as one item.

        The original version built a MyItem but never populated or
        yielded it; now the collected roots are returned to the Scrapy
        pipeline as well.
        """
        item = MyItem()
        item['url'] = []
        for link in LxmlLinkExtractor().extract_links(response):
            parsed_uri = urlparse(link.url)
            url = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
            item['url'].append(url)
            # NOTE(review): insert_table presumably comes from the
            # `from dirbot.settings import *` star import -- confirm.
            insert_table(url)
        yield item
谢谢！