我尝试在 start_urls 中添加多个 URL，但在许多参考资料中都找不到如何在 scrapy 中配置多个起始 URL，
例如同时抓取 tokopedia.com、olx.co.id 等多个网址。
答案 0（得分：0）
试试这个。
from simplified_scrapy import Spider, SimplifiedDoc, SimplifiedMain
class WebsiteSpider(Spider):
    """Spider that crawls multiple sites: list every domain in
    ``allowed_domains`` and every entry URL in ``start_urls``.

    The ``simplified_scrapy`` framework reads the class attributes
    ``name``, ``allowed_domains`` and ``start_urls``, and calls
    ``extract`` once per fetched page.
    """

    name = 'my_spider'
    # To crawl several sites, simply list each domain and each start URL.
    allowed_domains = ['example1.com','example2.com']
    start_urls = ['https://www.example1.com','https://www.example2.com']
    # refresh_urls = True  # For debug. If refresh_urls = True, start_urls will be crawled again.

    def extract(self, url, html, models, modelNames):
        """Parse one fetched page.

        Parameters: ``url`` is a dict describing the fetched page (its
        'url' key holds the page address); ``html`` is the raw page
        source. ``models``/``modelNames`` are framework-supplied and
        unused here.

        Returns a dict the framework understands: 'Urls' is the list of
        links to queue for subsequent crawling, 'Data' is the extracted
        target data.
        """
        doc = SimplifiedDoc(html)
        # Collect every <a> link on the page so the framework can crawl them next.
        lstA = doc.listA(url=url["url"])
        # Target data: here just the page title.
        data = [{"title": doc.title.text}]
        return {"Urls": lstA, "Data": data}
SimplifiedMain.startThread(WebsiteSpider())