I want to add a second spider to my Scrapy project, and I'd like to put it in the same Python file as the other spider and have it use the same settings and everything. But I'm having a hard time figuring this out, because when I go to create the new spider, it creates new settings for the new spider, and so on.
from scrapy import Spider, Request
from scrapy_splash import SplashRequest

# Spider class
class MySpider(Spider):
    # Name of Spider
    name = 'splash_spider'

    # getting all the url + ip address + useragent pairs then request them
    def start_requests(self):
        # get the file path of the csv file that contains the pairs from the settings.py
        with open(self.settings["PROXY_CSV_FILE"], mode="r") as csv_file:
            # requests is a list of dictionaries like this -> {url: str, ua: str, ip: str}
            # (process_csv is a helper defined elsewhere in the project)
            requests = process_csv(csv_file)
            for i, req in enumerate(requests):
                x = len(requests) - i
                # Return needed url with set delay of 3 seconds
                yield SplashRequest(url=req["url"], callback=self.parse, args={"wait": 3},
                                    # Pair with user agent specified in csv file
                                    headers={"User-Agent": req["ua"]},
                                    # Sets splash_url to whatever the current proxy that goes
                                    # with the current URL is, instead of the actual splash url
                                    splash_url=req["ip"],
                                    priority=x,
                                    meta={'priority': x})  # <- check here!!

    # Scraping function that will scrape URLs for specified information
    def parse(self, response):
        # parse for first spider
        pass

# The second spider I want to add (currently commented out):
#class LoginSpider(scrapy.Spider):
#    name = 'login_spider'
#    my_urls = ['https://www.starcitygames.com/myaccount/']
#
#    def start_requests(self):
#        for url in self.my_urls:
#            yield Request(url, meta={'proxy': 'http://199.89.192.97:8050'})
#
#    def parse(self, response):
#        # parse for second spider
Answer 0 (score: 0)
The only way is to have a BaseSpider class, put the custom_settings there, and then create the two spiders inheriting from that BaseSpider.
import scrapy
from scrapy import Request
from scrapy_splash import SplashRequest

class BaseSpider(scrapy.Spider):
    custom_settings = {
        'CONCURRENT_REQUESTS': 100,
        # and other settings
    }

class MySpider(BaseSpider):
    # Name of Spider
    name = 'splash_spider'

    # getting all the url + ip address + useragent pairs then request them
    def start_requests(self):
        # get the file path of the csv file that contains the pairs from the settings.py
        with open(self.settings["PROXY_CSV_FILE"], mode="r") as csv_file:
            # requests is a list of dictionaries like this -> {url: str, ua: str, ip: str}
            # (process_csv is a helper defined elsewhere in the project)
            requests = process_csv(csv_file)
            for i, req in enumerate(requests):
                x = len(requests) - i
                # Return needed url with set delay of 3 seconds
                yield SplashRequest(url=req["url"], callback=self.parse, args={"wait": 3},
                                    # Pair with user agent specified in csv file
                                    headers={"User-Agent": req["ua"]},
                                    # Sets splash_url to whatever the current proxy that goes
                                    # with the current URL is, instead of the actual splash url
                                    splash_url=req["ip"],
                                    priority=x,
                                    meta={'priority': x})  # <- check here!!

    # Scraping function that will scrape URLs for specified information
    def parse(self, response):
        # parse for first spider
        pass

class LoginSpider(BaseSpider):
    name = 'login_spider'
    my_urls = ['https://www.starcitygames.com/myaccount/']

    def start_requests(self):
        for url in self.my_urls:
            yield Request(url, meta={'proxy': 'http://199.89.192.97:8050'})

    def parse(self, response):
        # parse for second spider
        pass
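If you also want to launch both spiders from the same script, Scrapy's CrawlerProcess can run them in one process. A minimal sketch, assuming both spider classes are importable where it runs (the run.py name is just for illustration); each spider's custom_settings from BaseSpider still applies on top of the project settings:

# run.py -- illustrative launcher
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl(MySpider)     # each crawl() call schedules one spider
process.crawl(LoginSpider)  # both share the project-wide settings
process.start()             # blocks until both spiders have finished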
Answer 1 (score: 0)
I have a global settings dictionary in my settings file, and I then update that dictionary with each spider's custom settings.
settings.py
Text("Hello world!").modifier(RedTitle())
global main_settings
main_settings = {
    'ITEM_PIPELINES': {
        'pipelines.MainPipeline': 90,
    },
    'CONCURRENT_REQUESTS': 100,
    'CONCURRENT_REQUESTS_PER_IP': 100,
    'ROBOTSTXT_OBEY': False,
    'CONCURRENT_ITEMS': 300,
    'REACTOR_THREADPOOL_MAXSIZE': 150,
    'LOG_LEVEL': 'INFO',
    'RETRY_ENABLED': False,
    'DONT_RETRY': True,
    'RETRY_TIMES': 0,
    'COOKIES_ENABLED': False,
    'REDIRECT_MAX_TIMES': 0,
    'DOWNLOAD_FAIL_ON_DATALOSS': False,
    'DNS_TIMEOUT': 60,
    'LOG_STDOUT': True,
    'DOWNLOADER_STATS': False,
}
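Each spider can then merge its own overrides into that dictionary. A minimal sketch of the spiders.py side, assuming main_settings is importable from the project's settings module; the LOG_LEVEL override is only a placeholder, not from the original answer:

# spiders.py -- illustrative sketch
import scrapy
from settings import main_settings  # assumes settings.py is on the import path

class MySpider(scrapy.Spider):
    name = 'splash_spider'
    # copy the global dict and layer this spider's own overrides on top
    custom_settings = {**main_settings, 'LOG_LEVEL': 'DEBUG'}

class LoginSpider(scrapy.Spider):
    name = 'login_spider'
    # this spider reuses the shared settings unchanged
    custom_settings = {**main_settings}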