我正在编写一个网页抓取程序。它可以正常运行,但输出数据的顺序不对。有人指出这可能是因为我的爬虫(spider)是递归爬虫。那么,我该如何把它改成非递归爬虫呢?
# Spider class
class MySpider(Spider):
    """Spider that fetches listing pages through Splash and scrapes card rows.

    Each start URL is read from a CSV file together with the user agent and
    the Splash/proxy address to use for it. Every table row on a listing page
    becomes one ``GameItem``; pagination is followed via the "- Next>>" link.

    NOTE(review): request priorities only influence the order in which the
    scheduler hands requests to the downloader. With more than one request
    in flight, responses from different start URLs can still interleave in
    the output. If strict per-URL ordering is required, concurrency
    presumably has to be limited in the project settings -- confirm against
    the Scrapy settings documentation.
    """

    # Name of Spider
    name = 'splash_spider'

    def start_requests(self):
        """Yield one ``SplashRequest`` per (url, user agent, ip) row of the CSV.

        The CSV path is taken from the ``PROXY_CSV_FILE`` setting. Earlier
        rows get a higher priority (``len(requests) - index``), and the same
        value is stored in ``meta['priority']`` so ``parse`` can reuse it for
        the follow-up pagination requests.
        """
        # The pairs are loaded into a list first, so the file can be closed
        # before any request is yielded instead of staying open for the
        # whole lifetime of this generator.
        with open(self.settings["PROXY_CSV_FILE"], mode="r") as csv_file:
            # requests is a list of dictionaries like this -> {url: str, ua: str, ip: str}
            requests = process_csv(csv_file)

        total = len(requests)
        for index, req in enumerate(requests):
            priority = total - index
            yield SplashRequest(
                url=req["url"],
                callback=self.parse,
                # Ask Splash to wait 3 seconds before returning the page.
                args={"wait": 3},
                # Pair with the user agent specified in the CSV file.
                headers={"User-Agent": req["ua"]},
                # Use the address paired with this URL in the CSV instead of
                # the globally configured Splash URL.
                splash_url=req["ip"],
                priority=priority,
                meta={'priority': priority},
            )

    def parse(self, response):
        """Scrape every card row of a listing page, then follow the next page.

        Yields one ``GameItem`` per ``tr`` whose class starts with
        ``deckdbbody``. A row without its own card name (a second condition
        of the same card) inherits the name of the previous row. If a
        "- Next>>" link exists, a follow-up request with the same priority
        is yielded last.
        """
        # The category is page-wide, so extract it once and copy it to each item.
        category = response.css("span.titletext::text").get()
        # Name of the most recent row that carried one; reused for rows without.
        saved_name = ""

        for game in response.css("tr[class^=deckdbbody]"):
            raw_name = game.css("a.card_popup::text").get()
            # Strip before falling back, so a missing OR whitespace-only name
            # keeps the previous card name instead of replacing it with "".
            saved_name = (raw_name or "").strip() or saved_name

            # Build a fresh item per row; re-yielding one shared, mutated
            # item would make every yielded reference point at the same object.
            item = GameItem()
            item["Category"] = category
            item["Card_Name"] = saved_name
            item["Condition"] = game.css("td[class^=deckdbbody].search_results_7 a::text").get()
            item["Stock"] = game.css("td[class^=deckdbbody].search_results_8::text").get()

            price = game.css("td[class^=deckdbbody].search_results_9::text").get()
            if price is None:
                # Fall back to the red <span> inside the price cell.
                price = game.css("td[class^=deckdbbody].search_results_9 span[style*='color:red']::text").get()
            item["Price"] = price

            yield item

        # Reuse the priority of the originating start URL for pagination.
        priority = response.meta['priority']
        next_page = response.xpath('//a[contains(., "- Next>>")]/@href').get()
        if next_page is not None:
            # NOTE(review): response.follow builds a plain request here, not a
            # SplashRequest, so the follow-up page presumably does not carry
            # the Splash args / per-URL splash_url / User-Agent set in
            # start_requests -- confirm this is intended.
            yield response.follow(next_page, self.parse, priority=priority, meta={'priority': priority})
更新1
问题在于:我有多个URL,每个URL又有多个页面。爬虫会先抓取所有URL的第1页,然后是所有URL的第2页,依此类推;而我希望它先抓完第一个URL的全部页面,再抓第二个URL的全部页面,以此类推。为此我加入了优先级(priority)来尝试修复,但没有奏效。现在它不再同时处理所有URL,而是一次只处理2个:先抓URL 1的一页,再抓URL 2的一页,如此来回交替,直到其中一个抓完,才开始处理下一个URL。下面是一段示例输出。由于每个页面的数据很多,我删掉了其中一些行。每个类别对应一个不同的URL,因此可以看到输出在不同类别之间来回切换。
{"Category": "Zendikar", "Card_Name": "Crypt of Agadeem", "Condition": "PL", "Stock": "21", "Price": "$2.85"},
{"Category": "Zendikar", "Card_Name": "Crypt Ripper", "Condition": "NM/M", "Stock": "92", "Price": "$0.15"},
{"Category": "Zendikar", "Card_Name": "Day of Judgment", "Condition": "NM/M", "Stock": "Out of Stock", "Price": "$3.49"},
{"Category": "Zendikar", "Card_Name": "Day of Judgment", "Condition": "PL", "Stock": "14", "Price": "$2.85"},
{"Category": "Zendikar", "Card_Name": "Demolish", "Condition": "NM/M", "Stock": "87", "Price": "$0.15"},
{"Category": "Zendikar", "Card_Name": "Desecrated Earth", "Condition": "NM/M", "Stock": "77", "Price": "$0.15"},
{"Category": "Visions", "Card_Name": "Impulse", "Condition": "NM/M", "Stock": "34", "Price": "$0.49"},
{"Category": "Visions", "Card_Name": "Impulse", "Condition": "PL", "Stock": "17", "Price": "$0.39"},
{"Category": "Visions", "Card_Name": "Infantry Veteran", "Condition": "NM/M", "Stock": "65", "Price": "$0.15"},
{"Category": "Zendikar", "Card_Name": "Goblin Ruinblaster", "Condition": "NM/M", "Stock": "54", "Price": "$0.25"},
{"Category": "Zendikar", "Card_Name": "Goblin Shortcutter", "Condition": "NM/M", "Stock": "64", "Price": "$0.15"},
{"Category": "Zendikar", "Card_Name": "Goblin War Paint", "Condition": "NM/M", "Stock": "88", "Price": "$0.15"},
{"Category": "Zendikar", "Card_Name": "Gomazoa", "Condition": "NM/M", "Stock": "59", "Price": "$0.25"},
{"Category": "Zendikar", "Card_Name": "Grappling Hook", "Condition": "NM/M", "Stock": "Out of Stock", "Price": "$0.49"},
{"Category": "Zendikar", "Card_Name": "Graypelt Refuge", "Condition": "NM/M", "Stock": "311", "Price": "$0.25"},
{"Category": "Zendikar", "Card_Name": "Grazing Gladehart", "Condition": "NM/M", "Stock": "159", "Price": "$0.15"},
{"Category": "Visions", "Card_Name": "Knight of the Mists", "Condition": "NM/M", "Stock": "56", "Price": "$0.15"},
{"Category": "Visions", "Card_Name": "Knight of Valor", "Condition": "NM/M", "Stock": "40", "Price": "$0.15"},
{"Category": "Visions", "Card_Name": "Kookus", "Condition": "NM/M", "Stock": "39", "Price": "$0.99"},
{"Category": "Visions", "Card_Name": "Kookus", "Condition": "PL", "Stock": "40", "Price": "$0.79"},
{"Category": "Visions", "Card_Name": "Kyscu Drake", "Condition": "NM/M", "Stock": "59", "Price": "$0.25"},
{"Category": "Visions", "Card_Name": "Lead-Belly Chimera", "Condition": "NM/M", "Stock": "44", "Price": "$0.25"},
{"Category": "Visions", "Card_Name": "Lichenthrope", "Condition": "NM/M", "Stock": "36", "Price": "$0.99"},
{"Category": "Visions", "Card_Name": "Lichenthrope", "Condition": "PL", "Stock": "40", "Price": "$0.79"},
{"Category": "Visions", "Card_Name": "Lightning Cloud", "Condition": "NM/M", "Stock": "39", "Price": "$0.99"},
{"Category": "Visions", "Card_Name": "Lightning Cloud", "Condition": "PL", "Stock": "38", "Price": "$0.79"},
{"Category": "Visions", "Card_Name": "Longbow Archer", "Condition": "NM/M", "Stock": "67", "Price": "$0.29"},