经过数周尝试自己解决这个问题后,我放弃了。请注意,我使用的是 requests_html 而不是 requests ,因为我要抓取的页面是JS呈现的。该脚本可以在没有多线程的情况下正常运行,但是速度很慢。当我尝试包含多线程时,运行此脚本时出现以下错误:
Exception in thread Thread-1:
Traceback (most recent call last):
File "selena_multi.py", line 79, in selena_parse
r.html.render()
File "/home/qorka/.local/lib/python3.6/site-packages/requests_html.py", line 572, in render
self.session.browser # Automatycally create a event loop and browser
File "/home/qorka/.local/lib/python3.6/site-packages/requests_html.py", line 679, in browser
self.loop = asyncio.get_event_loop()
File "/usr/lib/python3.6/asyncio/events.py", line 694, in get_event_loop
return get_event_loop_policy().get_event_loop()
File "/usr/lib/python3.6/asyncio/events.py", line 602, in get_event_loop
% threading.current_thread().name)
RuntimeError: There is no current event loop in thread 'Thread-1'.
这是脚本:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from requests_html import HTMLSession
import time, sys, re, json, threading
class Selena:
def __init__(self):
global shoe_names
self.shoe_model = {}
self.reduced_specs_list = {}
self.shoe_specs_individual = []
self.master_list = []
self.specs_master_list = []
self.counter = 0
self.shoe_names = []
def selena_main(self):
# selenium params
chrome_options = Options()
chrome_options.add_argument("--headless")
driver = webdriver.Chrome(options=chrome_options)
driver.get("http://www.goat.com/sneakers")
# # click 'see more'
# click_error_count = 0
# while True:
# try:
# see_more = driver.find_element_by_xpath('//*[@id="root"]/div/div/div[2]/div[1]/div[2]/div[2]/div/div[2]/button/span').click()
# time.sleep(1)
# except Exception as e:
# if click_error_count == 3:
# break
# print("\nFind element exited.\n{}\n".format(e))
# click_error_count += 1
# time.sleep(3)
# pass
# get the html & find the links
goat_html = driver.page_source
shoes = re.findall('<a class="cell" (.+?)</a>', goat_html, re.S)
shoe_string = " ".join(shoes)
self.shoe_names = re.findall('truncate">(.+?)</p>', shoe_string, re.S)
# convert shoes list to string and find links
links = re.findall('href="(.+?)"', shoe_string, re.S)
if len(links) == 0 or len(links) == 12:
print(len(links))
print("No links returned.")
return
print("{} links found.".format(len(links)))
threads = [threading.Thread(target=self.selena_parse, args=(l,)) for l in links]
for thr in threads:
thr.start()
for thr in threads:
thr.join()
def selena_parse(self, links):
print(links)
try:
self.shoe_model.clear()
del self.shoe_specs_individual[:]
self.reduced_specs_list.clear()
session = HTMLSession()
link_string = 'https://www.goat.com{}/available-sizes'.format(links)
r = session.get(link_string, stream=True, timeout=3)
r.raise_for_status()
r.html.render()
goat2 = r.html.find("#root", first=True)
goat2_shoe_data = re.findall('fXQURg">(.+?)</div>', goat2.html, re.S)
goat2_other_data = re.findall('hUbwah">(.+?)</span>', goat2.html, re.S)
# get brand
goat2_brands = re.findall('"nutr-link-brand"(.+?)/a>', goat2.html, re.S)
goat2_brand = re.findall('>(.+?)<', goat2_brands[0], re.S)
# get sku from url
goat2_dash_sep = [u for u in link_string.split('-')]
goat2_dash_sep_pt2 = goat2_dash_sep[-2].split('/')
sku = goat2_dash_sep_pt2[0]
# get image url
goat2_imgs = re.findall('<img(.+?)style', goat2.html, re.S)
goat2_img_url = re.findall('src="(.+?)"', goat2_imgs[0], re.S)
img = goat2_img_url[0]
for n in range(len(goat2_shoe_data) - 1):
if n % 2 == 0:
g2_cleaned = re.findall('<span>(.+?)</span>', goat2_shoe_data[n+1], re.S)
self.shoe_specs_individual.append({goat2_shoe_data[n]:g2_cleaned[0]})
for new in self.shoe_specs_individual:
self.reduced_specs_list.update(new)
self.specs_master_list.append(self.reduced_specs_list)
self.shoe_model.update({"id": self.counter, "url": link_string,
"style": sku, "name": self.shoe_names[self.counter],
"release_date": goat2_other_data[0], "color": goat2_other_data[1], "brand": goat2_brand[0],
"nickname": goat2_other_data[2],
"image_url": img, "specs": self.specs_master_list[self.counter]})
self.master_list.append(json.loads(json.dumps(self.shoe_model)))
self.selena_dump()
r.close()
except Exception as e:
print("No good on {}.\nError: {}.\n".format(links, e))
time.sleep(1)
def selena_dump(self):
# dump into json
all_shoes = {"shoes": self.master_list}
with open('goat.json', 'w') as output:
o = json.dumps(all_shoes, indent=4)
output.write(o)
if __name__ == '__main__':
with open('goat.json', 'w') as new_file:
pass
sel = Selena()
sel.selena_main()
答案 0 :(得分:0)
根据项目 github 页面 (https://github.com/psf/requests-html/issues/155) 上的这个问题,requests_html
库在多线程中运行时不起作用。作者建议您使用 AsyncHTMLSession
而不是 HTMLSession
。