带requests-html

时间:2018-12-09 21:28:45

标签: python multithreading web-scraping python-requests-html

经过数周尝试自己解决这个问题后,我放弃了。请注意,我使用的是 requests_html 而不是 requests ,因为我要抓取的页面是JS呈现的。该脚本可以在没有多线程的情况下正常运行,但是速度很慢。当我尝试包含多线程时,运行此脚本时出现以下错误:

Exception in thread Thread-1:
Traceback (most recent call last):
  File "selena_multi.py", line 79, in selena_parse
    r.html.render()
  File "/home/qorka/.local/lib/python3.6/site-packages/requests_html.py", line 572, in render
    self.session.browser  # Automatycally create a event loop and browser
  File "/home/qorka/.local/lib/python3.6/site-packages/requests_html.py", line 679, in browser
    self.loop = asyncio.get_event_loop()
  File "/usr/lib/python3.6/asyncio/events.py", line 694, in get_event_loop
    return get_event_loop_policy().get_event_loop()
  File "/usr/lib/python3.6/asyncio/events.py", line 602, in get_event_loop
    % threading.current_thread().name)
RuntimeError: There is no current event loop in thread 'Thread-1'.

这是脚本:

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from requests_html import HTMLSession
import time, sys, re, json, threading


class Selena:
    def __init__(self):
    global shoe_names

    self.shoe_model = {}
    self.reduced_specs_list = {}
    self.shoe_specs_individual = []
    self.master_list = []
    self.specs_master_list = []
    self.counter = 0
    self.shoe_names = []


def selena_main(self):
    # selenium params
    chrome_options = Options()
    chrome_options.add_argument("--headless")

    driver = webdriver.Chrome(options=chrome_options)
    driver.get("http://www.goat.com/sneakers")

    # # click 'see more'
    # click_error_count = 0 
    # while True:
    #     try:
    #         see_more = driver.find_element_by_xpath('//*[@id="root"]/div/div/div[2]/div[1]/div[2]/div[2]/div/div[2]/button/span').click()
    #         time.sleep(1)

    #     except Exception as e:
    #         if click_error_count == 3:
    #             break
    #         print("\nFind element exited.\n{}\n".format(e))
    #         click_error_count += 1
    #         time.sleep(3)
    #         pass

    # get the html & find the links
    goat_html = driver.page_source
    shoes = re.findall('<a class="cell" (.+?)</a>', goat_html, re.S)
    shoe_string = " ".join(shoes)
    self.shoe_names = re.findall('truncate">(.+?)</p>', shoe_string, re.S)

    # convert shoes list to string and find links
    links = re.findall('href="(.+?)"', shoe_string, re.S)

    if len(links) == 0 or len(links) == 12:
        print(len(links))
        print("No links returned.")
        return

    print("{} links found.".format(len(links)))

    threads = [threading.Thread(target=self.selena_parse, args=(l,)) for l in links]
    for thr in threads:
        thr.start()

    for thr in threads:
        thr.join()


def selena_parse(self, links):
    print(links)

    try:
        self.shoe_model.clear()
        del self.shoe_specs_individual[:]
        self.reduced_specs_list.clear()
        session = HTMLSession()
        link_string = 'https://www.goat.com{}/available-sizes'.format(links)
        r = session.get(link_string, stream=True, timeout=3)
        r.raise_for_status()
        r.html.render()

        goat2 = r.html.find("#root", first=True)
        goat2_shoe_data = re.findall('fXQURg">(.+?)</div>', goat2.html, re.S)
        goat2_other_data = re.findall('hUbwah">(.+?)</span>', goat2.html, re.S)

        # get brand
        goat2_brands = re.findall('"nutr-link-brand"(.+?)/a>', goat2.html, re.S)
        goat2_brand = re.findall('>(.+?)<', goat2_brands[0], re.S)

        # get sku from url
        goat2_dash_sep = [u for u in link_string.split('-')]
        goat2_dash_sep_pt2 = goat2_dash_sep[-2].split('/')
        sku = goat2_dash_sep_pt2[0]

        # get image url
        goat2_imgs = re.findall('<img(.+?)style', goat2.html, re.S)
        goat2_img_url = re.findall('src="(.+?)"', goat2_imgs[0], re.S)
        img = goat2_img_url[0]

        for n in range(len(goat2_shoe_data) - 1):
            if n % 2 == 0:
                g2_cleaned = re.findall('<span>(.+?)</span>', goat2_shoe_data[n+1], re.S)
                self.shoe_specs_individual.append({goat2_shoe_data[n]:g2_cleaned[0]})

        for new in self.shoe_specs_individual:
            self.reduced_specs_list.update(new)

        self.specs_master_list.append(self.reduced_specs_list)

        self.shoe_model.update({"id": self.counter, "url": link_string, 
        "style": sku, "name": self.shoe_names[self.counter], 
        "release_date": goat2_other_data[0], "color": goat2_other_data[1], "brand": goat2_brand[0], 
        "nickname": goat2_other_data[2],
        "image_url": img, "specs": self.specs_master_list[self.counter]})

        self.master_list.append(json.loads(json.dumps(self.shoe_model)))

        self.selena_dump()
        r.close()

    except Exception as e:
        print("No good on {}.\nError: {}.\n".format(links, e))
        time.sleep(1)



def selena_dump(self):
    # dump into json
    all_shoes = {"shoes": self.master_list}
    with open('goat.json', 'w') as output:
        o = json.dumps(all_shoes, indent=4)
        output.write(o)


if __name__ == '__main__':
    with open('goat.json', 'w') as new_file:
        pass

    sel = Selena()
    sel.selena_main()

1 个答案:

答案 0 :(得分:0)

根据项目 github 页面 (https://github.com/psf/requests-html/issues/155) 上的这个问题,requests_html 库在多线程中运行时不起作用。作者建议您使用 AsyncHTMLSession 而不是 HTMLSession