Question

我正在尝试使用python线程抓取多个URL。我认为我的代码结构是不正确的，因为与线程一样，没有线程运行所需的时间几乎相同。文件“List.txt”包含URL。我感谢任何帮助。

headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 
(KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17'}
q = Queue()

print_lock = threading.Lock()


def threader():
    while True:
        with print_lock:
            properties = q.get()
        create_doneFile(properties)
        q.task_done()


def create_doneFile(properties):

    p=0
    for eachp in properties:
        try:
            p = (p+1)
            url = (properties[p-1])
            print(str(p)+ ' - '+str(url))
            html = requests.get(url, headers=headers).text
            soup = BeautifulSoup(html, "html.parser")
            jsinfo = soup.find_all("script")

            list_of_interest = ['hl.config.value1', 'hl.config.value2']

            d = {}
            for line in jsinfo[9].text.split('\n'):
                if any(word in line for word in list_of_interest):
                    k,v = line.strip().replace('hl.config.','').split(' = ')
                    d[k] = v.strip(';')
            l1 = (d['value1']).replace('"','')
            l2 = (d['value2']).replace('"','')
            tblData = {'URL':url,
                       'value1':l1,
                       'value':l2}
            df = pd.DataFrame(tblData, index=[p])
            with open("ReturnData.csv","a") as csf:
                df.to_csv(csf, header = False)

        except Exception as e:
            continue
            print('there was an error' +str(e))

    print("Complete")
    print('Entire Job Took: ',time.time()-start)

with open("List.txt",'r+') as f:
    properties = f.read().splitlines()
    q.put(properties)

start = time.time()   


for x in range (10):
    t = threading.Thread(target = threader)
    t.daemon = True
    t.start()


q.join()

多线程的代码结构？

0 个答案: