我正在尝试使用python线程抓取多个URL。我认为我的代码结构是不正确的,因为与线程一样,没有线程运行所需的时间几乎相同。文件“List.txt”包含URL。我感谢任何帮助。
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17
(KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17'}
q = Queue()
print_lock = threading.Lock()
def threader():
while True:
with print_lock:
properties = q.get()
create_doneFile(properties)
q.task_done()
def create_doneFile(properties):
p=0
for eachp in properties:
try:
p = (p+1)
url = (properties[p-1])
print(str(p)+ ' - '+str(url))
html = requests.get(url, headers=headers).text
soup = BeautifulSoup(html, "html.parser")
jsinfo = soup.find_all("script")
list_of_interest = ['hl.config.value1', 'hl.config.value2']
d = {}
for line in jsinfo[9].text.split('\n'):
if any(word in line for word in list_of_interest):
k,v = line.strip().replace('hl.config.','').split(' = ')
d[k] = v.strip(';')
l1 = (d['value1']).replace('"','')
l2 = (d['value2']).replace('"','')
tblData = {'URL':url,
'value1':l1,
'value':l2}
df = pd.DataFrame(tblData, index=[p])
with open("ReturnData.csv","a") as csf:
df.to_csv(csf, header = False)
except Exception as e:
continue
print('there was an error' +str(e))
print("Complete")
print('Entire Job Took: ',time.time()-start)
with open("List.txt",'r+') as f:
properties = f.read().splitlines()
q.put(properties)
start = time.time()
for x in range (10):
t = threading.Thread(target = threader)
t.daemon = True
t.start()
q.join()