我正在尝试将大量数据写入csv文件。当我尝试普通方法时,它会在1秒内写入50个数据,但经过多处理后,它会在1秒内降至5个数据。
而且我还添加了这行代码 sys.setrecursionlimit(25000),因为如果不加,程序会抛出递归深度超限的错误(RecursionError)。
我可以感觉到我做得不好。正确的方法是什么?
from bs4 import BeautifulSoup
import requests
import lxml
import csv
import cchardet
from multiprocessing import Pool
import sys
import time
# Raise the recursion limit: pickling deeply nested BeautifulSoup trees for
# multiprocessing recurses once per node — presumably why the author needed
# this (TODO confirm; see the question text above).
sys.setrecursionlimit(25000)
# Output file and csv writer are opened once at module level so that
# writeToCsv() can reach them as globals.
# NOTE(review): the handle is never closed or flushed explicitly, and 'a+'
# means every run appends to the existing comments.csv.
csvfileWrite=open("comments.csv", 'a+', newline='',encoding='utf-8') # global output file handle
writer = csv.writer(csvfileWrite, delimiter=';', quotechar='"',
quoting=csv.QUOTE_MINIMAL) # global csv writer used by writeToCsv()
def kacYildiz(div):
    """Return the star rating (0..5) shown in a review card.

    Counts the <svg> stars inside the rating widget whose <path> has the
    orange fill colour "#f28b00".

    Args:
        div: a BeautifulSoup element for one review card.

    Returns:
        int: number of filled stars.
    """
    # (Indentation of the original paste was mangled; reconstructed here.)
    rating_box = div.find("div", attrs={"class": "RatingPointer-module-1OKF3"})
    # A star is "filled" when its path's fill attribute is the orange colour.
    return sum(
        1
        for star in rating_box.find_all("svg")
        if star.find("path").get("fill") == "#f28b00"
    )
def takeText(div):
    """Return the comment body text of a review card.

    Args:
        div: a BeautifulSoup element for one review card.

    Returns:
        str: the text of the <span itemprop="description"> element.
    """
    # (Indentation of the original paste was mangled; reconstructed here.)
    return div.find("span", attrs={"itemprop": "description"}).text
def yorumSayfaSayisi(row):
    """Return how many pages the site's comment section has.

    The original paste wrapped this function's trailing comment onto its own
    line ("pages in the sites comment section..."), which was a SyntaxError;
    fixed here.

    Args:
        row: a csv row whose first column is the product URL.

    Returns:
        str: the page count as scraped from the last pagination <li>
        (callers convert it with int()).
    """
    yorumKismi="-yorumlari?"
    adres=row[0]+yorumKismi
    # requests_session is created in the __main__ block below.
    r = requests_session.get(adres)
    soup = BeautifulSoup(r.text,"lxml")
    sayfaS=soup.find("ul",attrs={"class":"PaginationBar-module-3qhrm"})
    # NOTE(review): raises AttributeError if the pagination bar is absent
    # (e.g. a product with a single comment page) — confirm against the site.
    sayi=sayfaS.find_all("li")[-1].text
    return sayi
def writeToCsv(comments):
    """Write one comment to comments.csv as the row [star_count, text].

    Uses the module-level ``writer``.  The original ``global`` declarations
    were removed: this function only *reads* the globals, never rebinds
    them, so ``global`` was redundant.

    NOTE(review): when this runs inside a multiprocessing Pool, each worker
    process inherits its own copy of the open file handle, so concurrent
    rows can interleave or be lost — a ThreadPool (or a single dedicated
    writer process) is the safe design.

    Args:
        comments: a BeautifulSoup element for one review card.
    """
    textToWrite = takeText(comments)
    writer.writerow([kacYildiz(comments), textToWrite])
if __name__ == '__main__':
    # Session is reused by yorumSayfaSayisi() via the module namespace.
    requests_session = requests.Session()

    # Create the worker pool ONCE.  The original code constructed and
    # terminate()d a fresh Pool(10) inside the inner loop for every single
    # page; that per-page process-spawn + pickle overhead is why the
    # "multiprocessing" version was ~10x slower than the sequential one.
    p = Pool(10)

    with open('adresler.csv', newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=';', quotechar='|')
        for row in reader:
            rowNumber = yorumSayfaSayisi(row)
            # range() is half-open, so +1 keeps the last comment page
            # (the original range(1, n) silently skipped page n).
            for i in range(1, int(rowNumber) + 1):
                commetAdress = "-yorumlari?sayfa={}".format(i)
                adress = row[0] + commetAdress
                r = requests_session.get(adress)
                soup = BeautifulSoup(r.text, "lxml")
                # NOTE(review): this class-name literal was line-wrapped in
                # the original paste (a SyntaxError); re-joined here.
                page = soup.find_all(
                    "div", attrs={"class": "ReviewCard-module-3Y36S"})
                p.map(writeToCsv, list(page))

    # close() lets queued work finish; the original terminate() could kill
    # workers mid-write and drop rows.
    p.close()
    p.join()
答案 0(得分:0)
请尝试下面这种使用 ThreadPool 的方法:
from multiprocessing.pool import ThreadPool
def csvYaz(yorumlar):
    """Write one comment as the csv row [star_count, text] via global ``yazici``.

    (Indentation of the pasted answer was mangled; reconstructed here.  The
    ``global`` declarations were dropped — the function only reads
    ``yazici``/``csvfileYaz``, never rebinds them, so they were redundant.)

    Args:
        yorumlar: a BeautifulSoup element for one review card.
    """
    yazi = yorumAl(yorumlar)
    yazici.writerow([kacYildiz(yorumlar), yazi])
# ------main-----
# NOTE(review): fragment from the answer — ``yorumlar`` (a list) and
# ``yorumSayfasi`` (an iterable of review-card divs) are assumed to be
# defined by the surrounding program; verify against the question's code.
for yorum in yorumSayfasi:
    yorumlar.append(yorum)
# ThreadPool shares the process, so no per-item pickling of BeautifulSoup
# trees and the single global csv writer stays safe to use.
# map() returns csvYaz's return values (all None), which the loop prints.
threads = ThreadPool(10).map(csvYaz, yorumlar)
for zz in threads:
    print(zz)