We have a huge CSV file that we need to process using multiprocessing. I have tried the code below. While it runs it shows me the processes being created, but it takes the same time to execute as a single process. Where does the multiprocessing code go wrong? What am I doing wrong? Thanks in advance for any help.
import boto3
import time
from multiprocessing import Pool
import pandas as pd
from pandas.io import sql
import json
import datetime
import re
from urlparse import urlparse
#import psycopg2
import sys
import requests
import MySQLdb
import os
import coltest as col
import numpy as np
dynamodb = boto3.resource('dynamodb')
table = dynamodb.Table('ct_service')
api = "https://c.testurl.net/?url="
con = MySQLdb.connect(host="localhost", # your host, usually
localhost
user="root", # db username
passwd="********", # db password
db="dbname") #
cur = con.cursor()
lis = []
chunksize = 10 ** 6
# exit 25_04_2019.csv
def readfile():
    with open('sample.csv') as f:
        for df in pd.read_csv(f, index_col=False, header=0,
                              usecols=["page_url", "user_id", "bid_hour"],
                              chunksize=chunksize):
            df.dropna(how='any', inplace=True)
            df['user_id'].drop_duplicates(inplace=True)
            df1 = df[['page_url']]  # preserve visit date time & page_url
            df2 = df[['user_id']]
            site_q = "insert ignore into site (page_url) values (%s)"
            user_q = "insert ignore into users (beeswax_user_id) values (%s)"
            # Bulk insert user_id
            try:
                cur.executemany(user_q, df2.values.tolist())
                con.commit()
            except Exception as e:
                print(e)
                con.rollback()
            # Bulk insert page_url
            try:
                cur.executemany(site_q, df1.values.tolist())
                con.commit()
            except Exception as e:
                print(e)
                con.rollback()
    return df
def main_loop(df):
    for i in df.values.tolist():
        print(i[0], i[1], i[2])
        page_url_escaped = str(i[2])
        #page_url_escaped = repr(page_url_escaped).replace('\\', '\\')[1:-1]
        if not urlparse(page_url_escaped).scheme:
            page_url_escaped = 'http://' + page_url_escaped
        try:
            response = table.get_item(
                Key={
                    'page_url': page_url_escaped
                })
            item = response['Item']['keywords']
            print(item)
        except KeyError:
            lis.append(page_url_escaped)
            #response = requests.get(api + page_url_escaped)  # replace this with DDB connection
        except Exception as e:
            log_url_ex = '\nError URL-->> ' + str(e) + str(page_url_escaped)
            print(log_url_ex)
            #exit(1)
        page_url_escaped = page_url_escaped.replace("'", "''")
        site_id_q = "select site_id from site where page_url='" + page_url_escaped + "'"
        print(site_id_q)
        cur.execute(site_id_q)
        site_id = str(cur.fetchone())
        site_id = site_id.replace(',', '')
        visit_q = "insert ignore into visit (user_id, site_id, visit_date_time) values ((select user_id from users where beeswax_user_id=\"" + str(i[1]) + "\")," + str(site_id) + ",\"" + str(i[0]) + "\")"
        print(visit_q)
        try:
            cur.execute(visit_q)
            con.commit()
        except Exception as e:
            print(e)
            con.rollback()
        data = item
        empty = []
        print(len(data))
        if len(data) > 0:
            for key, value in data.items():
                # print(points)
                keyword_contains_q = "insert ignore into contains (site_id, keyword_id, points) values (" + str(site_id) + ",(select id from keywords where keyword=\"" + key + "\"), \"" + str(value) + "\")"
                print(keyword_contains_q)
                try:
                    cur.execute(keyword_contains_q)
                    con.commit()
                except Exception as e:
                    print(e)
                    con.rollback()
        else:
            empty.append(page_url_escaped)  # store for later processing as backlog
        print(empty)
        print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
df = readfile()
if __name__ == "__main__":
p = Pool(processes = 10)
start = time.time()
try:
async_result = p.map_async(main_loop(df),chunksize=1)
except:
pass
p.close()
p.join()
print("Complete")
end = time.time()
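
For reference, this is the minimal map_async pattern I based my code on. It is only a sketch with a dummy worker; process_chunk and the dummy chunk list are placeholders, not my real worker or data.

import time
from multiprocessing import Pool

def process_chunk(chunk):
    # stand-in for the real per-chunk work (DB inserts, DynamoDB lookups, ...)
    return len(chunk)

if __name__ == "__main__":
    chunks = [list(range(1000)) for _ in range(10)]  # dummy chunks
    start = time.time()
    p = Pool(processes=10)
    # map_async takes the callable itself plus an iterable of arguments
    async_result = p.map_async(process_chunk, chunks, chunksize=1)
    p.close()
    p.join()
    print(async_result.get())
    print(time.time() - start)

My understanding was that each chunk coming out of readfile() would be handed to the pool the same way the dummy chunks are handed to process_chunk above.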