Multiprocessing a large CSV file with a pandas DataFrame in Python

Time: 2019-05-24 05:47:30

Tags: python pandas multiprocessing amazon-dynamodb

We have a huge CSV file that we need to process with multiple processes. I have tried the code below. When it runs it shows me the processes being created, but it takes the same amount of time to execute as a single process. Where is the multiprocessing code going wrong? What am I doing wrong? Thanks in advance for any help.


    import boto3
    import time
    from multiprocessing import Pool
    import pandas as pd
    from pandas.io import sql
    import json
    import datetime
    import re
    from urlparse import urlparse
    #import psycopg2
    import sys
    import requests
    import MySQLdb
    import os
    import coltest as col
    import numpy as np

    dynamodb = boto3.resource('dynamodb')
    table = dynamodb.Table('ct_service')
    api = "https://c.testurl.net/?url="
    con = MySQLdb.connect(host="localhost",   # your host, usually localhost
                          user="root",        # db username
                          passwd="********",  # db password
                          db="dbname")
    cur = con.cursor()
    lis = []
    chunksize = 10 ** 6
    # exit 25_04_2019.csv
    def readfile():
        with open('sample.csv') as f:
            for df in pd.read_csv(f, index_col=False, header=0,
                                  usecols=["page_url", "user_id", "bid_hour"],
                                  chunksize=chunksize):
                df.dropna(how='any', inplace=True)
                df['user_id'].drop_duplicates(inplace=True)
                df1 = df[['page_url']]  # Preserve visit date time & page_url
                df2 = df[['user_id']]
                site_q = "insert ignore into site (page_url) values (%s)"
                user_q = "insert ignore into users (beeswax_user_id) values (%s)"
                # Bulk insert user_id
                try:
                    cur.executemany(user_q, df2.values.tolist())
                    con.commit()
                except Exception as e:
                    print(e)
                    con.rollback()
                # Bulk insert page_url
                try:
                    cur.executemany(site_q, df1.values.tolist())
                    con.commit()
                except Exception as e:
                    print(e)
                    con.rollback()
                return df

    def main_loop(df):
        for i in df.values.tolist():
            print(i[0], i[1], i[2])
            page_url_escaped = str(i[2])
            #page_url_escaped=repr(page_url_escaped).replace('\\','\\')[1:-1]
            if not urlparse(page_url_escaped).scheme:
                page_url_escaped = 'http://' + page_url_escaped
            try:
                response = table.get_item(
                    Key={
                        'page_url': page_url_escaped
                    })
                item = response['Item']['keywords']
                print(item)
            except KeyError:
                lis.append(page_url_escaped)
                #response = requests.get(api+page_url_escaped)  # Replace this with DDB connection
            except Exception as e:
                log_url_ex = '\nError URL-->> ' + str(e) + str(page_url_escaped)
                print(log_url_ex)
                #exit(1)
            page_url_escaped = page_url_escaped.replace("'", "''")
            site_id_q = "select site_id from site where page_url='" + page_url_escaped + "'"
            print(site_id_q)
            cur.execute(site_id_q)
            site_id = str(cur.fetchone())
            site_id = site_id.replace(',', '')
            visit_q = ("insert ignore into visit (user_id, site_id, visit_date_time) values "
                       "((select user_id from users where beeswax_user_id=\"" + str(i[1]) + "\")," +
                       str(site_id) + ",\"" + str(i[0]) + "\")")
            print(visit_q)
            try:
                cur.execute(visit_q)
                con.commit()
            except Exception as e:
                print(e)
                con.rollback()

            data = item
            empty = []
            print(len(data))
            if len(data) > 0:
                for key, value in data.items():
                    # print(points)
                    keyword_contains_q = ("insert ignore into contains (site_id, keyword_id, points) values (" +
                                          str(site_id) + ",(select id from keywords where keyword=\"" + key +
                                          "\"), \"" + str(value) + "\")")
                    print(keyword_contains_q)
                    try:
                        cur.execute(keyword_contains_q)
                        con.commit()
                    except Exception as e:
                        print(e)
                        con.rollback()
            else:
                empty.append(page_url_escaped)  # Store for later processing as backlog

        print(empty)
        print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    df = readfile()

    if __name__ == "__main__":

        p = Pool(processes=10)
        start = time.time()
        try:
            async_result = p.map_async(main_loop(df), chunksize=1)
        except:
            pass

        p.close()
        p.join()
        print("Complete")
        end = time.time()
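
For reference, `Pool.map_async` (like `Pool.map`) takes a callable plus an iterable of work items. `p.map_async(main_loop(df), chunksize=1)` evaluates `main_loop(df)` immediately in the parent process, which is where all the time is actually spent, and the `map_async` call itself then fails for lack of an iterable, silently swallowed by the bare `except: pass`. Below is a minimal sketch of the usual pattern, not the original script: the `process_chunk` function and the smaller chunk size are placeholders, and in a real version each worker would need to open its own MySQL/DynamoDB connections inside the worker rather than reuse the parent's globals, since connection objects cannot be shared across processes.

    # Minimal sketch (assumptions: process_chunk is a placeholder for the
    # per-chunk work, and the column list matches the original file).
    from multiprocessing import Pool
    import pandas as pd

    def process_chunk(df):
        # Stand-in for the per-chunk work (DynamoDB lookups, MySQL inserts, ...).
        # Workers would have to create their own DB connections here.
        return len(df)

    if __name__ == "__main__":
        chunks = pd.read_csv("sample.csv",
                             usecols=["page_url", "user_id", "bid_hour"],
                             chunksize=10 ** 5)       # iterator of DataFrame chunks
        pool = Pool(processes=10)
        results = pool.map(process_chunk, chunks)     # pass the function itself, not its result
        pool.close()
        pool.join()
        print(sum(results))

Whether this actually shortens the wall-clock time depends on where the time goes: if most of it is the per-row MySQL and DynamoDB round trips, the databases rather than the CPU are the bottleneck.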

0 Answers:

No answers yet.