我正在从事一些同时使用多处理和多线程的自动化工作。
设计-
Architecture
我在脚本中所做的工作是,首先在块创建过程中为每个文件划分原始文件,然后在每个过程中再次创建文件块并为每个块创建线程以执行某些操作。
`
def file_len(fname):
with open(fname) as f:
for i, l in enumerate(f):
pass
return i
def do_job(tasks_to_accomplish, tasks_that_are_done,input_values,urlcheck,tarfficcheck,rankcheck):
#print('Im here')
while True:
try:
file = input_values.get_nowait()
tasks_to_accomplish.get_nowait().start(aws_obj,file,urlcheck,tarfficcheck,rankcheck)
except queue.Empty:
break
else:
#print(res)
tasks_that_are_done.put(str(file) + ' is done by ' + threading.currentThread().getName())
time.sleep(.5)
return True
def do_jobmonth(tasks_to_accomplish, tasks_that_are_done,input_values,urlcheck,tarfficcheck,rankcheck,months):
#print('Im here')
while True:
try:
file = input_values.get_nowait()
tasks_to_accomplish.get_nowait().start(aws_obj,file,urlcheck,tarfficcheck,rankcheck,months)
except queue.Empty:
break
else:
#print(res)
tasks_that_are_done.put(str(file) + ' is done by ' + threading.currentThread().getName())
time.sleep(.5)
return True
def threadstart(file,u,t,r,m=1 ):
consolidatedvalidation=''
consolidatedranking=''
urlcheck=u
tarfficcheck =t #input("Need website traffic Y/N : ")
rankcheck=r#input("Need Ranking Y/N : ")
if tarfficcheck.lower()=='y':
months = m#int(input("Number of Months(1 to 12 ) - "))
filepathname=file#r'C:\Users\Himanshu Gupta\Desktop\Sample\New folder\Sample.csv'
st=os.stat(filepathname)
filepath, filename = os.path.split(filepathname)
size = st.st_size
#lines = file_len(filepathname)
#splitterlines = 10**(int(len(str(lines)))-2)#*(int(str(lines)[0]))*
#print('Splitter Set to lines - ',splitterlines)
#linesize= size/lines
filname,extension=os.path.splitext(file)
last_char=filname[filname.rfind('_')+1:]
fs = FileSplit(file=filepathname, splitsize=size/2, output_dir=filepath)
fs.split(include_header=True)
files = glob.glob(filepath+'/*_'+str(last_char)+'_[0-9]*.csv')
size = len(files)
print('Number of files -',size,' list name -',files)
number_of_task = size
number_of_processes =size
tasks_to_accomplish = Queue()
tasks_that_are_done = Queue()
input_values = Queue()
processes = []
for i in range(number_of_task):
tasks_to_accomplish.put(Rank())
input_values.put(files[i])
# creating processes
for w in range(number_of_processes):
if (tarfficcheck.lower()=='y'):
p = threading.Thread(target=do_jobmonth, args=(tasks_to_accomplish, tasks_that_are_done,input_values,urlcheck,tarfficcheck,rankcheck,months))
else:
p = threading.Thread(target=do_job, args=(tasks_to_accomplish, tasks_that_are_done,input_values,urlcheck,tarfficcheck,rankcheck))
processes.append(p)
p.start()
# completing process
for p in processes:
p.join()
# print the output
while not tasks_that_are_done.empty():
print(tasks_that_are_done.get())
if u.lower()=='y':
validationfiles =glob.glob(filepath+'/Validation_'+last_char+'_[0-9]*.csv')
#print('validationfiles - ',validationfiles)
root =0
for file in validationfiles:
if root==0:
consolidatedvalidation= pd.read_csv(file)
root+=1
else:
#print('Root not zero')
temp = pd.read_csv(file)
consolidatedvalidation=consolidatedvalidation.append(temp, ignore_index=True,sort=False)
os.remove(file)
consolidatedvalidation.to_csv(filepath+'/Validation_'+last_char+'.csv',index=False)
print('Thread Validation file Created')
for i in range(number_of_task):
os.remove((files[i]))
if r =='y':
rankingfiles =glob.glob(filepath+'/Ranking_'+last_char+'_[0-9]*.csv')
root=0
for file in rankingfiles:
if root==0:
consolidatedranking= pd.read_csv(file)
root+=1
else:
#print('Root not zero')
temp = pd.read_csv(file)
consolidatedranking=consolidatedranking.append(temp, ignore_index=True,sort=False)
os.remove(file)
consolidatedranking.to_csv(filepath+'/Ranking_'+last_char+'.csv',index=False)
print(' Thread Ranking file Created')
return True
def do_jobnomonth(tasks_that_are_done,input_values,urlcheck,tarfficcheck,rankcheck):
#print('Im here')
while True:
try:
file = input_values.get_nowait()
threadstart(file,urlcheck,tarfficcheck,rankcheck)
except queue.Empty:
break
else:
#print(res)
tasks_that_are_done.put(str(file) + ' is done by ' + current_process().name)
time.sleep(.5)
return True
def do_jobwithmonth( tasks_that_are_done,input_values,urlcheck,tarfficcheck,rankcheck,months):
#print('Im here')
while True:
try:
file = input_values.get_nowait()
threadstart(file,urlcheck,tarfficcheck,rankcheck,months)
except queue.Empty:
break
else:
#print(res)
tasks_that_are_done.put(str(file) + ' is done by ' + current_process().name)
time.sleep(.5)
return True
def main():
consolidatedvalidation=''
consolidatedranking=''
urlvalidationcheck = input("Need Url Validation traffic Y/N : ")
tarfficcheck = input("Need website traffic Y/N : ")
rankcheck=input("Need Ranking Y/N : ")
if tarfficcheck.lower()=='y':
months = int(input("Number of Months(1 to 12 ) - "))
filepathname=r'C:\Users\Himanshu Gupta\Desktop\Url Validation Script\Input_file.csv'
st=os.stat(filepathname)
filepath, filename = os.path.split(filepathname)
size = st.st_size
lines = file_len(filepathname)
splitterlines = 10**(int(len(str(lines)))-2)#*(int(str(lines)[0]))*
print('Splitter Set to lines - ',splitterlines)
linesize= size/lines
fs = FileSplit(file=filepathname, splitsize=splitterlines*linesize, output_dir=filepath)
fs.split(include_header=True)
files = glob.glob(filepath+'/*_[0-9]*.csv')
size = len(files)
print('Number of files -',size,' list name -',files)
number_of_task = size
number_of_processes =size
tasks_to_accomplish = Queue()
tasks_that_are_done = Queue()
input_values = Queue()
processes = []
for i in range(number_of_task):
#tasks_to_accomplish.put(Rank_Thread())
input_values.put(files[i])
# creating processes
for w in range(number_of_processes):
if (tarfficcheck.lower()=='y'):
p = Process(target=do_jobwithmonth, args=( tasks_that_are_done,input_values,urlvalidationcheck,tarfficcheck,rankcheck,months))
else:
p = Process(target=do_jobnomonth, args=( tasks_that_are_done,input_values,urlvalidationcheck,tarfficcheck,rankcheck))
processes.append(p)
p.start()
# completing process
for p in processes:
p.join()
# print the output
while not tasks_that_are_done.empty():
print(tasks_that_are_done.get())
if urlvalidationcheck== 'y':
validationfiles =glob.glob(filepath+'/Validation*_[0-9]*.csv')
#print('validationfiles - ',validationfiles)
root =0
for file in validationfiles:
if root==0:
consolidatedvalidation= pd.read_csv(file)
root+=1
else:
#print('Root not zero')
temp = pd.read_csv(file)
consolidatedvalidation=consolidatedvalidation.append(temp, ignore_index=True,sort=False)
os.remove(file)
consolidatedvalidation.to_csv(filepath+'/Consolidated_Url_Validation.csv',index=False)
print('Consolidated Validation file Created')
for i in range(number_of_task):
os.remove((files[i]))
if rankcheck== 'y':
rankingfiles =glob.glob(filepath+'/Ranking*_[0-9]*.csv')
root=0
for file in rankingfiles:
if root==0:
consolidatedranking= pd.read_csv(file)
root+=1
else:
#print('Root not zero')
temp = pd.read_csv(file)
consolidatedranking=consolidatedranking.append(temp, ignore_index=True,sort=False)
os.remove(file)
consolidatedranking.to_csv(filepath+'/Consolidated_Url_Ranking.csv',index=False)
print('Consolidated Ranking file Created')
return True
if __name__ == '__main__':
s_time= time.time()
main()
print("Total Time Elapsed --- %s seconds ---" % (time.time() - s_time))
`
我面临的问题是线程在作业完成之前退出,并且如果有错误也不会引发任何错误。