我目前有一个脚本,使用 pandas DataFrame 生成 Excel 输出文件。我把这个脚本运行 5 次——每次只更改用于分组的列——然后“手动”把这 5 张工作表合并到一个主文件中。我想知道如何让脚本自动循环这 5 个不同的 groupby 调用,并在输出的 xlsx 文件中创建 5 个单独的工作表。
下面是我通常粘贴到“### Column Renaming, NaN Replacing & DataFrame Column Additions”这条注释下方的 5 行(每次运行只使用其中一行):
# Five alternative groupings. Each assignment rebinds `grouped`, so only one of
# these lines is meant to be active in a given run of the script.
grouped = df.groupby(['customer_account', 'CounterPartyID'])
grouped = df.groupby(['customer_account', 'CounterPartyID', 'symbol'])
grouped = df.groupby(['customer_account', 'CounterPartyID', 'Providers', 'symbol'])
grouped = df.groupby(['Providers', 'customer_account'])
grouped = df.groupby(['Providers', 'symbol'])
import pandas as pd
import numpy as np
import csv
import time
import glob
import datetime
import re
import sys
import os
from dateutil import relativedelta
from xlsxwriter.utility import xl_rowcol_to_cell
'''This is where I find the file with the compiled data and add the needed columns to the df'''
### File Finding Stuff
# Matching file names are sorted in descending order, so index 0 is the
# lexicographically greatest one. Indexing raises IndexError if nothing matches.
file_names = sorted(glob.glob(r'T:\Tom\Scripts\\' + '*_fillssideclient.csv'), reverse=True)
file = file_names[0]
# First 8 characters of the file name; presumably a YYYYMMDD stamp, as in the
# manual example below -- TODO confirm. Reused later in the output file name.
date = os.path.basename(file)[0:8]
#file = "20151215_fillssideclient.csv" ### For manual file pulls
df = pd.read_csv(file)
### Column Renaming, NaN Replacing & DataFrame Column Additions
df.rename(columns={'provider':'Providers'}, inplace=True)
# NOTE(review): this replaces NaN with the string 'All Tags' in every column,
# numeric ones included -- confirm that only the tag/grouping columns contain NaN.
df = df.replace(np.nan,'All Tags', regex=True)
# The three averages below pick columns by position (starting at column 30), so
# they depend on the column order of the CSV. Each row sum is cast to int64
# before being divided by the number of columns that were summed.
df['five_avg'] = df.iloc[:, 30:40].sum(axis=1).astype('int64') / 10 #Added column at end of df for 5s avg
df['ten_avg'] = df.iloc[:, 30:50].sum(axis=1).astype('int64') / 20 #Added column at end of df for 10s avg
df['twenty_avg'] = df.iloc[:, 30:70].sum(axis=1).astype('int64') / 40 #Added column at end of df for 20s avg
# Per-group summary statistics. This is the part that has to be repeated for each
# of the 5 groupings (one output sheet per grouping). It expects `grouped` to
# already hold a df.groupby(...) result.
### Primary DataFrame Calculations
# Sum of 'filled' over the whole frame (not per group); denominator of `share`.
# Presumably 'filled' is a 0/1 flag (rows are compared against 1 below) -- TODO confirm.
filled_total = df['filled'].sum()
# Per group: number of non-null 'filled' values.
order_total = grouped['filled'].count()
# Per group: sum of 'filled'.
total_tickets = grouped['filled'].sum()
# Each group's portion of the frame-wide 'filled' total.
share = total_tickets / filled_total
fill_rate = total_tickets / order_total
total_size = grouped['fill_size'].sum()
avg_size = total_size / total_tickets
### One Second Calculations
# Sum of 'filled' over the rows whose '1000' value is below -25, divided by the
# group's 'filled' total.
one_toxicity = grouped.apply(lambda x: x['filled'][x['1000'] < -25].sum()) / total_tickets
# Mean and 25th/50th/75th percentiles of column '1000', using only rows where filled == 1.
one_average = grouped.apply(lambda x: x[x['filled'] == 1]['1000'].mean())
one_low = grouped.apply(lambda x: x[x['filled'] == 1]['1000'].quantile(.25))
one_med = grouped.apply(lambda x: x[x['filled'] == 1]['1000'].quantile(.50))
one_high = grouped.apply(lambda x: x[x['filled'] == 1]['1000'].quantile(.75))
### Five Second Calculations
# Same mean/percentile statistics on the derived 'five_avg' column; the toxicity
# and standard-deviation variants are left disabled.
#five_toxicity = grouped.apply(lambda x: x['filled'][x['5000'] < -25].sum()) / total_tickets
five_average = grouped.apply(lambda x: x[x['filled'] == 1]['five_avg'].mean())
five_low = grouped.apply(lambda x: x[x['filled'] == 1]['five_avg'].quantile(.25))
five_med = grouped.apply(lambda x: x[x['filled'] == 1]['five_avg'].quantile(.50))
five_high = grouped.apply(lambda x: x[x['filled'] == 1]['five_avg'].quantile(.75))
#five_std = grouped.apply(lambda x: x[x['filled'] == 1]['five_avg'].std())
### Ten Second Calculations
# Same statistics on the derived 'ten_avg' column.
#ten_toxicity = grouped.apply(lambda x: x['filled'][x['10000'] < -25].sum()) / total_tickets
ten_average = grouped.apply(lambda x: x[x['filled'] == 1]['ten_avg'].mean())
ten_low = grouped.apply(lambda x: x[x['filled'] == 1]['ten_avg'].quantile(.25))
ten_med = grouped.apply(lambda x: x[x['filled'] == 1]['ten_avg'].quantile(.50))
ten_high = grouped.apply(lambda x: x[x['filled'] == 1]['ten_avg'].quantile(.75))
#ten_std = grouped.apply(lambda x: x[x['filled'] == 1]['ten_avg'].std())
### Twenty Second Calculations
# Same statistics on the derived 'twenty_avg' column. Note that the result is
# named `twenty_avg`, not `twenty_average` like its siblings.
#twenty_toxicity = grouped.apply(lambda x: x['filled'][x['20000'] < -50].sum()) / total_tickets
twenty_avg = grouped.apply(lambda x: x[x['filled'] == 1]['twenty_avg'].mean())
twenty_low = grouped.apply(lambda x: x[x['filled'] == 1]['twenty_avg'].quantile(.25))
twenty_med = grouped.apply(lambda x: x[x['filled'] == 1]['twenty_avg'].quantile(.50))
twenty_high = grouped.apply(lambda x: x[x['filled'] == 1]['twenty_avg'].quantile(.75))
#twenty_std = grouped.apply(lambda x: x[x['filled'] == 1]['twenty_avg'].std())
### Column Formatting
# Number formats, disabled for now: they need a `workbook` object.
#comma_fmt = workbook.add_format({'num_format': '#,##0'})
#money_fmt = workbook.add_format({'num_format': '$#,##0.000'})
#percent_fmt = workbook.add_format({'num_format': '0.0%'})
# TODO: column widths, column formats and conditional formatting are still open.

# [column label, per-group Series] pairs, in the order the columns appear in the sheet.
list_of_lists = [
    ['Trades', total_tickets],
    ['Share %', share],
    ['Fill Rate', fill_rate],
    ['Total Size', total_size],
    ['Avg Size', avg_size],
    ['1s Toxic', one_toxicity],
    ['1s Avg', one_average],
    ['1s 25th', one_low],
    ['1s 50th', one_med],
    ['1s 75th', one_high],
    ['5s Avg', five_average],
    ['5s 25th', five_low],
    ['5s 50th', five_med],
    ['5s 75th', five_high],
    ['10s Avg', ten_average],
    ['10s 25th', ten_low],
    ['10s 50th', ten_med],
    ['10s 75th', ten_high],
    ['20s Avg', twenty_avg],
    ['20s 25th', twenty_low],
    ['20s 50th', twenty_med],
    ['20s 75th', twenty_high],
]

# Put the Series side by side (aligned on the group index), then label the columns.
result = pd.concat([series for _, series in list_of_lists], axis=1)
result.columns = [label for label, _ in list_of_lists]

# Keep only groups with a positive trade count; use `!= 0` instead to drop only
# the groups with exactly 0 trades.
result = result.loc[result['Trades'] > 0]
# Write `result` to "<date>_counterparty_monthly.xlsx" (relative to the current
# working directory) on a sheet called 'All Trades'.
# The workbook is saved and closed when the `with` block exits; this replaces
# ExcelWriter.save(), which was removed in pandas 2.0.
with pd.ExcelWriter(date + '_counterparty_monthly.xlsx', engine='xlsxwriter') as writer:
    result.to_excel(writer, sheet_name='All Trades')
    # Handle on the underlying xlsxwriter workbook (needed for add_format()).
    workbook = writer.book
    worksheet = writer.sheets['All Trades']
    worksheet.set_zoom(80)
    # Worksheet and print options
    worksheet.hide_gridlines(2)   # 2: hide gridlines on screen and in print
    worksheet.fit_to_pages(1, 1)  # print on one page wide by one page tall
答案 0 :(得分:2)
如果我理解正确(IIUC),你可以把各组分组列放进一个列表,然后用 for 循环遍历。最后,可以在工作表名称后面加上序号:
# One list of group-by columns per output sheet.
col = [['customer_account', 'CounterPartyID'],
       ['customer_account', 'CounterPartyID', 'symbol'],
       ['customer_account', 'CounterPartyID', 'Providers', 'symbol'],
       ['Providers', 'customer_account'],
       ['Providers', 'symbol']]

# The loop variable gets its own name so the list `col` is not rebound while it
# is being iterated. print() with a single argument behaves the same on
# Python 2 and Python 3.
for i, group_cols in enumerate(col):
    print(group_cols)
    print(i)
    #grouped = df.groupby(group_cols)
    # Sheet names are made unique by appending the loop index.
    sheetname = 'All Trades-' + str(i)
    print(sheetname)

# Output:
#['customer_account', 'CounterPartyID']
#0
#All Trades-0
#['customer_account', 'CounterPartyID', 'symbol']
#1
#All Trades-1
#['customer_account', 'CounterPartyID', 'Providers', 'symbol']
#2
#All Trades-2
#['Providers', 'customer_account']
#3
#All Trades-3
#['Providers', 'symbol']
#4
#All Trades-4
在第 133 行使用变量 sheetname:
# Write the current result to its own sheet, named by `sheetname`
# (instead of the fixed 'All Trades').
result.to_excel(writer, sheet_name=sheetname)
workbook = writer.book
# Look the new worksheet up under the same name to apply sheet-level options.
worksheet = writer.sheets[sheetname]
worksheet.set_zoom(80)
Excel 文件只能打开一次、保存一次:在循环之前创建 writer,在循环结束之后再保存:
# Skeleton only: the workbook is opened once, before the loop, and saved once,
# after the loop. The `with` block saves and closes it on exit; this replaces
# ExcelWriter.save(), which was removed in pandas 2.0.
with pd.ExcelWriter(date + '_counterparty_monthly.xlsx', engine='xlsxwriter') as writer:
    # The loop variable has its own name so the list `col` is not rebound.
    for i, group_cols in enumerate(col):
        #print(group_cols)
        #print(i)
        grouped = df.groupby(group_cols)
        # ...
        # ... the per-group calculations, result.to_excel(writer, sheet_name=sheetname)
        # ... and `worksheet = writer.sheets[sheetname]` go here
        # ...
        # Worksheet and print options
        worksheet.hide_gridlines(2)
        worksheet.fit_to_pages(1, 1)
完整代码如下:
import pandas as pd
import numpy as np
import csv
import time
import glob
import datetime
import re
import sys
import os
from dateutil import relativedelta
from xlsxwriter.utility import xl_rowcol_to_cell
'''This is where I find the file with the compiled data and add the needed columns to the df'''
### File Finding Stuff
# Matching file names are sorted in descending order, so index 0 is the
# lexicographically greatest one. Indexing raises IndexError if nothing matches.
file_names = sorted(glob.glob(r'T:\Tom\Scripts\\' + '*_fillssideclient.csv'), reverse=True)
file = file_names[0]
# First 8 characters of the file name; presumably a YYYYMMDD stamp, as in the
# manual example below -- TODO confirm. Reused later in the output file name.
date = os.path.basename(file)[0:8]
#file = "20151215_fillssideclient.csv" ### For manual file pulls
df = pd.read_csv(file)
### Column Renaming, NaN Replacing & DataFrame Column Additions
df.rename(columns={'provider':'Providers'}, inplace=True)
# NOTE(review): this replaces NaN with the string 'All Tags' in every column,
# numeric ones included -- confirm that only the tag/grouping columns contain NaN.
df = df.replace(np.nan,'All Tags', regex=True)
# The three averages below pick columns by position (starting at column 30), so
# they depend on the column order of the CSV. Each row sum is cast to int64
# before being divided by the number of columns that were summed.
df['five_avg'] = df.iloc[:, 30:40].sum(axis=1).astype('int64') / 10 #Added column at end of df for 5s avg
df['ten_avg'] = df.iloc[:, 30:50].sum(axis=1).astype('int64') / 20 #Added column at end of df for 10s avg
df['twenty_avg'] = df.iloc[:, 30:70].sum(axis=1).astype('int64') / 40 #Added column at end of df for 20s avg
# One list of group-by columns per output sheet.
col = [['customer_account', 'CounterPartyID'],
       ['customer_account', 'CounterPartyID', 'symbol'],
       ['customer_account', 'CounterPartyID', 'Providers', 'symbol'],
       ['Providers', 'customer_account'],
       ['Providers', 'symbol']]

# Sum of 'filled' over the whole frame. It does not depend on the grouping, so
# it is computed once instead of on every loop iteration.
filled_total = df['filled'].sum()

# The workbook "<date>_counterparty_monthly.xlsx" is opened once, receives one
# sheet per grouping, and is saved and closed when the `with` block exits.
# This replaces ExcelWriter.save(), which was removed in pandas 2.0.
with pd.ExcelWriter(date + '_counterparty_monthly.xlsx', engine='xlsxwriter') as writer:
    # The loop variable has its own name so the list `col` is not rebound while
    # it is being iterated.
    for i, group_cols in enumerate(col):
        grouped = df.groupby(group_cols)
        # Sheet names are made unique by appending the loop index.
        sheetname = 'All Trades-' + str(i)

        ### Primary DataFrame Calculations
        order_total = grouped['filled'].count()    # non-null 'filled' values per group
        total_tickets = grouped['filled'].sum()    # sum of 'filled' per group
        share = total_tickets / filled_total       # portion of the frame-wide total
        fill_rate = total_tickets / order_total
        total_size = grouped['fill_size'].sum()
        avg_size = total_size / total_tickets

        ### One Second Calculations
        # Sum of 'filled' over the rows whose '1000' value is below -25, divided
        # by the group's 'filled' total.
        one_toxicity = grouped.apply(lambda x: x['filled'][x['1000'] < -25].sum()) / total_tickets
        # Mean and 25th/50th/75th percentiles of '1000', using only rows where filled == 1.
        one_average = grouped.apply(lambda x: x[x['filled'] == 1]['1000'].mean())
        one_low = grouped.apply(lambda x: x[x['filled'] == 1]['1000'].quantile(.25))
        one_med = grouped.apply(lambda x: x[x['filled'] == 1]['1000'].quantile(.50))
        one_high = grouped.apply(lambda x: x[x['filled'] == 1]['1000'].quantile(.75))

        ### Five Second Calculations
        # Same statistics on the derived 'five_avg' column.
        #five_toxicity = grouped.apply(lambda x: x['filled'][x['5000'] < -25].sum()) / total_tickets
        five_average = grouped.apply(lambda x: x[x['filled'] == 1]['five_avg'].mean())
        five_low = grouped.apply(lambda x: x[x['filled'] == 1]['five_avg'].quantile(.25))
        five_med = grouped.apply(lambda x: x[x['filled'] == 1]['five_avg'].quantile(.50))
        five_high = grouped.apply(lambda x: x[x['filled'] == 1]['five_avg'].quantile(.75))
        #five_std = grouped.apply(lambda x: x[x['filled'] == 1]['five_avg'].std())

        ### Ten Second Calculations
        # Same statistics on the derived 'ten_avg' column.
        #ten_toxicity = grouped.apply(lambda x: x['filled'][x['10000'] < -25].sum()) / total_tickets
        ten_average = grouped.apply(lambda x: x[x['filled'] == 1]['ten_avg'].mean())
        ten_low = grouped.apply(lambda x: x[x['filled'] == 1]['ten_avg'].quantile(.25))
        ten_med = grouped.apply(lambda x: x[x['filled'] == 1]['ten_avg'].quantile(.50))
        ten_high = grouped.apply(lambda x: x[x['filled'] == 1]['ten_avg'].quantile(.75))
        #ten_std = grouped.apply(lambda x: x[x['filled'] == 1]['ten_avg'].std())

        ### Twenty Second Calculations
        # Same statistics on the derived 'twenty_avg' column.
        #twenty_toxicity = grouped.apply(lambda x: x['filled'][x['20000'] < -50].sum()) / total_tickets
        twenty_avg = grouped.apply(lambda x: x[x['filled'] == 1]['twenty_avg'].mean())
        twenty_low = grouped.apply(lambda x: x[x['filled'] == 1]['twenty_avg'].quantile(.25))
        twenty_med = grouped.apply(lambda x: x[x['filled'] == 1]['twenty_avg'].quantile(.50))
        twenty_high = grouped.apply(lambda x: x[x['filled'] == 1]['twenty_avg'].quantile(.75))
        #twenty_std = grouped.apply(lambda x: x[x['filled'] == 1]['twenty_avg'].std())

        ### Column Formatting
        #comma_fmt = workbook.add_format({'num_format': '#,##0'})
        #money_fmt = workbook.add_format({'num_format': '$#,##0.000'})
        #percent_fmt = workbook.add_format({'num_format': '0.0%'})
        # TODO: column widths, column formats and conditional formatting are still open.

        # [column label, per-group Series] pairs, in the order the columns appear in the sheet.
        list_of_lists = [
            ['Trades', total_tickets],
            ['Share %', share],
            ['Fill Rate', fill_rate],
            ['Total Size', total_size],
            ['Avg Size', avg_size],
            ['1s Toxic', one_toxicity],
            ['1s Avg', one_average],
            ['1s 25th', one_low],
            ['1s 50th', one_med],
            ['1s 75th', one_high],
            ['5s Avg', five_average],
            ['5s 25th', five_low],
            ['5s 50th', five_med],
            ['5s 75th', five_high],
            ['10s Avg', ten_average],
            ['10s 25th', ten_low],
            ['10s 50th', ten_med],
            ['10s 75th', ten_high],
            ['20s Avg', twenty_avg],
            ['20s 25th', twenty_low],
            ['20s 50th', twenty_med],
            ['20s 75th', twenty_high],
        ]
        result = pd.concat([lst[1] for lst in list_of_lists], axis=1)
        result.columns = [lst[0] for lst in list_of_lists]
        # Keep only groups with a positive trade count; use `!= 0` instead to
        # drop only the groups with exactly 0 trades.
        result = result[result.Trades > 0]

        # One sheet per grouping, all in the same workbook.
        result.to_excel(writer, sheet_name=sheetname)
        # Handle on the underlying xlsxwriter workbook (needed for add_format()).
        workbook = writer.book
        worksheet = writer.sheets[sheetname]
        worksheet.set_zoom(80)
        # Worksheet and print options
        worksheet.hide_gridlines(2)   # 2: hide gridlines on screen and in print
        worksheet.fit_to_pages(1, 1)  # print on one page wide by one page tall