我正在尝试根据条件从2个数据框中提取一组行。之后,我尝试比较基于键(RECORD_ID)提取的行的每一列的值。当我尝试比较完整的数据帧时,与比较数据帧中提取的行相比,我得到了不同的结果。根据数据框中可用的数据,两个结果必须相同。
import pandas as pd
from pathlib import Path
import datetime
import numpy as np
#import sys
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
use_case_num = input("Enter the use case number being tested : ")
#pol_num = input("Enter the Policy Number that is being tested : ")
use_case_fl = 'N'
#The Use Case matrix is being specified here
path_use_case = Path('use_case_matrix.xlsx')
df_use_case = pd.read_excel(path_use_case).fillna(0)
use_case_index = df_use_case.columns[0]
#Check if use case entered by user is present in the use_case_matrix
for row in df_use_case.index:
if str(use_case_num) == str(df_use_case.at[row,'CASE_IDENTIFIER']):
use_case_fl = 'Y'
break
else:
use_case_fl = 'N'
if use_case_fl == 'Y':
#Add the names of the spreadsheets with data that you would want to compare
path_ideal = Path('stat_prem_ideal_0502.xlsx')
path_actual = Path('stat_prem_actual_0502.xlsx')
df = pd.read_excel(path_actual)
#The first column should ideally be the key/index for this comparison. If not, change the index to represent the right index
index_col = df.columns[0]
print('\nIndex column: {}\n'.format(index_col))
df_ideal = pd.read_excel(path_ideal, index_col=index_col).fillna(0)
df_actual = pd.read_excel(path_actual, index_col=index_col).fillna(0)
for row in df_ideal.index:
df_ideal_select = df_ideal.loc[df_ideal.CASE_IDENTIFIER == int(use_case_num)]
for row in df_actual.index:
df_actual_select = df_actual.loc[df_actual.CASE_IDENTIFIER == int(use_case_num)]
i=0
match=0
check=0
alert=0
error=0
no_of_rows = len(df_actual)
column_names=df_ideal.columns
var_names_ideal=column_names+'_IDEAL'
var_names_actual=column_names+'_ACTUAL'
tokens=column_names+'_TOKEN'
matched_rows = []
#Looping through the rows and columns of both spreadsheets to compare values
for i in range(0,len(column_names)):
for row in df_ideal_select.index: <---- when i use df_ideal in this entire loop, I get the expected results
var_names_ideal.i=df_ideal_select.at[row,column_names[i]]
for row in df_actual_select.index: <---- when i use df_actual in this entire loop, I get the expected results
var_names_actual.i=df_actual_select.at[row,column_names[i]]
for row in df_actual_select.index:
tokens.i = fuzz.partial_ratio(str(var_names_ideal.i),str(var_names_actual.i))
matched_rows.append([row,column_names[i],var_names_ideal.i,var_names_actual.i,tokens.i])
matched_rows.sort(key=lambda x : x[0])
dfDiff=pd.DataFrame(matched_rows)
if tokens.i == 100:
match = match+1
elif (tokens.i > 70) and (tokens.i < 100):
check = check+1
elif (tokens.i > 50) and (tokens.i <= 70):
alert = alert+1
elif (tokens.i <= 50):
error = error+1
#Writing the output spreadsheet with comparison ratios
fname = 'test-fuzzy.xlsx'
writer = pd.ExcelWriter(fname, engine='xlsxwriter')
dfDiff.to_excel(writer, sheet_name='Fuzz Match Details', index=True)
df_actual_select.to_excel(writer, sheet_name=path_actual.stem, index=True)
df_ideal_select.to_excel(writer,sheet_name=path_ideal.stem,index=True)
workbook = writer.book
#Excel edits for detail sheet
worksheet = writer.sheets['Fuzz Match Details']
worksheet.set_default_row(15)
worksheet.set_column('B:B',5)
worksheet.set_column('C:E',40)
worksheet.set_column('F:F',10)
worksheet.write('B1','INDEX')
worksheet.write('C1','COLUMN NAME')
worksheet.write('D1','IDEAL VALUES')
worksheet.write('E1','ACTUAL VALUES')
worksheet.write('F1','MATCH RATIO')
match_fmt = workbook.add_format({'bg_color':'green'})
check_fmt = workbook.add_format({'bg_color':'blue'})
alert_fmt = workbook.add_format({'bg_color':'#FFA500'})
error_fmt = workbook.add_format({'bg_color':'red'})
#Formatting colors for the spreadsheet depicting problematic fields
worksheet.conditional_format('F2:F10000', {'type': 'cell',
'criteria': 'between',
'minimum':71,
'maximum':99,
'format': check_fmt})
worksheet.conditional_format('F2:F10000', {'type': 'cell',
'criteria': 'between',
'minimum':51,
'maximum':70,
'format': alert_fmt})
worksheet.conditional_format('F2:F10000', {'type': 'cell',
'criteria': '<=',
'value':50,
'format': error_fmt})
worksheet.conditional_format('F2:F10000', {'type': 'cell',
'criteria': '==',
'value':100,
'format': match_fmt})
writer.save()
print('Number of matched rows: {}'.format(match))
print('Number of rows to be checked: {}'.format(check))
print('Number of rows on alert: {}'.format(alert))
print('Number of erroneous rows: {}'.format(error))
print('\n Done')
else:
print ("Use case not defined!")