模糊匹配的问题

时间:2019-05-14 15:22:22

标签: python dataframe fuzzywuzzy

我正在尝试根据条件从2个数据框中提取一组行。之后,我尝试比较基于键(RECORD_ID)提取的行的每一列的值。当我尝试比较完整的数据帧时,与比较数据帧中提取的行相比,我得到了不同的结果。根据数据框中可用的数据,两个结果必须相同。

import pandas as pd
from pathlib import Path
import datetime
import numpy as np
#import sys
from fuzzywuzzy import fuzz
from fuzzywuzzy import process    

use_case_num = input("Enter the use case number being tested : ")
#pol_num = input("Enter the Policy Number that is being tested : ")
use_case_fl = 'N'

#The Use Case matrix is being specified here
path_use_case = Path('use_case_matrix.xlsx')
df_use_case = pd.read_excel(path_use_case).fillna(0)
use_case_index = df_use_case.columns[0]

#Check if use case entered by user is present in the use_case_matrix
for row in df_use_case.index:
    if str(use_case_num) == str(df_use_case.at[row,'CASE_IDENTIFIER']):
        use_case_fl = 'Y'
        break
    else:
        use_case_fl = 'N'

if use_case_fl == 'Y':
    #Add the names of the spreadsheets with data that you would want to compare
    path_ideal = Path('stat_prem_ideal_0502.xlsx')
    path_actual = Path('stat_prem_actual_0502.xlsx')

    df = pd.read_excel(path_actual)

    #The first column should ideally be the key/index for this comparison. If not, change the index to represent the right index
    index_col = df.columns[0]
    print('\nIndex column: {}\n'.format(index_col))

    df_ideal = pd.read_excel(path_ideal, index_col=index_col).fillna(0)
    df_actual = pd.read_excel(path_actual, index_col=index_col).fillna(0)

    for row in df_ideal.index:
        df_ideal_select = df_ideal.loc[df_ideal.CASE_IDENTIFIER == int(use_case_num)]

    for row in df_actual.index:
        df_actual_select = df_actual.loc[df_actual.CASE_IDENTIFIER == int(use_case_num)]


    i=0
    match=0
    check=0
    alert=0
    error=0
    no_of_rows = len(df_actual)
    column_names=df_ideal.columns
    var_names_ideal=column_names+'_IDEAL'
    var_names_actual=column_names+'_ACTUAL'
    tokens=column_names+'_TOKEN'
    matched_rows = []

    #Looping through the rows and columns of both spreadsheets to compare values
    for i in range(0,len(column_names)):
        for row in df_ideal_select.index: <---- when i use df_ideal in this entire loop, I get the expected results
            var_names_ideal.i=df_ideal_select.at[row,column_names[i]]
        for row in df_actual_select.index: <---- when i use df_actual in this entire loop, I get the expected results
            var_names_actual.i=df_actual_select.at[row,column_names[i]]
        for row in df_actual_select.index:
            tokens.i = fuzz.partial_ratio(str(var_names_ideal.i),str(var_names_actual.i))
            matched_rows.append([row,column_names[i],var_names_ideal.i,var_names_actual.i,tokens.i])

            matched_rows.sort(key=lambda x : x[0])
            dfDiff=pd.DataFrame(matched_rows)

            if tokens.i == 100:
                match = match+1
            elif (tokens.i > 70) and (tokens.i < 100):
                check = check+1
            elif (tokens.i > 50) and (tokens.i <= 70):
                alert = alert+1
            elif (tokens.i <= 50):
                error = error+1

    #Writing the output spreadsheet with comparison ratios
    fname = 'test-fuzzy.xlsx'
    writer = pd.ExcelWriter(fname, engine='xlsxwriter')

    dfDiff.to_excel(writer, sheet_name='Fuzz Match Details', index=True)
    df_actual_select.to_excel(writer, sheet_name=path_actual.stem, index=True)
    df_ideal_select.to_excel(writer,sheet_name=path_ideal.stem,index=True)
    workbook  = writer.book

    #Excel edits for detail sheet
    worksheet = writer.sheets['Fuzz Match Details']
    worksheet.set_default_row(15)
    worksheet.set_column('B:B',5)
    worksheet.set_column('C:E',40)
    worksheet.set_column('F:F',10)
    worksheet.write('B1','INDEX')
    worksheet.write('C1','COLUMN NAME')
    worksheet.write('D1','IDEAL VALUES')
    worksheet.write('E1','ACTUAL VALUES')
    worksheet.write('F1','MATCH RATIO')

    match_fmt = workbook.add_format({'bg_color':'green'})
    check_fmt = workbook.add_format({'bg_color':'blue'})
    alert_fmt = workbook.add_format({'bg_color':'#FFA500'})
    error_fmt = workbook.add_format({'bg_color':'red'})

    #Formatting colors for the spreadsheet depicting problematic fields
    worksheet.conditional_format('F2:F10000', {'type': 'cell',
                                            'criteria': 'between',
                                            'minimum':71,
                                            'maximum':99,
                                            'format': check_fmt})
    worksheet.conditional_format('F2:F10000', {'type': 'cell',
                                            'criteria': 'between',
                                            'minimum':51,
                                            'maximum':70,
                                            'format': alert_fmt})
    worksheet.conditional_format('F2:F10000', {'type': 'cell',
                                            'criteria': '<=', 
                                            'value':50,
                                            'format': error_fmt})
    worksheet.conditional_format('F2:F10000', {'type': 'cell',
                                            'criteria': '==',
                                            'value':100,
                                            'format': match_fmt})


    writer.save()
    print('Number of matched rows: {}'.format(match))
    print('Number of rows to be checked: {}'.format(check))
    print('Number of rows on alert: {}'.format(alert))
    print('Number of erroneous rows: {}'.format(error))
    print('\n Done')
else:
    print ("Use case not defined!")

0 个答案:

没有答案