获取DataFrame作为参数

Question

我有一个地址匹配算法，该算法将一个客户ID的地址的密码与另一个客户ID的密码进行比较，并创建具有类似地址类型的客户ID的列表。接下来，我为每个客户ID列表分配一个特定的组号。但是问题是，大多数客户ID列表都按适当的组号分组，但是即使某些ID具有相同的地址，它们也按不同的数字分组。

我制作了地址匹配脚本，找到了具有相似地址的客户ID，并将其放在列表中，并为每个列表分配了一个组号。但是，即使在地址相似且位于同一列表中的组号不同之后，组号对于某些ID和某些ID也是正确的

import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

def how_similar(address,address_list):

    address = address.strip()
    address1_split = address.split(" ")
    pinCode_address1 = address1_split[-1]
    area_address1 = " ".join(address1_split[:-1])

    matched_address = []

    for add in address_list:
        score = 0
        add = add.strip()
        address2_split = add.split(" ")
        pinCode_address2 = address2_split[-1]
        area_address2 = " ".join(address2_split[:-1])


        area_score = fuzz.token_set_ratio(area_address1,area_address2)

        score+=area_score

        if score>= 90 :
            matched_address.append(add)


    return matched_address

获取DataFrame作为参数

def pat_match(df):

    # Getting the column values of id and address in seprate list
    id = df['COD_CUST_ID'].values.tolist()
    address = df['ADDRESS'].values.tolist()

    # Creating a new column with name 'Ids'
    df['Ids'] = ""
    length01=len(id)

    for y in range(0,length01):
        #print(y)
        # The mathched address Id will will be appended in a list for every address
        matched_ids = []

        # Calculating list of address with match percentage more than 80%
        #score=process.extractBests(address[y],address,score_cutoff=80)

        score= how_similar(address[y],address)


        # Iterating over every address returned by score one by one
        for matched_address in score:
            # Getting Customer_ID of every Address
            get = df['COD_CUST_ID'][df['ADDRESS']==matched_address].tolist()
            # Appending the Id into a list
            if get not in matched_ids:
                matched_ids.append(get)


        # Finally Appending the list of matched ID to the column
        df['Ids'][df['COD_CUST_ID']==id[y]] = str(matched_ids)

####################分组代码

   # Creating a new column with name 'Group' and 'Matched'
    df['group'] = ""
    df['Matched'] = ""
    grouped_customers = []

    Group_count = 1
    for i in range(length01):

        try:

            id_list = df['Ids'][i]
            new_list = id_list[1:-1]

            new_list=new_list.replace('[','')
            new_list=new_list.replace(']','')

            new_list = new_list.split(',')
            new_list = [int(x) for x in new_list]
            #print(new_list)
       # print(i)
            for id in new_list:

                if id not in grouped_customers:

                    df['group'][df['COD_CUST_ID'] == id] = Group_count
                    df['Matched'][df['COD_CUST_ID'] == id] = id

                    grouped_customers.append(id)

                    if id == new_list[-1]:
                        Group_count+=1

        except Exception as e:
            print(str(e))

    df.to_csv('match_score_8.csv',sep=',',index=None)

if __name__ == '__main__':
    data = pd.read_csv('address.csv')
    pat_match(data)

假设我的输出为

COD_CUST_ID         ADDRESS                                       Ids              group     Matched
23656386    VPO- DHANGER KHER ABOHAR  152116    [[23656386], [23656388, 23656387]]  8   23656386
23656388    VPO- DHANGER KHERA ABOHAR  152116   [[23656386], [23656388, 23656387]]  8   23656388
23656387    VPO- DHANGER KHERA ABOHAR  152116   [[23656386], [23656388, 23656387]]  8   23656387

这是大多数ID所需的实际输出。但是有一些ID的输出就像

COD_CUST_ID         ADDRESS                         Ids                     group   Matched
23656887    TODAWATA KI DHANI FATEHPURA BANSA 303806    [[23656887], [23666683], [23666717, 23666721, 23666773]]    416 23656933
23666683    - - FATEHPURA BANSA 303806        [[23656980], [23656887], [23666683], [23666717, 23666721, 23666773]]  196 23666664
23666717    BANSA   303806                                  [similar list of matching ids]  151 23666717
23666721    BANSA   303806                                  [similar list of matching ids]  151 23666721
23666773    BANSA   303806                      [similar list of matching ids]  151 23666773

匹配的ID应该在同一组中

组号输入错误

获取DataFrame作为参数

0 个答案: