Question

我想编写一个函数，用来纠正拼写错误的地点条目，或填补其中缺失的字段。思路是输入州、县和城市，将其与一份有效地点的列表进行比较，然后由该函数输出更正后的版本。

下面的函数对我来说一直可用，但它很慢，因为它要对每个地点条目进行大量的字符串比较。由于县或州有时为空，我还没有找到限制比较次数的好办法。我希望能有一种更快的方法来得到可接受的结果。

包含有效美国城市的数据集可以在 https://simplemaps.com/data/us-cities 免费下载。

import pandas as pd
import numpy as np
from difflib import SequenceMatcher as SM

# Load the reference table of valid US cities.
cities = pd.read_csv('Downloads/uscitiesv1.5.csv')

# Upper-cased (state, county, city) triples, one per CSV row, in file order.
CITIES = [
    (state.upper(), county.upper(), city.upper())
    for state, county, city in zip(
        cities['state_name'], cities['county_name'], cities['city'])
]

### Helper functions
def distance(a, b):
    """Return the case-insensitive similarity ratio of two strings.

    Both arguments are upper-cased and compared with difflib's
    SequenceMatcher with the autojunk heuristic disabled. Despite the
    name, a larger value means the strings are more alike: identical
    strings score 1.0 and strings with nothing in common score 0.0.
    """
    left = a.upper()
    right = b.upper()
    matcher = SM(a=left, b=right, autojunk=False)
    return matcher.ratio()

def findLocation(location):
    """Return the entry of CITIES that best matches a location triple.

    Args:
        location: a (state, county, city) sequence of strings; a missing
            field is given as ''.

    Returns:
        ('', '', '') when the city is empty (or all fields are empty).
        Otherwise the first entry of CITIES whose city matches exactly
        and exactly one of state/county matches exactly; failing that,
        the entry with the highest summed similarity over the three
        fields (the earliest entry wins ties).

    Raises:
        ValueError: if CITIES is empty and the input is not rejected by
            the empty-field guards.

    Note:
        The exact comparison uses ``==`` and is therefore case-sensitive,
        whereas the similarity score upper-cases both sides.
    """
    ## If more than 2 fields are empty, or the city is empty, nothing
    ## can be guessed: return blank
    if sum(part == '' for part in location) > 2 or location[2] == '':
        return ('', '', '')

    exactOK = [[False, True, True], [True, False, True]]

    ## Cheap pass first: return on 2 out of 3 exact matches including
    ## city. Doing this before any fuzzy scoring avoids computing three
    ## SequenceMatcher ratios for every candidate that precedes the hit,
    ## only to throw them away.
    for LOC in CITIES:
        exactCompare = [(a == b) for a, b in zip(location, LOC)]
        if exactCompare in exactOK:
            return LOC

    def score(LOC):
        ## The sum of scores for each of State, County, City
        return sum([distance(location[0], LOC[0]),
                    distance(location[1], LOC[1]),
                    distance(location[2], LOC[2])])

    ## max() returns the first maximal candidate, so ties resolve to the
    ## earliest entry of CITIES.
    return max(CITIES, key=score)

def emptyNA(s):
    """Map placeholder values for "missing" to the empty string.

    The value is converted with str(), stripped of surrounding whitespace
    and upper-cased before being compared with the known markers, so
    whitespace-only strings of any length and padded markers such as
    ' na ' are recognised too.

    Args:
        s: any value; typically a string cell from a DataFrame.

    Returns:
        '' if the value is a missing-data marker, otherwise ``s`` itself,
        unchanged (it is not stripped or upper-cased).
    """
    # '' also covers every whitespace-only string once stripped.
    missing_markers = ('NA', 'MISSING', 'NOT CLEAR', '', 'NAN')

    if str(s).strip().upper() in missing_markers:
        return ''
    return s

### Full location cleaning function
def cleanCities(data):
    """Clean a three-column (state, county, city) DataFrame of locations.

    Every row is upper-cased and has its missing-value markers blanked.
    Rows that are not already an exact entry of CITIES are replaced by
    the result of findLocation().

    Args:
        data: DataFrame with exactly three string-typed columns, in the
            order state, county, city.

    Returns:
        A new DataFrame with columns 'state_clean', 'county_clean' and
        'city_clean' and a fresh default index, one row per input row.
        If the input is rejected, 'invalid input' is printed and None is
        returned.
    """
    if len(data.columns) != 3:
        print('invalid input')
        return
    # Accept object columns as well as dedicated string dtypes.
    if not all(pd.api.types.is_string_dtype(dtype) for dtype in data.dtypes):
        print('invalid input')
        return

    ## Handle NAs
    # Blank out NaN/None *before* upper-casing: such cells are not strings
    # and calling .upper() on them raises AttributeError.
    data = data.fillna('')
    # Column-wise Series.map instead of the deprecated DataFrame.applymap.
    data = data.apply(
        lambda column: column.map(lambda cell: emptyNA(str(cell).upper())))

    ## Convert input to list of tuples
    # itertuples avoids positional r[0] lookups on label-indexed rows,
    # which newer pandas deprecates.
    rows = list(data.itertuples(index=False, name=None))

    # Build the lookup once so the exact-match test is O(1) per row
    # instead of a scan over the whole reference list.
    known = set(CITIES)

    ## Leave exact matches as is, replace everything else
    cleaned = [row if row in known else findLocation(row) for row in rows]

    # Passing the column names up front also works for an empty input.
    return pd.DataFrame(
        cleaned, columns=['state_clean', 'county_clean', 'city_clean'])

下面演示该函数如何处理一些杂乱的输入。

# Messy sample input, one (State, County, City) row per location:
# misspellings, a missing state, a missing city and a 'MISSING' marker.
test = pd.DataFrame(
    [
        ('MARLYAND', 'CAROLINE', 'CHOPTNK'),
        ('', 'DOÑANA', 'AFTON'),
        ('ARZONA', 'APAE', 'MCNARY'),
        ('NORTHCAROLINA', 'MITCHELL', 'LEDGER'),
        ('FLODA', 'ESCAMBEA', 'GOULDNG'),
        ('WASHINGTON', 'LWS', 'MOSSYROCK'),
        ('KENTCKY', 'CAMPBELL', ''),
        ('COLORADO', 'FREMONT', 'CAÑON CITY'),
        ('ARKANSAS', 'ARKANSAS', 'CROKETS BLUFF'),
        ('MISSING', 'ST. LOUIS', 'FRENCHRVER'),
    ],
    columns=['State', 'County', 'City'],
)

# Run the full cleaning pipeline on the sample.
cleaned = cleanCities(test)

print(test)
           State     County           City
0       MARLYAND   CAROLINE        CHOPTNK
1                    DOÑANA          AFTON
2         ARZONA       APAE         MCNARY
3  NORTHCAROLINA   MITCHELL         LEDGER
4          FLODA   ESCAMBEA        GOULDNG
5     WASHINGTON        LWS      MOSSYROCK
6        KENTCKY   CAMPBELL               
7       COLORADO    FREMONT     CAÑON CITY
8       ARKANSAS   ARKANSAS  CROKETS BLUFF
9        MISSING  ST. LOUIS     FRENCHRVER


print(cleaned)
      state_clean county_clean       city_clean
0        MARYLAND     CAROLINE         CHOPTANK
1      NEW MEXICO     DOÑA ANA            AFTON
2         ARIZONA       APACHE           MCNARY
3  NORTH CAROLINA     MITCHELL           LEDGER
4         FLORIDA     ESCAMBIA         GOULDING
5      WASHINGTON        LEWIS        MOSSYROCK
6                                              
7        COLORADO      FREMONT       CAÑON CITY
8        ARKANSAS     ARKANSAS  CROCKETTS BLUFF
9       MINNESOTA    ST. LOUIS     FRENCH RIVER

使用字符串比较来填补/清理地点名称

0 个答案: