长度不符

时间:2019-10-26 00:42:10

标签: python pandas dataframe geocoder

我编写了将物理地址转换为地理位置→纬度和经度的代码,但是该代码显示了此错误

ValueError:长度不匹配:预期轴有7个元素,新值有4个元素


  File "<ipython-input-261-0135aa76d655>", line 1, in <module>
    runfile('C:/Users/xxx/.spyder-py3/eso.py', wdir='C:/Users/xxx/.spyder-py3')

  File "C:\Users\xxx\AppData\Local\Continuum\anaconda3\lib\site-packages\spyder_kernels\customize\spydercustomize.py", line 827, in runfile
    execfile(filename, namespace)

  File "C:\Users\xxx\AppData\Local\Continuum\anaconda3\lib\site-packages\spyder_kernels\customize\spydercustomize.py", line 110, in execfile
    exec(compile(f.read(), filename, 'exec'), namespace)

  File "C:/Users/xxx/.spyder-py3/eso.py", line 192, in <module>
    write_data(results, i + start_index + 1)

  File "C:/Users/xxx/.spyder-py3/eso.py", line 127, in write_data
    done.columns = ['Full_Address', 'Lat', 'Long', 'Provider']

  File "C:\Users\xxx\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\generic.py", line 5080, in __setattr__
    return object.__setattr__(self, name, value)

  File "pandas/_libs/properties.pyx", line 69, in pandas._libs.properties.AxisProperty.__set__

  File "C:\Users\xxx\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\generic.py", line 638, in _set_axis
    self._data.set_axis(axis, labels)

  File "C:\Users\xxx\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\internals\managers.py", line 155, in set_axis
    'values have {new} elements'.format(old=old_len, new=new_len))

ValueError: Length mismatch: Expected axis has 7 elements, new values have 4 elements

当前输出为

当前输出将结果存储在列表中,如所附图像所示 enter image description here

预期输出为:

将结果存储在 DataFrame 而不是 list 中,其中 DataFrame 6列(“ Client_ID “,“ Client_Name”,“ Site_ID”,“ Full_Address”,“ Latitude”,“ Longitude”),然后以相同顺序将DataFrame写入CSV文件。

该代码几乎不需要修改。

到目前为止,我已经编写了此脚本。

import geocoder
import requests
import time
import glob
import pandas as pd
# ----------------------------- CONFIGURATION -----------------------------#
path = r'D:\SWAM\ERP_Data'  # Path of Data

all_files = glob.glob(path + "/*.csv")
li = []
for filename in all_files:
     df = pd.read_csv(filename,sep=';', index_col=None, header=0,encoding='latin-1')
     #df = pd.read_csv(filename, sep='\t', index_col=None, header=0)
     li.append(df)
ERP_Data = pd.concat(li, axis=0, ignore_index=True)

# rename the columns name
ERP_Data.columns = ['Client_ID', 'Client_Name', 'FORME_JURIDIQUE_CLIENT', 'CODE_ACTIVITE_CLIENT', 'LIB_ACTIVITE_CLIENT', 'NACE', 
                'Company_Type', 'Number_of_Collected_Bins', 'STATUT_TI', 'TYPE_TI', 'HEURE_PASSAGE_SOUHAITE', 'FAMILLE_AFFAIRE',
                'CODE_AFFAIRE_MOUVEMENT', 'TYPE_MOUVEMENT_PIECE', 'Freq_Collection', 'Waste_Type', 'CDNO', 'CDQTE', 
                'BLNO', 'Collection_Date', 'Weight_Ton', 'Bin_Capacity', 'REF_SS_REF_CONTENANT_BL', 'REF_DECHET_PREVU_TI', 
                'Site_ID', 'Site_Name', 'Street', 'ADRCPL1_SITE', 'ADRCPL2_SITE', 'Post_Code',
                'City', 'Country','ZONE_POLYGONE_SITE' ,'OBSERVATION_SITE', 'OBSERVATION1_SITE', 'HEURE_DEBUT_INTER_MATIN_SITE', 
                'HEURE_FIN_INTER_MATIN_SITE', 'HEURE_DEBUT_INTER_APREM_SITE', 'HEURE_DEBUT_INTER_APREM_SITE', 'JOUR_PASSAGE_INTERDIT', 'PERIODE_PASSAGE_INTERDIT', 'JOUR_PASSAGE_IMPERATIF',
                'PERIODE_PASSAGE_IMPERATIF']


# extract specfic columns into a new dataframe
Address_Table= ERP_Data[['Client_ID', 'Client_Name','Site_ID','Street','Post_Code','City','Country']].copy()

# Clean existing whitespace from the ends of the strings
Address_Table=Address_Table.apply(lambda x: x.str.strip(), axis=1)

Address_Table['Site_ID'] =ERP_Data['Site_ID']

# Adding a new column called (Full_Address) that concatenate address columns into one 
# for example   Karlaplan 13,115 20,STOCKHOLM,Stockholms län, Sweden
Address_Table['Full_Address'] = Address_Table[Address_Table.columns[3:]].apply(lambda x: ','.join(x.dropna().astype(str)), axis=1)

# remove duplicate values in order to minimize the number of gps requests 
 Temp=Address_Table.drop_duplicates(subset=["Client_ID", "Site_ID", "Client_Name" , "Full_Address"])  # df
addresses=Temp["Full_Address"]

Temp1=Temp.reset_index()
#Temp1=Temp1.drop(['index'], axis=1)  # remove the first column 'Address1'


# ----------------------------- CONFIGURATION -----------------------------#

# Set the input and output files
#input_file_path = "input.csv"
input_file_path = "input.csv"
output_file_path = "output"  # appends "####.csv" to the file name when it writes the file.



# Where the program starts processing the addresses in the input file
# This is useful in case the computer crashes so you can resume the program where it left off or so you can run multiple
# instances of the program starting at different spots in the input file
start_index = 0
# How often the program prints the status of the running program
status_rate = 100
# How often the program saves a backup file
write_data_rate = 1000
# How many times the program tries to geocode an address before it gives up
attempts_to_geocode = 3
# Time it delays each time it does not find an address
# Note that this is added to itself each time it fails so it should not be set to a large number
wait_time = 3



# ----------------------------- Function Definitions -----------------------------#

# Creates request sessions for geocoding
class GeoSessions:
    def __init__(self):
        self.Arcgis = requests.Session()
        self.Komoot = requests.Session()


# Class that is used to return 3 new sessions for each geocoding source
def create_sessions():
    return GeoSessions()


# Main geocoding function that uses the geocoding package to covert addresses into lat, longs
def geocode_address(address, s):
    g = geocoder.arcgis(address, session=s.Arcgis)
    if (g.ok == False):
        g = geocoder.komoot(address, session=s.Komoot)

    return g


def try_address(address, s, attempts_remaining, wait_time):
    g = geocode_address(address, s)
    if (g.ok == False):
        time.sleep(wait_time)
        s = create_sessions()  # It is not very likely that we can't find an address so we create new sessions and wait
        if (attempts_remaining > 0):
            try_address(address, s, attempts_remaining-1, wait_time+wait_time)
    return g


# Function used to write data to the output file
def write_data(data, index):
    file_name = (output_file_path + str(index) + ".csv")
    print("Created the file: " + file_name)
    done = pd.DataFrame(data)
    done.columns = ['Full_Address', 'Lat', 'Long', 'Provider']
    done.to_csv((file_name + ".csv"), sep=',', encoding='utf8')

# Variables used in the main for loop that do not need to be modified by the user
s = create_sessions()
results = []
failed = 0
total_failed = 0
progress = len(addresses) - start_index

# ----------------------------- Main Loop -----------------------------#

for i, address in enumerate(addresses[start_index:]):
    # Print the status of how many addresses have be processed so far and how many of the failed.
    if ((start_index + i) % status_rate == 0):
        total_failed += failed
        print(
            "Completed {} of {}. Failed {} for this section and {} in total.".format(i + start_index, progress, failed,
                                                                                     total_failed))
        failed = 0

    # Try geocoding the addresses
    try:
        g = try_address(address, s, attempts_to_geocode, wait_time)
        if (g.ok == False):
            results.append([address, "was", "not", "geocoded"])
            print("Gave up on address: " + address)
            failed += 1
        else:
#some modification here            results.append([Temp1.Client_ID[i],Temp1.Client_Name[i],Temp1.Site_ID[i],address, g.latlng[0], g.latlng[1], g.provider])



            "Client_ID", "Site_ID", "Client_Name"

    # If we failed with an error like a timeout we will try the address again after we wait 5 secs
    except Exception as e:
        print("Failed with error {} on address {}. Will try again.".format(e, address))
        try:
            time.sleep(5)
            s = create_sessions()
            g = geocode_address(address, s)
            if (g.ok == False):
                print("Did not fine it.")
                results.append([address, "was", "not", "geocoded"])
                failed += 1
            else:
                print("Successfully found it.")

# some modification here                results.append([Temp1.Client_ID[i],Temp1.Client_Name[i],Temp1.Site_ID[i],address, g.latlng[0], g.latlng[1], g.provider])
        except Exception as e:
            print("Failed with error {} on address {} again.".format(e, address))
            failed += 1
            results.append([address, e, e, "ERROR"])

    # Writing what has been processed so far to an output file
    if (i%write_data_rate == 0 and i != 0):
        write_data(results, i + start_index)

    # print(i, g.latlng, g.provider)


# Finished (I guess we need some modification here)
write_data(results, i + start_index + 1)
print("Finished! :)")

0 个答案:

没有答案