Python sqlalchemy large file problem

Date: 2017-01-20 20:03:08

Tags: python python-2.7 sqlalchemy

I'm having trouble loading a large (23,000 records, 10 fields) airport-codes csv file into a sqlalchemy database with the following code:

from numpy import genfromtxt
from time import time
from datetime import datetime
from sqlalchemy import Column, Integer, Float, Date, String
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

def Load_Data(file_name):
    #Read the string columns verbatim; note that on numpy >= 1.10 the
    #skiprows argument was removed in favour of skip_header
    f = lambda s: str(s)
    data = genfromtxt(file_name, delimiter=',', skiprows=1,
                      converters={0: f, 1: f, 2: f, 6: f, 7: f, 8: f, 9: f, 10: f})
    return data.tolist()

Base = declarative_base()

class AirportCode(Base):
    #Tell SQLAlchemy the table name and any table-specific arguments it should know about
    __tablename__ = 'AirportCode'
    __table_args__ = {'sqlite_autoincrement': True}
    #Tell SQLAlchemy the name of each column and its attributes:
    id = Column(Integer, primary_key=True, nullable=False)
    ident = Column(String)
    type = Column(String)
    name = Column(String)
    latitude_deg = Column(String)
    longitude_deg = Column(String)
    elevation_ft = Column(String)
    continent = Column(String)
    iso_country = Column(String)
    iso_region = Column(String)
    municipality = Column(String)
    gps_code = Column(String)


    def __repr__(self):
        #return "<AirportCode(name='%s', municipality='%s')>\n" % (self.name, self.municipality)
        return "name:{} municipality:{}\n".format(self.name, self.municipality)

if __name__ == "__main__":
    t = time()

    #Create the database
    engine = create_engine('sqlite:///airport-codes.db')
    Base.metadata.create_all(engine)

    #Create the session
    session = sessionmaker()
    session.configure(bind=engine)
    s = session()


    records_to_commit = 0
    file_name = "airport-codes.csv"        #23,000 records -- fails at the Load_Data call below
    #file_name = "airport-codes.alaska"    #250-record subset -- loads fine
    print file_name #for debugging
    data = Load_Data(file_name)  #fails here on the large file
    print 'file loaded' #for debugging

    for i in data:
        records_to_commit += 1

        record = AirportCode(**{
            'ident': i[0].lower(),
            'type': i[1].lower(),
            'name': i[2].lower(),
            'latitude_deg': i[3],
            'longitude_deg': i[4],
            'elevation_ft': i[5],
            'continent': i[6],
            'iso_country': i[7],
            'iso_region': i[8],
            'municipality': i[9].lower(),
            'gps_code': i[10].lower()
        })

        s.add(record)  #Add each record to the session

        #if records_to_commit == 1000:
            #s.flush() #Attempt to commit batch of 1000 records
            #records_to_commit = 0
    s.commit()  #flushes everything remaining + commits

    s.close() #Close the connection
    print "Time elapsed: " + str(time() - t) + " s."

I adapted this code from other posts on this forum, and it works fine when I use a 250-record subset (the Alaska airports) of the main csv file.

When I try the full 23,000-record file, the program fails at this line:

data = Load_Data(file_name)

I am running this on a Raspberry Pi 3.
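
A quick way to see which rows are tripping up genfromtxt is a field-count pass with the stdlib csv module (a minimal sketch, assuming the same airport-codes.csv; csv.reader parses quoted fields correctly, so any row flagged here is one a plain comma split would mangle):

import csv

with open('airport-codes.csv', 'rb') as f:   #binary mode for the Python 2 csv module
    reader = csv.reader(f)
    header = next(reader)
    for line_num, row in enumerate(reader, start=2):
        #A field count that differs from the header, or a comma inside
        #a parsed field, marks a row a plain comma split handles wrongly
        if len(row) != len(header) or any(',' in field for field in row):
            print("suspect row at line %d: %r" % (line_num, row))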

1 Answer:

Answer 0 (score: 0)

Thanks for the helpful comments. Removing the try/except exposed the problem: the file contains many international characters, extra commas inside fields, special characters, and so on, and these were breaking the load. The Alaska airport entries had none of these errors, so that subset loaded fine.
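
One way to keep such rows instead of deleting them (a sketch, not what I ended up doing) is to replace the genfromtxt loader with the stdlib csv module, which understands commas inside quoted fields; this assumes the same 11 columns consumed as i[0]..i[10] in the insert loop:

import csv

def Load_Data(file_name):
    #Drop-in replacement for the genfromtxt version: csv.reader copes
    #with commas inside quoted fields, which a plain delimiter split
    #does not. Rows without the expected 11 columns are skipped.
    with open(file_name, 'rb') as f:   #binary mode for the Python 2 csv module
        reader = csv.reader(f)
        next(reader)                   #skip the header row
        return [row for row in reader if len(row) == 11]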

The database now loads 22,000 records in 32 seconds. I deleted roughly 1,000 entries because they were foreign airports and I want this to be a US airport directory.
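
If load time matters, the batching idea that is commented out in the question also works. Here is a minimal sketch (assuming the same session s and data list as above) that commits every 1000 rows rather than holding all 22,000 pending objects at once:

BATCH_SIZE = 1000   #commit in chunks to bound the pending unit of work

for count, i in enumerate(data, start=1):
    s.add(AirportCode(
        ident=i[0].lower(), type=i[1].lower(), name=i[2].lower(),
        latitude_deg=i[3], longitude_deg=i[4], elevation_ft=i[5],
        continent=i[6], iso_country=i[7], iso_region=i[8],
        municipality=i[9].lower(), gps_code=i[10].lower()))
    if count % BATCH_SIZE == 0:
        s.commit()   #flush + commit this batch
s.commit()           #commit the final partial batch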