我正在尝试从pandas DataFrame创建数据库表。我收到一个奇怪的错误,我无法用一个简单的例子复制。
这是代码
import pandas as pd
import os
import sqlite3
import sqlalchemy
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
path = 'sqlite://'
engine = create_engine(path, echo=False)
df_kerbs, df_flows = get_dataframes()
df = pd.merge(df_kerbs, df_flows, on=['Src', 'SrcPort', 'Dst', 'DstPort', 'First'], how='right')
df.to_sql(name='FlowsKerbsMerged', con=engine, index=False, if_exists='append',
index_label='First')
这是输出:
Traceback (most recent call last):
File "merge_data.py", line 61, in <module>
index_label='First')
File "/home/lpuggini/MyApps/scientific_python_2_7/lib/python2.7/site-packages/pandas/core/generic.py", line 1201, in to_sql
chunksize=chunksize, dtype=dtype)
File "/home/lpuggini/MyApps/scientific_python_2_7/lib/python2.7/site-packages/pandas/io/sql.py", line 470, in to_sql
chunksize=chunksize, dtype=dtype)
File "/home/lpuggini/MyApps/scientific_python_2_7/lib/python2.7/site-packages/pandas/io/sql.py", line 1147, in to_sql
table.create()
File "/home/lpuggini/MyApps/scientific_python_2_7/lib/python2.7/site-packages/pandas/io/sql.py", line 596, in create
self._execute_create()
File "/home/lpuggini/MyApps/scientific_python_2_7/lib/python2.7/site-packages/pandas/io/sql.py", line 581, in _execute_create
self.table.create()
File "/home/lpuggini/MyApps/scientific_python_2_7/lib/python2.7/site-packages/sqlalchemy/sql/schema.py", line 756, in create
checkfirst=checkfirst)
File "/home/lpuggini/MyApps/scientific_python_2_7/lib/python2.7/site-packages/sqlalchemy/engine/base.py", line 1929, in _run_visitor
conn._run_visitor(visitorcallable, element, **kwargs)
File "/home/lpuggini/MyApps/scientific_python_2_7/lib/python2.7/site-packages/sqlalchemy/engine/base.py", line 1538, in _run_visitor
**kwargs).traverse_single(element)
File "/home/lpuggini/MyApps/scientific_python_2_7/lib/python2.7/site-packages/sqlalchemy/sql/visitors.py", line 121, in traverse_single
return meth(obj, **kw)
File "/home/lpuggini/MyApps/scientific_python_2_7/lib/python2.7/site-packages/sqlalchemy/sql/ddl.py", line 767, in visit_table
include_foreign_key_constraints=include_foreign_key_constraints
File "/home/lpuggini/MyApps/scientific_python_2_7/lib/python2.7/site-packages/sqlalchemy/engine/base.py", line 945, in execute
return meth(self, multiparams, params)
File "/home/lpuggini/MyApps/scientific_python_2_7/lib/python2.7/site-packages/sqlalchemy/sql/ddl.py", line 68, in _execute_on_connection
return connection._execute_ddl(self, multiparams, params)
File "/home/lpuggini/MyApps/scientific_python_2_7/lib/python2.7/site-packages/sqlalchemy/engine/base.py", line 1002, in _execute_ddl
compiled
File "/home/lpuggini/MyApps/scientific_python_2_7/lib/python2.7/site-packages/sqlalchemy/engine/base.py", line 1189, in _execute_context
context)
File "/home/lpuggini/MyApps/scientific_python_2_7/lib/python2.7/site-packages/sqlalchemy/engine/base.py", line 1402, in _handle_dbapi_exception
exc_info
File "/home/lpuggini/MyApps/scientific_python_2_7/lib/python2.7/site-packages/sqlalchemy/util/compat.py", line 203, in raise_from_cause
reraise(type(exception), exception, tb=exc_tb, cause=cause)
File "/home/lpuggini/MyApps/scientific_python_2_7/lib/python2.7/site-packages/sqlalchemy/engine/base.py", line 1182, in _execute_context
context)
File "/home/lpuggini/MyApps/scientific_python_2_7/lib/python2.7/site-packages/sqlalchemy/engine/default.py", line 470, in do_execute
cursor.execute(statement, parameters)
sqlalchemy.exc.OperationalError: (sqlite3.OperationalError) duplicate column name: Application [SQL: u'\nCREATE TABLE "FlowsKerbsMerged" (\n\t"ClientIP" FLOAT, \n\t"ConversationIdentifier" FLOAT, \n\t"CorvilOrderId" FLOAT, \n\t"DomainName" FLOAT, \n\t"DomainNameSalt" FLOAT, \n\t"HostName" FLOAT, \n\t"HostNameAugmented" FLOAT, \n\t"HostNameSalt" FLOAT, \n\t"KRB AP-REQ.ap-options" FLOAT, \n\t"KRB AP-REQ.msg-type" FLOAT, \n\t"KRB AP-REQ.pvno" FLOAT, \n\t"KRB ETYPE-INFO2-ENTRY.etype" FLOAT, \n\t"KRB ETYPE-INFO2-ENTRY.salt" FLOAT, \n\t"KRB EncryptedData.cipher" FLOAT, \n\t"KRB EncryptedData.etype" FLOAT, \n\t"KRB EncryptedData.kvno" FLOAT, \n\t"KRB HostAddress.addr-type" FLOAT, \n\t"KRB HostAddress.address" FLOAT, \n\t"KRB KDC-REP.crealm" FLOAT, \n\t"KRB KDC-REP.msg-type" FLOAT, \n\t"KRB KDC-REP.pvno" FLOAT, \n\t"KRB KDC-REQ-BODY.etype.values" FLOAT, \n\t"KRB KDC-REQ-BODY.etype.values:repeating" FLOAT, \n\t"KRB KDC-REQ-BODY.kdc-options" FLOAT, \n\t"KRB KDC-REQ-BODY.nonce" FLOAT, \n\t"KRB KDC-REQ-BODY.realm" FLOAT, \n\t"KRB KDC-REQ-BODY.rtime" FLOAT, \n\t"KRB KDC-REQ-BODY.till" FLOAT, \n\t"KRB KDC-REQ.msg-type" FLOAT, \n\t"KRB KDC-REQ.pvno" FLOAT, \n\t"KRB KERB-PA-PAC-REQUEST.include-pac" FLOAT, \n\t"KRB KRB-ERROR.e-data" FLOAT, \n\t"KRB KRB-ERROR.error-code" FLOAT, \n\t"KRB KRB-ERROR.error-code_string" FLOAT, \n\t"KRB KRB-ERROR.error-description_string" FLOAT, \n\t"KRB KRB-ERROR.msg-type" FLOAT, \n\t"KRB KRB-ERROR.pvno" FLOAT, \n\t"KRB KRB-ERROR.realm" FLOAT, \n\t"KRB KRB-ERROR.stime" FLOAT, \n\t"KRB KRB-ERROR.susec" FLOAT, \n\t"KRB PA-DATA.padata-type" FLOAT, \n\t"KRB PA-DATA.padata-value" FLOAT, \n\t"KRB PA-ENC-TS-ENC.patimestamp" FLOAT, \n\t"KRB PA-PAC-OPTIONS.flags" FLOAT, \n\t"KRB PrincipalName.name-string.values" FLOAT, \n\t"KRB PrincipalName.name-string.values:repeating" FLOAT, \n\t"KRB PrincipalName.name-type" FLOAT, \n\t"KRB Ticket.realm" FLOAT, \n\t"KRB Ticket.tkt-vno" FLOAT, \n\t"Msg Decoder" FLOAT, \n\t"Msg ID" FLOAT, \n\t"Msg Type" FLOAT, \n\t"RequestReply" FLOAT, \n\t"ServerIP" FLOAT, \n\t"UserName" FLOAT, \n\t"UserNameAugmented" FLOAT, \n\t"UserNameSalt" FLOAT, \n\t"Watchlist Match" FLOAT, \n\tapplication FLOAT, \n\tbytes FLOAT, \n\tclient_port FLOAT, \n\tdir FLOAT, \n\t"Dst" TEXT, \n\t"DstPort" FLOAT, \n\tserver_port FLOAT, \n\t"Src" TEXT, \n\t"SrcPort" FLOAT, \n\ttime DATETIME, \n\tts_ns FLOAT, \n\t"First" BIGINT, \n\t"Protocol" TEXT, \n\t"Group ID" BIGINT, \n\t"Port" TEXT, \n\t"VPort" BIGINT, \n\t"IP TOS" TEXT, \n\t"VLAN ID" FLOAT, \n\t"VLAN Pri" FLOAT, \n\t"MPLS Exp" FLOAT, \n\t"Application" TEXT, \n\t"Packets" BIGINT, \n\t"Messages" BIGINT, \n\t"Bytes" BIGINT, \n\t"Last" BIGINT, \n\t"LearnedIPs alertable" FLOAT, \n\t"LearnedIPs learned-ip" FLOAT, \n\t"LearnedIPs new-ips" FLOAT, \n\t"LearnedIPs subnet-name" FLOAT, \n\t"LearnedIPs timestamp-sec" FLOAT, \n\t"LearnedIPs total-ips" FLOAT, \n\t"SrcSubnet" TEXT, \n\t"DstSubnet" TEXT\n)\n\n']
(scientific_python_2_7) [lpuggini@machinelearn-1 Kerberos]$
此外,如果之前我没有收到错误:
df.to_sql(name='FlowsKerbsMerged', con=engine, index=False, if_exists='append', index_label='First')
我重命名了两列,即
df = df.rename(columns={'Application': 'Application2', 'Bytes': 'Bytes2'}
总之,我的df
似乎有重复的列,但似乎并非如此:
In [6]: len(df.columns)
Out[6]: 91
In [7]: len(set(df.columns))
Out[7]: 91
错误的原因是什么?
编辑:
这是列列表:
In [5]: df.columns
Out[5]:
Index([u'ClientIP', u'ConversationIdentifier', u'CorvilOrderId', u'DomainName',
u'DomainNameSalt', u'HostName', u'HostNameAugmented', u'HostNameSalt',
u'KRB AP-REQ.ap-options', u'KRB AP-REQ.msg-type', u'KRB AP-REQ.pvno',
u'KRB ETYPE-INFO2-ENTRY.etype', u'KRB ETYPE-INFO2-ENTRY.salt',
u'KRB EncryptedData.cipher', u'KRB EncryptedData.etype',
u'KRB EncryptedData.kvno', u'KRB HostAddress.addr-type',
u'KRB HostAddress.address', u'KRB KDC-REP.crealm',
u'KRB KDC-REP.msg-type', u'KRB KDC-REP.pvno',
u'KRB KDC-REQ-BODY.etype.values',
u'KRB KDC-REQ-BODY.etype.values:repeating',
u'KRB KDC-REQ-BODY.kdc-options', u'KRB KDC-REQ-BODY.nonce',
u'KRB KDC-REQ-BODY.realm', u'KRB KDC-REQ-BODY.rtime',
u'KRB KDC-REQ-BODY.till', u'KRB KDC-REQ.msg-type', u'KRB KDC-REQ.pvno',
u'KRB KERB-PA-PAC-REQUEST.include-pac', u'KRB KRB-ERROR.e-data',
u'KRB KRB-ERROR.error-code', u'KRB KRB-ERROR.error-code_string',
u'KRB KRB-ERROR.error-description_string', u'KRB KRB-ERROR.msg-type',
u'KRB KRB-ERROR.pvno', u'KRB KRB-ERROR.realm', u'KRB KRB-ERROR.stime',
u'KRB KRB-ERROR.susec', u'KRB PA-DATA.padata-type',
u'KRB PA-DATA.padata-value', u'KRB PA-ENC-TS-ENC.patimestamp',
u'KRB PA-PAC-OPTIONS.flags', u'KRB PrincipalName.name-string.values',
u'KRB PrincipalName.name-string.values:repeating',
u'KRB PrincipalName.name-type', u'KRB Ticket.realm',
u'KRB Ticket.tkt-vno', u'Msg Decoder', u'Msg ID', u'Msg Type',
u'RequestReply', u'ServerIP', u'UserName', u'UserNameAugmented',
u'UserNameSalt', u'Watchlist Match', u'application', u'bytes',
u'client_port', u'dir', u'Dst', u'DstPort', u'server_port', u'Src',
u'SrcPort', u'time', u'ts_ns', u'First', u'Protocol', u'Group ID',
u'Port', u'VPort', u'IP TOS', u'VLAN ID', u'VLAN Pri', u'MPLS Exp',
u'Application', u'Packets', u'Messages', u'Bytes', u'Last',
u'LearnedIPs alertable', u'LearnedIPs learned-ip',
u'LearnedIPs new-ips', u'LearnedIPs subnet-name',
u'LearnedIPs timestamp-sec', u'LearnedIPs total-ips', u'SrcSubnet',
u'DstSubnet'],
dtype='object')
答案 0 :(得分:2)
在列表的索引58
处,您有一个名为application
的列。
在列表的索引78
处,您有一个名为Application
的列。
在列表的索引59
处,您有一个名为bytes
的列。
在列表的索引81
处,您有一个名为Bytes
的列。
Python没有将这些视为重复,因为set()
区分大小写,例如:
set(['Foo', 'foo', 'Foo'])
#{'Foo', 'foo'}
您也可以使用以下方式检查:
len(set(map(lambda x: x.lower(), df.columns)))
#89