我是数据库方面的新手(以前通常把所有数据以CSV或HDFS文件的形式存储在本地)。我可以访问一台MS SQL Server,并且一直在尝试把一些历史数据加载到数据库中。我最终成功了,接下来想对其余数据重复同样的步骤,但在此之前,我想先确认我的做法是否足够高效。
1)我用一条查询创建表,并为每一列指定数据类型和字符长度(我在Excel中使用len()计算长度,然后加上一些余量以防万一)。
2)然后我创建了一个XML格式文件,示例如下。我在其中记录了每一列的分隔符和数据类型,以及max_length、precision/scale等信息。
<?xml version="1.0"?>
<!-- SQL Server bulk-load XML format file for a comma-delimited text file
     with 41 fields per line. -->
<BCPFORMAT xmlns="http://schemas.microsoft.com/sqlserver/2004/bulkload/format"
           xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
  <!-- RECORD: layout of one line in the data file. Every field is
       character data (CharTerm); fields 1 through 40 end with a comma
       and field 41 ends with \r\n. Only fields 7, 11 and 12 declare a
       MAX_LENGTH. -->
  <RECORD>
    <FIELD ID="1" xsi:type="CharTerm" TERMINATOR=","/>
    <FIELD ID="2" xsi:type="CharTerm" TERMINATOR=","/>
    <FIELD ID="3" xsi:type="CharTerm" TERMINATOR=","/>
    <FIELD ID="4" xsi:type="CharTerm" TERMINATOR=","/>
    <FIELD ID="5" xsi:type="CharTerm" TERMINATOR=","/>
    <FIELD ID="6" xsi:type="CharTerm" TERMINATOR=","/>
    <FIELD ID="7" xsi:type="CharTerm" TERMINATOR="," MAX_LENGTH="10"/>
    <FIELD ID="8" xsi:type="CharTerm" TERMINATOR=","/>
    <FIELD ID="9" xsi:type="CharTerm" TERMINATOR=","/>
    <FIELD ID="10" xsi:type="CharTerm" TERMINATOR=","/>
    <FIELD ID="11" xsi:type="CharTerm" TERMINATOR="," MAX_LENGTH="3"/>
    <FIELD ID="12" xsi:type="CharTerm" TERMINATOR="," MAX_LENGTH="3"/>
    <FIELD ID="13" xsi:type="CharTerm" TERMINATOR=","/>
    <FIELD ID="14" xsi:type="CharTerm" TERMINATOR=","/>
    <FIELD ID="15" xsi:type="CharTerm" TERMINATOR=","/>
    <FIELD ID="16" xsi:type="CharTerm" TERMINATOR=","/>
    <FIELD ID="17" xsi:type="CharTerm" TERMINATOR=","/>
    <FIELD ID="18" xsi:type="CharTerm" TERMINATOR=","/>
    <FIELD ID="19" xsi:type="CharTerm" TERMINATOR=","/>
    <FIELD ID="20" xsi:type="CharTerm" TERMINATOR=","/>
    <FIELD ID="21" xsi:type="CharTerm" TERMINATOR=","/>
    <FIELD ID="22" xsi:type="CharTerm" TERMINATOR=","/>
    <FIELD ID="23" xsi:type="CharTerm" TERMINATOR=","/>
    <FIELD ID="24" xsi:type="CharTerm" TERMINATOR=","/>
    <FIELD ID="25" xsi:type="CharTerm" TERMINATOR=","/>
    <FIELD ID="26" xsi:type="CharTerm" TERMINATOR=","/>
    <FIELD ID="27" xsi:type="CharTerm" TERMINATOR=","/>
    <FIELD ID="28" xsi:type="CharTerm" TERMINATOR=","/>
    <FIELD ID="29" xsi:type="CharTerm" TERMINATOR=","/>
    <FIELD ID="30" xsi:type="CharTerm" TERMINATOR=","/>
    <FIELD ID="31" xsi:type="CharTerm" TERMINATOR=","/>
    <FIELD ID="32" xsi:type="CharTerm" TERMINATOR=","/>
    <FIELD ID="33" xsi:type="CharTerm" TERMINATOR=","/>
    <FIELD ID="34" xsi:type="CharTerm" TERMINATOR=","/>
    <FIELD ID="35" xsi:type="CharTerm" TERMINATOR=","/>
    <FIELD ID="36" xsi:type="CharTerm" TERMINATOR=","/>
    <FIELD ID="37" xsi:type="CharTerm" TERMINATOR=","/>
    <FIELD ID="38" xsi:type="CharTerm" TERMINATOR=","/>
    <FIELD ID="39" xsi:type="CharTerm" TERMINATOR=","/>
    <FIELD ID="40" xsi:type="CharTerm" TERMINATOR=","/>
    <FIELD ID="41" xsi:type="CharTerm" TERMINATOR="\r\n"/>
  </RECORD>
  <!-- ROW: one COLUMN per FIELD above. SOURCE carries the ID of the
       FIELD the column is read from, NAME labels the column, and
       xsi:type gives its SQL type (decimal columns also carry
       PRECISION and SCALE). -->
  <ROW>
    <COLUMN SOURCE="1" NAME="date" xsi:type="SQLDATE"/>
    <COLUMN SOURCE="2" NAME="time" xsi:type="SQLDATETIME"/>
    <COLUMN SOURCE="3" NAME="other_date" xsi:type="SQLDATE"/>
    <COLUMN SOURCE="4" NAME="other_time" xsi:type="SQLDATETIME"/>
    <COLUMN SOURCE="5" NAME="client_id" xsi:type="SQLINT"/>
    <COLUMN SOURCE="6" NAME="location_id" xsi:type="SQLSMALLINT"/>
    <COLUMN SOURCE="7" NAME="other_id" xsi:type="SQLNVARCHAR"/>
    <COLUMN SOURCE="8" NAME="stuff_1" xsi:type="SQLNVARCHAR"/>
    <COLUMN SOURCE="9" NAME="stuff_2" xsi:type="SQLNVARCHAR"/>
    <COLUMN SOURCE="10" NAME="email" xsi:type="SQLNVARCHAR"/>
    <COLUMN SOURCE="11" NAME="something_active" xsi:type="SQLVARYCHAR"/>
    <COLUMN SOURCE="12" NAME="something_active" xsi:type="SQLVARYCHAR"/>
    <COLUMN SOURCE="13" NAME="etc" xsi:type="SQLSMALLINT"/>
    <COLUMN SOURCE="14" NAME="etc2" xsi:type="SQLDECIMAL" PRECISION="18" SCALE="2"/>
    <COLUMN SOURCE="15" NAME="filler" xsi:type="SQLNVARCHAR"/>
    <COLUMN SOURCE="16" NAME="filler" xsi:type="SQLNVARCHAR"/>
    <COLUMN SOURCE="17" NAME="fk1" xsi:type="SQLSMALLINT"/>
    <COLUMN SOURCE="18" NAME="fk2" xsi:type="SQLTINYINT"/>
    <COLUMN SOURCE="19" NAME="fk3" xsi:type="SQLTINYINT"/>
    <COLUMN SOURCE="20" NAME="fk4" xsi:type="SQLTINYINT"/>
    <COLUMN SOURCE="21" NAME="fk5" xsi:type="SQLSMALLINT"/>
    <COLUMN SOURCE="22" NAME="fk6" xsi:type="SQLTINYINT"/>
    <COLUMN SOURCE="23" NAME="fk7" xsi:type="SQLTINYINT"/>
    <COLUMN SOURCE="24" NAME="data1" xsi:type="SQLTINYINT"/>
    <COLUMN SOURCE="25" NAME="data" xsi:type="SQLTINYINT"/>
    <COLUMN SOURCE="26" NAME="data" xsi:type="SQLTINYINT"/>
    <COLUMN SOURCE="27" NAME="data" xsi:type="SQLTINYINT"/>
    <COLUMN SOURCE="28" NAME="data" xsi:type="SQLTINYINT"/>
    <COLUMN SOURCE="29" NAME="data" xsi:type="SQLTINYINT"/>
    <COLUMN SOURCE="30" NAME="data" xsi:type="SQLTINYINT"/>
    <COLUMN SOURCE="31" NAME="data" xsi:type="SQLTINYINT"/>
    <COLUMN SOURCE="32" NAME="data" xsi:type="SQLTINYINT"/>
    <COLUMN SOURCE="33" NAME="data" xsi:type="SQLTINYINT"/>
    <COLUMN SOURCE="34" NAME="data" xsi:type="SQLTINYINT"/>
    <COLUMN SOURCE="35" NAME="data" xsi:type="SQLTINYINT"/>
    <COLUMN SOURCE="36" NAME="data" xsi:type="SQLDECIMAL" PRECISION="5" SCALE="2"/>
    <COLUMN SOURCE="37" NAME="data" xsi:type="SQLTINYINT"/>
    <COLUMN SOURCE="38" NAME="data" xsi:type="SQLTINYINT"/>
    <COLUMN SOURCE="39" NAME="data" xsi:type="SQLTINYINT"/>
    <COLUMN SOURCE="40" NAME="data" xsi:type="SQLDECIMAL" PRECISION="5" SCALE="0"/>
    <COLUMN SOURCE="41" NAME="data" xsi:type="SQLTINYINT"/>
  </ROW>
</BCPFORMAT>
这种方法行之有效,导入几百万行的速度也非常快。我现在的问题是:是否需要为我要导入的每个文件都执行这些步骤(每个文件大约有100万到2000万条记录)?为每个文件分别编写一条CREATE TABLE查询和一个XML文件有点单调乏味,不过这应该只是一次性的工作。我只是想确认自己没有错过处理这一切的更高效的方法。