我有一个csv格式的“大”文件(159 G),我已经使用Sqlite3和C插入到数据库表中。现在,实际的csv文件有1,232,799,308行(10亿+)。但是只插入了(123,254,689)。我有三个问题:
1)如何将插入的数据与csv文件中的实际数据进行比较?我的意思是我想知道感兴趣的数据是来自一个块(即从第1行到第100,000行)还是随机的?
2)根据采样率(平均值,标准差和误差幅度),有没有办法知道插入的行是否代表整个总体?
3)代码中是否存在仅插入10%行的问题!?
提前感谢帮助人员:)
代码在C:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <sqlite3.h>
#define INPUTDATA "/nobackup/DSS/clusterdata-2011-1/working_data/task_usage.csv"
#define DATABASE "/nobackup/DSS/clusterdata-2011-1/working_data/google.db"
#define TABLE "CREATE TABLE IF NOT EXISTS task_usageBS(startendtime TEXT, jobID TEXT, taskindex TEXT, machineID TEXT, CPUusage TEXT, MEMusage TEXT, assignedMEM TEXT, unmappedpagecacheMEMusage TEXT, pagecacheMEMusage TEXT, maxMEMusage TEXT, diskIOtimeMEAN TEXT, localdiskspaceusedMEAN TEXT, CPUusagemax TEXT, diskIOtimemax TEXT, cyclesperinstructionCPI TEXT, MEMaccessesperinstructionMAI TEXT, samplingrate TEXT, aggregationtype TEXT, PRIMARY KEY (startendtime, jobID, taskindex), FOREIGN KEY(jobID) REFERENCES job_events(jobID), FOREIGN KEY(taskindex) REFERENCES task_events(taskindex), FOREIGN KEY(machineID) REFERENCES machine_events(machineID))"
#define BUFFER_SIZE 1024
int main(int argc, char **argv) {
sqlite3 * db;
sqlite3_stmt * stmt;
char * sErrMsg = 0;
const char * tail = 0;
int nRetCode;
int n = 0;
clock_t cStartClock;
FILE * pFile;
char sInputBuf [BUFFER_SIZE] = "\0";
char * sTS = 0; /* startendtime */
char * sJI = 0; /* jobID */
char * sTI = 0; /* taskindex */
char * sMI = 0; /* machineid */
char * sCUM = 0; /* CPUusageMEAN */
char * sMU = 0; /* MEMusage */
char * sAM = 0; /* assignedMEM */
char * sUPCMU = 0; /* unmappedpagecacheMEMusage */
char * sPCMU = 0; /* pagecacheMEMusage */
char * sMMU = 0; /* maxMEMusage */
char * sDIOTM = 0; /* diskIOtimeMEAN */
char * sLDSUM = 0; /* localdiskspaceusedMEAN */
char * sCUMAX = 0; /* CPUusagemax */
char * sDIOTMAX = 0; /* diskIOtimemax */
char * sCPI = 0; /* cyclesperinstructionCPI */
char * sMAPI = 0; /* MEMaccessesperinstructionMAI */
char * sSR = 0; /* samplingrate */
char * sAT = 0; /* aggregationtype */
char sSQL [BUFFER_SIZE] = "\0";
/*********************************************/
/* Open the Database and create the Schema */
sqlite3_open(DATABASE, &db);
sqlite3_exec(db, TABLE, NULL, NULL, &sErrMsg);
/*********************************************/
/* Open input file and import into Database*/
cStartClock = clock();
sprintf(sSQL, "INSERT INTO task_usageBS VALUES (@TS, @JI, @TI, @MI, @CUM, @MU, @AM, @UPCMU, @PCMU, @MMU, @DIOTM, @LDSUM, @CUMAX, @DIOTMAX, @CPI, @MAPI, @SR, @AT)");
sqlite3_prepare_v2(db, sSQL, BUFFER_SIZE, &stmt, &tail);
//sqlite3_exec(db, sSQL, NULL, NULL, &sErrMsg);
sqlite3_exec(db, "BEGIN TRANSACTION", NULL, NULL, &sErrMsg);
pFile = fopen (INPUTDATA,"r");
while (!feof(pFile)) {
fgets (sInputBuf, BUFFER_SIZE, pFile);
sTS = strtok (sInputBuf, ",");
sJI = strtok (NULL, ",");
sTI = strtok (NULL, ",");
sMI = strtok (NULL, ",");
sCUM = strtok (NULL, ",");
sMU = strtok (NULL, ",");
sAM = strtok (NULL, ",");
sUPCMU = strtok (NULL, ",");
sPCMU = strtok (NULL, ",");
sMMU = strtok (NULL, ",");
sDIOTM = strtok (NULL, ",");
sLDSUM = strtok (NULL, ",");
sCUMAX = strtok (NULL, ",");
sDIOTMAX = strtok (NULL, ",");
sCPI = strtok (NULL, ",");
sMAPI = strtok (NULL, ",");
sSR = strtok (NULL, ",");
sAT = strtok (NULL, ",");
sqlite3_bind_text(stmt, 1, sTS, -1, SQLITE_TRANSIENT);
sqlite3_bind_text(stmt, 2, sJI, -1, SQLITE_TRANSIENT);
sqlite3_bind_text(stmt, 3, sTI, -1, SQLITE_TRANSIENT);
sqlite3_bind_text(stmt, 4, sMI, -1, SQLITE_TRANSIENT);
sqlite3_bind_text(stmt, 5, sCUM, -1, SQLITE_TRANSIENT);
sqlite3_bind_text(stmt, 6, sMU, -1, SQLITE_TRANSIENT);
sqlite3_bind_text(stmt, 7, sAM, -1, SQLITE_TRANSIENT);
sqlite3_bind_text(stmt, 8, sUPCMU, -1, SQLITE_TRANSIENT);
sqlite3_bind_text(stmt, 9, sPCMU, -1, SQLITE_TRANSIENT);
sqlite3_bind_text(stmt, 10, sMMU, -1, SQLITE_TRANSIENT);
sqlite3_bind_text(stmt, 11, sDIOTM, -1, SQLITE_TRANSIENT);
sqlite3_bind_text(stmt, 12, sLDSUM, -1, SQLITE_TRANSIENT);
sqlite3_bind_text(stmt, 13, sCUMAX, -1, SQLITE_TRANSIENT);
sqlite3_bind_text(stmt, 14, sDIOTMAX, -1, SQLITE_TRANSIENT);
sqlite3_bind_text(stmt, 15, sCPI, -1, SQLITE_TRANSIENT);
sqlite3_bind_text(stmt, 16, sMAPI, -1, SQLITE_TRANSIENT);
sqlite3_bind_text(stmt, 17, sSR, -1, SQLITE_TRANSIENT);
sqlite3_bind_text(stmt, 18, sAT, -1, SQLITE_TRANSIENT);
sqlite3_step(stmt);
sqlite3_clear_bindings(stmt);
sqlite3_reset(stmt);
n++;
}
fclose (pFile);
sqlite3_exec(db, "END TRANSACTION", NULL, NULL, &sErrMsg);
printf("Imported %d records in %4.2f seconds\n", n, (clock() - cStartClock) / (double)CLOCKS_PER_SEC);
sqlite3_finalize(stmt);
sqlite3_close(db);
return 0;
}
答案 0 :(得分:0)
如何在sqlite3中使用.import导入数据。
sqlite3可以从CSV文件中导入数据,假设条件很少,例如。数字字段中的小数点分隔符与您的区域设置匹配,文本编码也与您的区域设置匹配。当然,目标表中的列数应与CVS文件中的字段数相匹配。
如果符合这些条件,您只需按以下方式调用:
sqlite3 db.sqlite
.import data.csv my_table
db.sqlite是您的数据库文件,其中包含所有必需的架构, data.csv是包含源数据的CSV文件,my_table是您的目标表。