将大量数据导入Firebase数据库的正确方法是什么?

时间:2016-07-25 18:48:47

标签: python json csv firebase-realtime-database ijson

我正在处理一个政治竞选捐款的数据集,转换后得到一个约 500 MB 的 JSON 文件(原始 CSV 为 124 MB)。文件太大,无法通过 Firebase 网页界面导入(我试过,结果 Google Chrome 的标签页崩溃了)。于是我尝试手动逐条上传 CSV 中的记录(使用 CSVtoJSON 转换器把每一行转换成一个 JSON 对象,再将该对象上传到 Firebase)。

这是我使用的代码。

// Streams ./vUPLOAD_MASTER.csv through csvtojson and writes every parsed row
// to Firebase under "reports<section>/<Report_ID>", starting a new section
// after every ROWS_PER_SECTION rows and stopping after MAX_SECTIONS sections.
var firebase = require('firebase');
var Converter = require("csvtojson").Converter;

firebase.initializeApp({
  serviceAccount: "./credentials.json",
  databaseURL: "url went here"
});

// NOTE(review): constructResult:false presumably stops the converter from
// accumulating every parsed row in memory -- confirm against csvtojson docs.
var converter = new Converter({
  constructResult: false,
  workerNum: 4
});

var rootRef = firebase.database().ref("/");

var ROWS_PER_SECTION = 1000;
var MAX_SECTIONS = 100;

var lastRowIndex = 0;   // index of the most recent row handed to Firebase
var rowsInSection = 0;  // rows written to the current section so far
var sectionNumber = 0;  // suffix of the current "reports<N>" node

/**
 * Handles one parsed CSV row: queues a write for it and advances the
 * row/section counters.
 *
 * @param {Object} row - parsed row; its Report_ID is used as the child key
 * @param {*} rawRow - unparsed row as supplied by the converter (unused)
 * @param {number} rowIndex - index of the row as supplied by the converter
 */
function onRecordParsed(row, rawRow, rowIndex) {
    if (!(rowIndex >= 0)) {
        console.log("we out of indices");
        process.exit();
    } else {
        // The write is fired without waiting for any acknowledgement.
        rootRef.child("reports" + sectionNumber).child(row.Report_ID).set(row);
        console.log("Report uploaded, count at " + rowsInSection + ", section at " + sectionNumber);

        rowsInSection += 1;
        lastRowIndex = rowIndex;

        if (rowsInSection >= ROWS_PER_SECTION) {
            rowsInSection = 0;
            sectionNumber += 1;
        }

        if (sectionNumber >= MAX_SECTIONS) {
            // NOTE(review): exiting here does not wait for queued writes;
            // presumably some of them may never reach the server -- verify.
            console.log("last completed index: " + lastRowIndex);
            process.exit();
        }
    }
}

converter.on("record_parsed", onRecordParsed);

require("fs").createReadStream("./vUPLOAD_MASTER.csv").pipe(converter);

然而,这种做法遇到了内存问题,无法处理完整个数据集。分块上传也不可行,因为 Firebase 控制台不会显示所有已上传的数据,我无法知道上传到了哪里。(在 Chrome 中打开 Firebase 数据库时,我能看到数据陆续写入,但标签页最终会崩溃,重新加载后很多后面上传的数据都不见了。)

然后我尝试使用Firebase Streaming Import,但是会引发此错误:

started at 1469471482.77
Traceback (most recent call last):
  File "import.py", line 90, in <module>
    main(argParser.parse_args())
  File "import.py", line 20, in main
    for prefix, event, value in parser:
  File "R:\Python27\lib\site-packages\ijson\common.py", line 65, in parse
    for event, value in basic_events:
  File "R:\Python27\lib\site-packages\ijson\backends\python.py", line 185, in basic_parse
    for value in parse_value(lexer):
  File "R:\Python27\lib\site-packages\ijson\backends\python.py", line 127, in parse_value
    raise UnexpectedSymbol(symbol, pos)
ijson.backends.python.UnexpectedSymbol: Unexpected symbol u'\ufeff' at 0

搜索最后一行(来自 ijson 的错误)后,我找到了 this SO thread,但我不确定该如何利用它让 Firebase Streaming Import 正常工作。

我用 Vim 从要上传的 JSON 文件中删除了字节顺序标记(BOM),现在导入程序运行一分钟左右后会出现以下错误:

Traceback (most recent call last):
  File "import.py", line 90, in <module>
    main(argParser.parse_args())
  File "import.py", line 20, in main
    for prefix, event, value in parser:
  File "R:\Python27\lib\site-packages\ijson\common.py", line 65, in parse
    for event, value in basic_events:
  File "R:\Python27\lib\site-packages\ijson\backends\python.py", line 185, in basic_parse
    for value in parse_value(lexer):
  File "R:\Python27\lib\site-packages\ijson\backends\python.py", line 116, in parse_value
    for event in parse_array(lexer):
  File "R:\Python27\lib\site-packages\ijson\backends\python.py", line 138, in parse_array
    for event in parse_value(lexer, symbol, pos):
  File "R:\Python27\lib\site-packages\ijson\backends\python.py", line 119, in parse_value
    for event in parse_object(lexer):
  File "R:\Python27\lib\site-packages\ijson\backends\python.py", line 170, in parse_object
    pos, symbol = next(lexer)
  File "R:\Python27\lib\site-packages\ijson\backends\python.py", line 51, in Lexer
    buf += data
MemoryError

Firebase Streaming Importer 应该能够处理超过 250 MB 的文件,而且我相当确定我的内存足以处理这个文件。为什么会出现这个错误?有什么想法吗?

如果看到我尝试使用Firebase Streaming Import上传的实际JSON文件会有所帮助,here it is

1 个答案:

答案 0 :(得分:0)

我解决了这个问题,放弃了Firebase Streaming Import并编写了我自己的工具,使用csvtojson转换CSV,然后使用Firebase Node API一次上传一个对象。

这是脚本:

// Reads test.csv line by line, converts each line to an object with
// csvtojson (by prepending the fixed column header), and writes the object to
// Firebase under /reports/<Report_ID>.
//
// Every write is tracked until its completion callback fires, so the script
// can report failed writes and announce when all of them have been
// acknowledged and it is safe to stop the process.
var firebase = require("firebase");
firebase.initializeApp({
  serviceAccount: "./credentials.json",
  databaseURL: "https://necir-hackathon.firebaseio.com/"
});

var db = firebase.database();
var ref = db.ref("/reports");
var fs = require('fs');
var Converter = require("csvtojson").Converter;
var header = "Report_ID,Status,CPF_ID,Filing_ID,Report_Type_ID,Report_Type_Description,Amendment,Amendment_Reason,Amendment_To_Report_ID,Amended_By_Report_ID,Filing_Date,Reporting_Period,Report_Year,Beginning_Date,Ending_Date,Beginning_Balance,Receipts,Subtotal,Expenditures,Ending_Balance,Inkinds,Receipts_Unitemized,Receipts_Itemized,Expenditures_Unitemized,Expenditures_Itemized,Inkinds_Unitemized,Inkinds_Itemized,Liabilities,Savings_Total,Report_Month,UI,Reimbursee,Candidate_First_Name,Candidate_Last_Name,Full_Name,Full_Name_Reverse,Bank_Name,District_Code,Office,District,Comm_Name,Report_Candidate_First_Name,Report_Candidate_Last_Name,Report_Office_District,Report_Comm_Name,Report_Bank_Name,Report_Candidate_Address,Report_Candidate_City,Report_Candidate_State,Report_Candidate_Zip,Report_Treasurer_First_Name,Report_Treasurer_Last_Name,Report_Comm_Address,Report_Comm_City,Report_Comm_State,Report_Comm_Zip,Category,Candidate_Clarification,Rec_Count,Exp_Count,Inkind_Count,Liab_Count,R1_Count,CPF9_Count,SV1_Count,Asset_Count,Savings_Account_Count,R1_Item_Count,CPF9_Item_Count,SV1_Item_Count,Filing_Mechanism,Also_Dissolution,Segregated_Account_Type,Municipality_Code,Current_Report_ID,Location,Individual_Or_Organization,Notable_Contributor,Currently_Accessed";

var count = 0;         // number of objects handed to Firebase so far
var failedWrites = 0;  // writes whose completion callback reported an error
var pending = 0;       // lines read but not yet fully processed
var inputDone = false; // set once the line reader has emitted 'close'

var lineReader = require('readline').createInterface({
  input: fs.createReadStream('test.csv')
});

// Prints a summary once the input is exhausted and no line is still in flight.
function reportIfDone() {
    if (inputDone && pending === 0) {
        console.log("All writes acknowledged: " + count + " sent, " + failedWrites + " failed. Safe to exit.");
    }
}

// Marks one line as fully processed (converted and written, or logged as an error).
function finishOne() {
    pending -= 1;
    reportIfDone();
}

lineReader.on('line', function (line) {
    // A blank line would produce no parsed row to upload; skip it.
    if (line.trim() === "") {
        return;
    }
    pending += 1;

    // Backslash-escape single quotes before handing the line to the converter.
    var escapedLine = line.replace(/'/g, "\\'");
    var csvString = header + '\n' + escapedLine;
    var converter = new Converter({});
    converter.fromString(csvString, function (err, result) {
        if (err) {
            var errstring = err + "\n";
            fs.appendFile('converter_error_log.txt', errstring, function (appendErr) {
                if (appendErr) {
                    console.log("Converter: Append Log File Error Below:");
                    console.error(appendErr);
                    process.exit(1);
                } else {
                    console.log("Converter Error Saved");
                    finishOne();
                }
            });
            return;
        }

        // Guard against a conversion that yields no row instead of
        // dereferencing result[0] blindly.
        if (!result || result.length === 0) {
            finishOne();
            return;
        }

        var report = result[0];
        report.Location = "";
        report.Individual_Or_Organization = "";
        report.Notable_Contributor = "";
        report.Currently_Accessed = "";

        count += 1;
        var sentNumber = count;
        // The completion callback is expected to fire once the server has
        // processed the write (see the Firebase Reference.set documentation);
        // a non-null argument means the write failed.
        ref.child(report.Report_ID).set(report, function (writeErr) {
            if (writeErr) {
                failedWrites += 1;
                console.error("Write #" + sentNumber + " failed:");
                console.error(writeErr);
            }
            finishOne();
        });
        console.log("Sent #" + count);
    });
});

lineReader.on('close', function () {
    inputDone = true;
    reportIfDone();
});

唯一需要注意的是,虽然脚本可以很快发送完所有对象,但 Firebase 显然需要在保存数据期间保持连接:如果在所有对象发送完后立即关闭脚本,会有很多对象不会出现在数据库中。(为保险起见我等了 20 分钟,但实际所需时间可能更短。)