我正在尝试将一个小数据帧(dfSmall)与一个无法整个读入内存的巨大数据帧(dfLarge)合并。它们都太大了,无法在这里完整贴出,但大致如下:
dfSmall = pd.read_table('small.csv', dtype='str', header=None, skiprows=1, names=['ix', '#CHROM', 'POS', 'ID', 'sample', 'allele', 'pop', 'superpop'])
def merge_it(c):
    """Join one chunk of the large table onto dfSmall.

    Merges on the '#CHROM' and 'POS' columns using pandas' default join type
    for ``DataFrame.merge`` (inner), so only rows whose key pair is present in
    both frames are kept. Where both frames have a non-key column of the same
    name, the dfSmall column keeps the plain name and the chunk's column gets
    the '_y' suffix. The result is then restricted to the columns listed in
    ``header_line``.

    Relies on the module-level names ``dfSmall`` and ``header_line``.

    :param c: one chunk (DataFrame) yielded by the chunked reader.
    :return: the merged rows for this chunk, with columns ``header_line``.
    """
    return dfSmall.merge(c, on=['#CHROM', 'POS'], suffixes=('', '_y'))[header_line]
# Read the large gzip-compressed table 40000 rows at a time (skipping the first
# 251 lines and forcing every column to str), merge each chunk with dfSmall via
# merge_it, and concatenate the per-chunk results into one frame.
# NOTE(review): `large.vcf.gz` is written without quotes, so as shown it is an
# attribute lookup on an undefined name `large` — presumably a placeholder for
# the real path string; confirm.
dfFull = pd.concat([merge_it(c) for c in pd.read_table(large.vcf.gz, header = None, names = header_line, dtype='str', engine = 'c',compression = 'gzip', skiprows=251, chunksize=40000, low_memory=False)])
# Capture the "chr<digits>" label that follows "ALL" plus one arbitrary
# character (the '.' in the pattern is unescaped, so it matches any character).
# NOTE(review): `chromosome` is not defined in the visible snippet — presumably
# the input file name; confirm. If the pattern does not match, `match` is None
# and the next line raises AttributeError.
match = re.search(r'ALL.(chr\d+)', chromosome)
# Write the merged frame to "<captured label>.csv".
dfFull.to_csv(r"{}.csv".format(match.group(1)))
以上是我的代码,其中 header_line = ['#CHROM','POS','ID','REF','ALT','QUAL','FILTER',..., 2500 strings],即 dfLarge 的列名:
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HG00096 HG00097 HG00099 HG00100 HG00101 HG00102 ...
当我运行它时,我没有错误,但我的输出文件只是标题:
#CHROM
我手动检查了一些条目,因此我知道这两个文件中确实有一些行在 #CHROM 和 POS 列上看起来是匹配的。
我认为输出文件只包含标题的问题可能是因为列的数据类型不匹配,这就是我明确设置 dtype='str' 的原因。但是,检查 dfLarge 的 dtypes 得到的是 dtype('O'),而不是 str(这里关心的是用于合并的 #CHROM/POS 列)
。他们可能在let fs = require("fs"),
express = require("express"),
_ = require("underscore"),
User = require("./models/user"),
path = require("path");
/**
 * Record the currently logged-in user in the most recently modified results file.
 *
 * Lists the results directory, picks the newest regular file with
 * getMostRecentFile, parses it as JSON, sets its `currentuser` property to
 * `req.user` and writes the file back. Everything runs asynchronously and
 * nothing is returned; every failure is logged with console.error and stops
 * the operation without touching the file.
 *
 * @param {object} [req] - Object whose `user` property is stored in the file
 *   (presumably the Express request of the logged-in session — confirm against
 *   the caller). The original body read `req` without declaring it anywhere in
 *   view, so it is now an explicit, optional parameter.
 */
let getFileAddUser = (req) => {
  const filePath = '../automation_projects/wss-automation-u/results/temp/';

  // Without a request there is no user to record; bail out before any I/O.
  if (!req) {
    console.error(new Error('getFileAddUser: no request object supplied'));
    return;
  }

  fs.readdir(filePath, (err, files) => {
    if (err) {
      // Throwing inside an async callback cannot be caught by the caller and
      // would take the process down; log and stop like the handlers below.
      console.error(err);
      return;
    }

    let file;
    try {
      // getMostRecentFile stats every entry synchronously and may throw,
      // e.g. if an entry disappears between readdir and stat.
      file = getMostRecentFile(files, filePath);
    } catch (statError) {
      console.error(statError);
      return;
    }
    console.log(file);

    // "" means the directory holds no regular file; reading filePath itself
    // would only fail with EISDIR.
    if (file === '') {
      console.error(new Error('getFileAddUser: no file found in ' + filePath));
      return;
    }

    fs.readFile(filePath + file, 'utf8', (err, data) => {
      // Check the read error first: on failure `data` is undefined and must
      // not be handed to JSON.parse.
      if (err) {
        console.error(err);
        return;
      }

      let json;
      try {
        json = JSON.parse(data);
      } catch (parseError) {
        console.error(parseError);
        return;
      }
      // A property can only be attached to an object (or array) payload.
      if (json === null || typeof json !== 'object') {
        console.error(new Error('getFileAddUser: ' + file + ' does not contain a JSON object'));
        return;
      }

      // Attach the logged-in user and persist the updated document.
      json.currentuser = req.user;
      fs.writeFile(filePath + file, JSON.stringify(json), (error) => {
        if (error) {
          console.error(error);
          return;
        }
        console.log(json);
      });

      // Logs the file content as it was read, before the rewrite above lands.
      console.log(data);
    });
  });
};
/**
 * Find the most recently modified regular file among the given entries.
 *
 * Each entry is stat'ed synchronously at `path + "/" + entry`; anything that
 * is not a regular file (e.g. a directory) is ignored. When several files
 * share the newest modification time, the one listed first wins.
 *
 * @param {string[]} files - Entry names relative to `path`.
 * @param {string} path - Directory containing the entries.
 * @returns {string} Name of the newest file, or "" when there is none.
 */
function getMostRecentFile(files, path) {
  let newest = null;

  for (const name of files) {
    const info = fs.statSync(path + "/" + name);
    if (!info.isFile()) {
      continue;
    }

    const modified = info.mtime.getTime();
    // Strictly greater keeps the earliest-listed file on equal timestamps.
    if (newest === null || modified > newest.mtime) {
      newest = {"file": name, "mtime": modified};
    }
  }

  return newest === null ? "" : newest.file;
}
module.exports = getFileAddUser;
列上不匹配,因为dtypes不同吗?如果这不是问题,还有其他想法吗?
答案 0 :(得分:0)
我认为您的问题来自解析文件的方式 - dfSmall中包含逗号。这是我删除逗号后得到的内容:
df_m = pd.merge(dfSmall, dfLarge, on=['POS', 'CHROM'], how='inner')
dfSmall
Out[100]:
CHROM POS sample allele pop super pop.1
0 1 1121557 rs112904239 HG00096 T GBR EUR
1 1 1213223 rs113095492 HG00096 T GBR EUR
2 1 1000894 rs114006445 HG00096 T GBR EUR
dfLarge
Out[102]:
CHROM POS ID REF ALT QUAL FILTER
0 1 14719 rs527865771 C A 100 PASS
1 1 14728 rs547701710 C A 100 PASS
2 1 1213223 rs113095492 A G 100 PASS
df_m
Out[103]:
CHROM POS sample allele pop super pop.1 ID REF ALT \
0 1 1213223 rs113095492 HG00096 T GBR EUR rs113095492 A G
QUAL FILTER
0 100 PASS