如果我有一个文件,bigfile.csv长L + 1行(包括一个标题),我的目标是尽可能均匀地将此文件拆分为N个文件(比如100),将标题附加到每个文件中分裂。假设我知道文件中有多少行,以及它需要分成多少个文件,先验。
目前,我已经有了这段代码:
# NOTE(review): the asker's original attempt, quoted as-is — it starts a new
# file after every fixed 100 data lines, so the number of output files depends
# on the input length instead of being a fixed N (the text below discusses its
# failures); do not "fix" this line, it is the subject of the question.
awk 'NR==1 {h=$0; next} (NR-2)% 100==0 {filename = sprintf("%04d",int((NR-2)/100)+1) ".smallfile.csv"; print h >> filename} {print >> filename}' bigfile.csv
虽然此代码大部分时间都有效,但有两个失败:
我如何在AWK中执行此操作?理想情况下,按上面的例子,把6021行数据拆分为100个文件时,前21个文件各含61行,其余79个文件各含60行。
我知道可以在unix中使用split(1)以轮询(round-robin)方式拆分,但这会改变行的顺序。这些行稍后会被分发处理并重新组装,因此保持原有顺序更为理想。
答案 0(得分:3)
我担心我们不得不诉诸……数学。所幸这些数学并不复杂。我的设想如下:
# Two-pass split: pass 1 just counts the lines; pass 2 writes `files` (=100)
# near-equal chunks, prepending the saved header to each new chunk file.
# bigfile.csv is named twice on purpose — once per pass (NR==FNR is pass 1).
awk -v files=100 'NR == FNR { linecount = FNR; next } FNR == 1 { chunksize = (linecount - 1) / files; h = $0; next } FNR > nextsplit { close(filename); ++chunk; filename = sprintf("%04d.smallfile.csv", chunk); nextsplit = 1 + int(chunksize * chunk + 0.5); print h >> filename } { print >> filename }' bigfile.csv bigfile.csv
为了把文件拆分成大小均匀的块,我们必须在开始拆分之前知道它总共有多少行,所以这需要两次遍历——这就是为什么bigfile.csv
被提到了两次。
awk代码是
NR == FNR { # During the first pass: remember the line count.
linecount = FNR # I'd like to use ENDFILE for this, but mawk doesn't
# know it. Oh well.
next # Do nothing else for the first pass.
}
FNR == 1 { # first line of the second pass:
chunksize = (linecount - 1) / files # calculate chunk size (may be fractional)
h = $0 # remember header.
next # skip to data
}
FNR > nextsplit { # When we pass a split point:
close(filename) # close previous output file (if any)
++chunk # update chunk counter and build new output file name
filename = sprintf("%04d.smallfile.csv", chunk)
# and calculate the next split point. This is very simply
# chunksize * chunk rounded to the nearest integer. The rounding is important
# to ward off floating point rounding errors that might leave us with a
# one-line file at the end if 1234.9999999 were rounded down to 1234.
# offset of one because we don't count the header as part of the split data.
nextsplit = 1 + int(chunksize * chunk + 0.5)
# Then print the header to the new file
print h >> filename
}
{ print >> filename } # and print all lines to the file.
答案 1(得分:0)
# create a test bigfile: 1 header line + 6021 data lines
{ echo "header"; seq 6021; } > bigfile.csv

# strip the header into a temp file, and keep the header line for later
tmp=$(mktemp) || exit 1
sed 1d bigfile.csv > "$tmp"
header=$( sed 1q bigfile.csv )

# split the body into N files
# (bugfix: the original passed undefined $t to split instead of "$tmp",
#  so split never read the headerless temp file)
# NOTE(review): --number=r/$N distributes lines round-robin, which does NOT
# preserve the original line order; use l/$N if sequence must be kept.
N=100
split --suffix-length=${#N} --numeric-suffixes=1 --additional-suffix=.csv \
  --number=r/$N -- "$tmp" smallfile.
rm -f -- "$tmp"

# prepend the saved header to each split file (tight glob, not small*)
sed -i "1i\\
$header" smallfile.*.csv
并检查结果:
$ wc -l small*
62 smallfile.001.csv
62 smallfile.002.csv
62 smallfile.003.csv
62 smallfile.004.csv
62 smallfile.005.csv
62 smallfile.006.csv
62 smallfile.007.csv
62 smallfile.008.csv
62 smallfile.009.csv
62 smallfile.010.csv
62 smallfile.011.csv
62 smallfile.012.csv
62 smallfile.013.csv
62 smallfile.014.csv
62 smallfile.015.csv
62 smallfile.016.csv
62 smallfile.017.csv
62 smallfile.018.csv
62 smallfile.019.csv
62 smallfile.020.csv
62 smallfile.021.csv
61 smallfile.022.csv
61 smallfile.023.csv
61 smallfile.024.csv
61 smallfile.025.csv
61 smallfile.026.csv
61 smallfile.027.csv
61 smallfile.028.csv
61 smallfile.029.csv
61 smallfile.030.csv
61 smallfile.031.csv
61 smallfile.032.csv
61 smallfile.033.csv
61 smallfile.034.csv
61 smallfile.035.csv
61 smallfile.036.csv
61 smallfile.037.csv
61 smallfile.038.csv
61 smallfile.039.csv
61 smallfile.040.csv
61 smallfile.041.csv
61 smallfile.042.csv
61 smallfile.043.csv
61 smallfile.044.csv
61 smallfile.045.csv
61 smallfile.046.csv
61 smallfile.047.csv
61 smallfile.048.csv
61 smallfile.049.csv
61 smallfile.050.csv
61 smallfile.051.csv
61 smallfile.052.csv
61 smallfile.053.csv
61 smallfile.054.csv
61 smallfile.055.csv
61 smallfile.056.csv
61 smallfile.057.csv
61 smallfile.058.csv
61 smallfile.059.csv
61 smallfile.060.csv
61 smallfile.061.csv
61 smallfile.062.csv
61 smallfile.063.csv
61 smallfile.064.csv
61 smallfile.065.csv
61 smallfile.066.csv
61 smallfile.067.csv
61 smallfile.068.csv
61 smallfile.069.csv
61 smallfile.070.csv
61 smallfile.071.csv
61 smallfile.072.csv
61 smallfile.073.csv
61 smallfile.074.csv
61 smallfile.075.csv
61 smallfile.076.csv
61 smallfile.077.csv
61 smallfile.078.csv
61 smallfile.079.csv
61 smallfile.080.csv
61 smallfile.081.csv
61 smallfile.082.csv
61 smallfile.083.csv
61 smallfile.084.csv
61 smallfile.085.csv
61 smallfile.086.csv
61 smallfile.087.csv
61 smallfile.088.csv
61 smallfile.089.csv
61 smallfile.090.csv
61 smallfile.091.csv
61 smallfile.092.csv
61 smallfile.093.csv
61 smallfile.094.csv
61 smallfile.095.csv
61 smallfile.096.csv
61 smallfile.097.csv
61 smallfile.098.csv
61 smallfile.099.csv
61 smallfile.100.csv
6121 total