I need to write a shell script that splits every CSV file that uses \n as the line separator. The limit for each output file is a number of words, and I must not cut a line in half.
Example:
sh SliceByWords.sh 1000 .
This should slice every file into 1000-word parts and put each part into a subfolder.
function has_number_number_of_words {
    # Abort unless the first argument is a positive integer (the word limit)
    re='^[0-9]+$'
    if ! [[ $1 =~ $re ]] ; then
        echo "error: Not a number, please run the command with the number of words per file" >&2; exit 1
    fi
}
#MAIN
has_number_number_of_words "$1"
declare -i WORDLIMIT=$1            # number of words allowed in each output part
subdir="Result"
mkdir -p "$subdir"
format=*.csv
# Temporarily replace spaces in file names with ___ so the unquoted expansions below survive
for name in $format; do mv "$name" "${name// /___}"; done
for i in $format;
do
    if [[ "$i" == "$format" ]]     # the glob did not match anything
    then
        echo "No Files"
    else
        (
            locali=$(echo "$i" | awk '{gsub(/ /,"\\ ");print}')   # attempt to escape spaces (not used below)
            localword=$i
            FILENAMEWITHOUTEXTENSION="${localword%.*}"
            subnoext="$subdir/$FILENAMEWITHOUTEXTENSION"
            echo Processing file "$FILENAMEWITHOUTEXTENSION"
            # Accumulate the field count per line; once it exceeds the limit,
            # close the current part and start the next one
            awk -v NOEXT="$subnoext" -v wl="$WORDLIMIT" -F" " 'BEGIN{fn=1}{c+=NF}{sv=NOEXT"_snd_"fn".csv";print $0>sv;}c>wl{c=0;++fn;close(sv);}' "$localword"
        ) &
    fi
done
wait    # wait for all background split jobs to finish
for name in $format; do mv "$name" "${name//___/ }"; done
echo All files done.
Since I couldn't figure out how to feed file names containing spaces to awk, I'm using
for name in $format; do mv "$name" "${name//___/ }"; done
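(Not part of the original script, just a hedged sketch: quoting every expansion is usually enough to get file names with spaces into awk, so the ___ renaming could be dropped. WORDLIMIT, the _snd_ part naming, and the Result folder are taken from the script above; the rest is an assumption, not a tested drop-in.)

#!/bin/bash
# Sketch only: the same per-file split as above, but with quoted expansions
# so spaces in file names reach awk intact.
WORDLIMIT=${1:-1000}                  # word limit per part; defaults to 1000 for the example
subdir="Result"
mkdir -p "$subdir"
shopt -s nullglob                     # *.csv expands to nothing when no CSV files exist
for i in *.csv; do
  noext="${i%.*}"
  awk -v NOEXT="$subdir/$noext" -v wl="$WORDLIMIT" -F" " \
    'BEGIN{fn=1}{c+=NF; sv=NOEXT"_snd_"fn".csv"; print $0>sv} c>wl{c=0; ++fn; close(sv)}' "$i" &
done
wait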
Answer 0 (score: 3)
I think this would be a lot easier to handle with awk:
awk -F" " 'BEGIN{filenumber=1}{counter+=NF}{print $0 > FILENAME"_part_"filenumber} counter>1000{counter=0;++filenumber}' yourinputfile
Here:
- awk splits every line into fields with -F" "
- BEGIN{filenumber=1} sets the filenumber variable to 1
- {counter+=NF} grows the counter by the number of fields in the line
- the built-in FILENAME variable carries the name of yourinputfile, so {print $0 > FILENAME"_part_"filenumber} writes each line to the current part
- counter>1000{counter=0;++filenumber} resets the counter and increments filenumber by 1 once the 1000-word limit is reached
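A minimal usage sketch, my own addition rather than part of the answer: run the one-liner over every CSV in the current directory and collect the parts in a subfolder. The 1000-word limit and the _part_ naming come from the answer; the Result folder name mirrors the question's script.

#!/bin/bash
# Sketch: apply the answer's one-liner to each CSV and gather the parts.
mkdir -p Result
shopt -s nullglob                     # skip the loop entirely when there are no CSV files
for f in *.csv; do
  awk -F" " 'BEGIN{filenumber=1}{counter+=NF}{print $0 > FILENAME"_part_"filenumber} counter>1000{counter=0;++filenumber}' "$f"
  mv "$f"_part_* Result/ 2>/dev/null  # an empty input produces no parts, hence the stderr guard
done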