我有以下文字
' 14411.7647 e0 - 2647.0588 e3 + 7352.9412 e12 + 14411.7647 e123 21828.2063'
' - 2647.0588 e3 + 7352.9412 e12 7814.9002'
' 14411.7647 e0 + 14411.7647 e123 20381.3131'
' 14411.7647 e0 + 14411.7647 e123 20381.3131'
' 0.0000 e0 + 0.0000 e123 1.9293e-12'
' 14411.7647'
,我想根据eXXX字词对齐表格。这可能是示例输出:
' 14411.7647 e0 - 2647.0588 e3 + 7352.9412 e12 + 14411.7647 e123 21828.2063'
' - 2647.0588 e3 + 7352.9412 e12 7814.9002'
' 14411.7647 e0 + 14411.7647 e123 20381.3131'
' 14411.7647 e0 + 14411.7647 e123 20381.3131'
' 0.0000 e0 + 0.0000 e123 1.9293e-12'
' 14411.7647'
最重要的部分是将eXXX项及其系数对齐。
更新:列最初用空格分隔。例如,输出可以用制表符分隔。
更新2:第一行决定了总列数,其他行的列数不会比第一行多。第二行及之后各行中的 eXXX 项可以与第一行相同,也可以只是其中的一部分,但不会出现第一行中没有的项,顺序也不会被打乱(即 e12 总是在 e3 之后)。
这可以使用awk或类似工具来实现吗?
答案 0 :(得分:4)
$ cat tst.awk
# tst.awk -- place each "coefficient eXXX" term of a line under the column
# that the same eXXX label occupies on the first line; output is tab-separated.
BEGIN { OFS = "\t" }

# Every record: normalise the text before any field is inspected.
{
    # Strip a single quote at the start and at the end of the record.
    gsub(/^\047|\047$/, "")
    # Glue a detached sign onto the number after it ("- 2647" -> "-2647") so
    # that each term is exactly two fields: value, then label.
    gsub(/\+ /, "+")
    gsub(/- /, "-")
}

# First record only: the labels, in order of appearance, define the columns.
# Labels sit in the even-numbered fields; a trailing value with no label is
# registered under the empty label (reading $(NF+1) yields "").
NR == 1 {
    for (pos = 2; pos <= NF + 1; pos += 2)
        fldNr2key[++numFlds] = $pos
}

# Every record: rebuild the line in first-record column order.
{
    # label -> value for the terms present on this record
    delete key2val
    for (i = 1; i <= NF; i += 2)
        key2val[$(i + 1)] = $i

    row = ""
    for (col = 1; col <= numFlds; col++) {
        label = fldNr2key[col]
        cell = ""                       # stays empty when the label is absent
        if (label in key2val) {
            suffix = (label == "") ? "" : " " label
            cell = key2val[label] suffix
            sub(/^[-+]/, "& ", cell)    # put the blank back after a leading sign
        }
        row = row (col > 1 ? OFS : "") cell
    }
    print "\047 " row "\047"
}
制表符分隔的输出:
$ awk -f tst.awk file
' 14411.7647 e0 - 2647.0588 e3 + 7352.9412 e12 + 14411.7647 e123 21828.2063'
' - 2647.0588 e3 + 7352.9412 e12 7814.9002'
' 14411.7647 e0 + 14411.7647 e123 20381.3131'
' 14411.7647 e0 + 14411.7647 e123 20381.3131'
' 0.0000 e0 + 0.0000 e123 1.9293e-12'
' 14411.7647'
可视化表格输出(或对脚本中的每个字段使用适当宽度的printfs):
$ awk -f tst.awk file | column -s$'\t' -t
' 14411.7647 e0 - 2647.0588 e3 + 7352.9412 e12 + 14411.7647 e123 21828.2063'
' - 2647.0588 e3 + 7352.9412 e12 7814.9002'
' 14411.7647 e0 + 14411.7647 e123 20381.3131'
' 14411.7647 e0 + 14411.7647 e123 20381.3131'
' 0.0000 e0 + 0.0000 e123 1.9293e-12'
' 14411.7647'
答案 1 :(得分:2)
看起来各列之间是用多个空格分隔的,因此可以尝试使用 FS = " *\047 *|  +"(即:带可选空格的单引号,或两个及以上的连续空格)。这样,每一行(以 NR==1 的第一行为基准)就可以拆分成 eXXX 列(从 $2 到 $(NF-2)),以及位于 $(NF-1) 的常规列(如果存在)。$1 和 $NF 始终为空。
$ cat t17.1.awk
# t17.1.awk -- align "coefficient eXXX" columns using the first record as the
# template.
#
# Input:  lines wrapped in single quotes whose columns are separated by two or
#         more blanks; inside a column the sign, coefficient and eXXX label
#         are separated by single blanks.
# Output: one line per record, wrapped in single quotes, with every eXXX
#         column right-aligned to the width it had in the first record and
#         followed by OFS (a tab); the trailing regular column comes last.
# NOTE:   the "*" width in printf may not be supported by every awk.
BEGIN {
    # A column separator is either the wrapping quote (with any blanks around
    # it) or a run of at least two blanks.  It must not match a single blank:
    # each field has to keep its "coefficient eXXX" text intact for the
    # split() calls and the / e[0-9]+$/ test below.  The two blanks are
    # spelled as bracket expressions so they stay visible and unambiguous.
    FS = " *\047 *|[ ][ ]+"
    OFS = "\t"
}

# First record: remember the layout every later record is printed with.
NR == 1 {
    N = NF
    # Fields 2 .. N-2 are the eXXX columns ($1 and $NF are the empty strings
    # outside the quotes, $(N-1) is the trailing regular column).
    for (i = 2; i < N - 1; i++) {
        n1 = split($i, a, " ")
        e_cols[i] = a[n1]          # last word of the column, e.g. "e12"
        e_lens[i] = length($i)     # print width for this column
    }
    # Print width of the trailing regular (non-eXXX) column.
    len_last = length($(NF - 1))
}

{
    printf "\047 "

    # Index this record's columns by their last word.  $(NF-1) is included
    # because it is an eXXX column when the regular column is missing.
    for (i = 2; i < NF; i++) {
        n1 = split($i, a, " ")
        hash[a[n1]] = $i
    }

    # Emit the eXXX columns in first-record order; a label that is absent
    # from this record yields an empty, space-padded cell.
    for (i = 2; i < N - 1; i++) {
        printf("%*s%s", e_lens[i], hash[e_cols[i]], OFS)
    }

    # Trailing regular column, or blank when the last field is an eXXX column.
    # An empty record (NF == 0) has no $(NF-1); treat it as blank instead of
    # indexing a negative field.
    last = (NF > 0) ? $(NF - 1) : ""
    if (last ~ / e[0-9]+$/)
        last = ""
    printf("%*s\047\n", len_last, last)

    # Start the next record with an empty lookup table.
    delete hash
}
运行上面的脚本,您将得到以下结果:(注意:我在输入末尾额外追加了一行,使 eXXX 列 + 14411.7647 e123 恰好位于行尾的 ' 之前)
$ awk -f t17.1.awk file.txt
' 14411.7647 e0 - 2647.0588 e3 + 7352.9412 e12 + 14411.7647 e123 21828.2063'
' - 2647.0588 e3 + 7352.9412 e12 7814.9002'
' 14411.7647 e0 + 14411.7647 e123 20381.3131'
' 14411.7647 e0 + 14411.7647 e123 20381.3131'
' 0.0000 e0 + 0.0000 e123 1.9293e-12'
' 14411.7647'
' + 14411.7647 e123 '
注意:
您可能需要 gawk 才能让 printf() 支持 "%*s";如果不起作用,请改用固定宽度,例如把 "%*s" 换成 "%18s":printf("%18s%s", hash[e_cols[i]], OFS)
eXXX 列中的某些值可能比 NR==1 时对应的值更长;要解决此问题,可以手动指定长度数组,或直接使用固定宽度。