组织不一致的值

时间:2013-08-22 20:29:25

标签: sorting csv text awk

不确定这个问题是否适合在这里问,因为它不算编程问题,但我不知道还能去哪里问:我想以一致的方式整理下面的数据。目前它一团糟,只有前两列(逗号分隔)是一致的。其余的列可能有1到9个,而且各行通常不同。换句话说,我想把它整理成同类文本对齐的形式(所有value值排成一列,所有recoil值排成一列,依此类推)。这样我就可以删掉这些文字并加上表头,数据仍然有意义。

bm_wp_upg_o_t1micro, sight, value = 3, zoom = 3, recoil = 1, spread_moving = -1 
bm_wp_upg_o_marksmansight_rear, sight, value = 3, zoom = 1, recoil = 1, spread = 1 
bm_wp_upg_o_marksmansight_front, extra, value = 1 
bm_wp_m4_upper_reciever_edge, upper_reciever, value = 3, recoil = 1 
bm_wp_m4_upper_reciever_round, upper_reciever, value = 1 
bm_wp_m4_uupg_b_long, barrel, value = 4, damage = 1, spread = 1, spread_moving = -2, concealment = -2 

任何建议(哪怕只是告诉我应该去哪里问这个问题)都非常感谢。背景说明:这只是我从游戏文件中提取出来、正在尝试整理的原始数据。

3 个答案:

答案 0 :(得分:1)

恐怕正则表达式帮不了你太多,因为你的输入很不规则(正则可以匹配它,但要把所有内容排列整齐会非常麻烦)。这用任何编程语言都能轻松完成,不过对于这类问题,我总是使用awk。

假设您的输入位于名为input.txt的文件中,请将以下内容放入名为parse.awk的程序中:

# parse.awk - turn "id, type, key = value, ..." lines into a fixed-width,
# comma-separated table with one column per known key.
#
# Input : one record per line; fields separated by commas (optional spaces).
#         Field 1 is the id. Every later field is either a bare word
#         (e.g. "sight"), which is shown as an "x" flag in the column of that
#         name, or a "key = value" pair, whose value is shown in the column
#         of that key.
# Output: a header line followed by one line per non-blank input record.
#
# Only POSIX awk features are used, so this runs on gawk, mawk and BSD awk.
BEGIN {
    FS = " *, *";
    formatStr = "%32s,%8s,%8s,%8s,%10s,%16s,%8s,%18s,%10s,%10s,%16s,%16s\n";
    printf( formatStr, "id", "sight", "value", "zoom", "recoil", "spread", "extra", "upper_receiver", "barrel", "damage", "spread_moving", "concealment" );
}

{
    # Strip leading/trailing blanks from the record; assigning to $0
    # re-splits the fields, so the last field carries no trailing space.
    gsub( /^[ \t]+|[ \t]+$/, "" );
    if( NF == 0 )
        next;                       # ignore blank lines

    split( "", a );                 # portable way to empty the array
    for( i = 2; i <= NF; i++ ) {
        count = split( $(i), kvp, " *= *" );
        key = kvp[1];
        # The sample data spells this key "upper_reciever"; map it onto the
        # column name used in the header so the flag is not lost.
        if( key == "upper_reciever" )
            key = "upper_receiver";
        if( count == 1 ) {
            a[key] = "x";           # bare word: present/absent flag
        } else {
            a[key] = kvp[2];        # key = value pair
        }
    }
    printf( formatStr, $1, a["sight"], a["value"], a["zoom"], a["recoil"],
        a["spread"], a["extra"], a["upper_receiver"],
        a["barrel"], a["damage"], a["spread_moving"], a["concealment"] );
}

对它运行awk:

awk -f parse.awk input.txt

获得你的输出:

                              id,   sight,   value,    zoom,    recoil,          spread,   extra,    upper_receiver,    barrel,    damage,   spread_moving,     concealment
             bm_wp_upg_o_t1micro,       x,       3,       3,         1,                ,        ,                  ,          ,          ,              -1,
  bm_wp_upg_o_marksmansight_rear,       x,       3,       1,         1,               1,        ,                  ,          ,          ,                ,
 bm_wp_upg_o_marksmansight_front,        ,       1,        ,          ,                ,       x,                  ,          ,          ,                ,
    bm_wp_m4_upper_reciever_edge,        ,       3,        ,         1,                ,        ,                 x,          ,          ,                ,
   bm_wp_m4_upper_reciever_round,        ,       1,        ,          ,                ,        ,                 x,          ,          ,                ,
            bm_wp_m4_uupg_b_long,        ,       4,        ,          ,               1,        ,                  ,         x,         1,              -2,              -2

请注意,对于sight这类只表示“有/无”的字段,我选择只用'x'来标记。你可以换成任何你想要的标记。

如果您使用的是Linux或Macintosh,则应该有awk可用。如果您使用的是Windows,则必须安装它。

答案 1 :(得分:1)

我确实制作了另一个awk版本。我认为这应该更容易阅读。 从文件中读取所有值/列,使其尽可能动态。

awk -F, '
    # Main rule, run for every input line: remember the type (field 2) under
    # the id (field 1) and record each key=value pair from fields 3..NF.
    {
    ID[$1] = $2
    for (n = 3; n <= NF; n++)
        {
        cell = $n
        gsub(/ +/, "", cell)                # drop the blanks around key, "=" and value
        split(cell, kv, "=")                # kv[1] is the key, kv[2] is the value
        COLUMN[kv[1]]++                     # every key seen becomes an output column
        DATA[$1 " " kv[1]] = kv[2]          # cell value, indexed by "id key"
        }
    }
END {
    # Header line: the "info" label followed by one heading per column.
    printf "%49s   ", "info"
    for (col in COLUMN)
        printf "%15s", col
    print ""

    # One output line per id. The +0 forces a number, so a key that is
    # absent for an id is shown as 0.
    for (id in ID)
        {
        printf "%32s %16s ", id, ID[id]
        for (col in COLUMN)
            printf "%14s ", DATA[id " " col] + 0
        print ""
        }
    }' file

输出

                                             info            spread         recoil           zoom    concealment  spread_moving         damage          value
   bm_wp_m4_upper_reciever_round   upper_reciever              0              0              0              0              0              0              1
            bm_wp_m4_uupg_b_long           barrel              1              0              0             -2             -2              1              4
  bm_wp_upg_o_marksmansight_rear            sight              1              1              1              0              0              0              3
 bm_wp_upg_o_marksmansight_front            extra              0              0              0              0              0              0              1
    bm_wp_m4_upper_reciever_edge   upper_reciever              0              1              0              0              0              0              3
             bm_wp_upg_o_t1micro            sight              0              1              3              0             -1              0              3

答案 2 :(得分:0)

请采用Ethan的答案——我写这个只是为了自娱自乐。(是的,这说明我这个人很奇怪!)

awk脚本

awk 'BEGIN  {
                # Builds a fixed-width, comma-separated table from lines of
                # the form "id, type, key = value, key = value, ...".
                # Columns are discovered from the data, in order of first
                # appearance. c is a 0-based column number:
                #   f_idx[key]   column number assigned to a key
                #   f_name[c]    column heading
                #   f_width[c]   width of the widest cell (or the heading)
                #   f_fmt[c]     printf format for the column
                #   line[r,c]    cell for data row r (1..rows), column c
                FS = " *, *"; n = 2; rows = 0
                f_name[0] = "id";   f_width[0] = length(f_name[0])
                f_name[1] = "type"; f_width[1] = length(f_name[1])
            }
            {
                # Trim the record first; assigning to $0 re-splits the
                # fields, so trailing blanks never inflate a column width.
                gsub(/^[ \t]+|[ \t]+$/, "")
                if (NF == 0)
                    next        # a blank line must not become an empty row

                # Rows are counted separately from NR so that skipped
                # lines leave no gaps in the output.
                rows++
                line[rows,0] = $1
                len = length($1)
                if (len > f_width[0])
                    f_width[0] = len
                line[rows,1] = $2
                len = length($2)
                if (len > f_width[1])
                    f_width[1] = len
                for (i = 3; i <= NF; i++)
                {
                    # Accept "key = value" as well as "key=value".
                    split($i, fv, " *= *")
                    if (!(fv[1] in f_idx))
                    {
                        # First time this key is seen: give it the next
                        # column; the heading sets the initial width.
                        f_idx[fv[1]] = n
                        f_name[n] = fv[1]
                        f_width[n] = length(fv[1])
                        n++
                    }
                    c = f_idx[fv[1]]
                    gsub(/ /, "", fv[2])
                    len = length(fv[2])
                    if (len > f_width[c])
                        f_width[c] = len
                    line[rows,c] = fv[2]
                }
            }
     END    {
                # Each format takes two arguments: the separator ("" for the
                # first column, "," afterwards) and the right-aligned cell.
                for (j = 0; j < n; j++)
                    f_fmt[j] = "%s%" f_width[j] "s"

                # Header line.
                pad = ""
                for (j = 0; j < n; j++)
                {
                    printf f_fmt[j], pad, f_name[j]
                    pad = ","
                }
                printf "\n"

                # Data lines; a key missing from a row prints as blanks.
                for (i = 1; i <= rows; i++)
                {
                    pad = ""
                    for (j = 0; j < n; j++)
                    {
                        printf f_fmt[j], pad, line[i,j]
                        pad = ","
                    }
                    printf "\n"
                }
            }' data

此脚本会适应它在文件中找到的数据。它将列标题“id”分配给输入的第1列,并将“type”分配给第2列。对于第3..N列中的每组值,它将数据拆分为键(在fv[1]中)和值(在fv[2]中)。如果之前未见过该键,则为其分配一个新的列号,将该键存储为列名,并将键的长度存储为初始列宽。然后将该值存储在该行的相应列中。

当读取所有数据时,脚本知道列标题将是什么。然后它可以创建一组格式字符串。然后它打印标题和所有数据行。如果您不想要固定宽度输出,那么您可以大大简化脚本。可以对此脚本进行一些(主要是次要的)简化。

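数据文件

bm_wp_upg_o_t1micro, sight, value = 3, zoom = 3, recoil = 1, spread_moving = -1
bm_wp_upg_o_marksmansight_rear, sight, value = 3, zoom = 1, recoil = 1, spread = 1
bm_wp_upg_o_marksmansight_front, extra, value = 1
bm_wp_m4_upper_receiver_edge, upper_receiver, value = 3, recoil = 1
bm_wp_m4_upper_receiver_round, upper_receiver, value = 1
bm_wp_m4_uupg_b_long, barrel, value = 4, damage = 1, spread = 1, spread_moving = -2, concealment = -2

输出

                             id,          type,value,zoom,recoil,spread_moving,spread,damage,concealment
            bm_wp_upg_o_t1micro,         sight,    3,   3,     1,           -1,      ,      ,
 bm_wp_upg_o_marksmansight_rear,         sight,    3,   1,     1,             ,     1,      ,
bm_wp_upg_o_marksmansight_front,         extra,    1,    ,      ,             ,      ,      ,
   bm_wp_m4_upper_receiver_edge,upper_receiver,    3,    ,     1,             ,      ,      ,
  bm_wp_m4_upper_receiver_round,upper_receiver,    1,    ,      ,             ,      ,      ,
           bm_wp_m4_uupg_b_long,        barrel,    4,    ,      ,           -2,     1,     1,         -2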