Question

正如标题所示，我想用 awk 在一个大型 tsv 文件中找出所有满足以下条件的行：该行至少 50% 的列的值大于给定值 x。

例如，当 x = 5 时，输入：

9    6    7    2     3
0    1    2    7     6
1    3    8    9    10

应该返回

9    6    7    2     3
1    3    8    9    10

Answer 1

awk 来救场！

# Print rows in which at least half of the fields exceed t.
# The NF guard skips blank lines; 2*c>=NF avoids dividing by NF.
$ awk -v t=5 'NF{c=0; for(i=1;i<=NF;i++) c+=($i>t); if(2*c>=NF) print}' file

9    6    7    2     3
1    3    8    9    10

Answer 2

使用Perl：（注意：下面的代码实际上是 R 代码，计算的是轨迹之间的 DTW 距离，与本问题无关，疑似抓取错误。）

# Pairwise DTW() distances (SimilarityMeasures) between the trajectories of
# every pair of trips; the result `ddist` has one row and column per trip.
library(SimilarityMeasures)

# Example data: 13 points spread over five trips (A-E).
data <- data.frame(
  IndexNo = 1:13,
  Latitude = rnorm(13, 130, 1),
  Longitude = rnorm(13, 30, 1),
  TripID = c("A", "A", "A", "B", "B", "B", "C", "C", "D", "E", "E", "E", "E")
)
LIST <- sort(unique(data$TripID))

# Build each trip's two-column (Latitude, Longitude) matrix once, instead of
# re-subsetting `data` on every iteration of the double loop below.
trajectories <- lapply(LIST, function(trip_id) {
  trip <- data[data$TripID == trip_id, ]
  cbind(trip$Latitude, trip$Longitude)
})

# Fill a preallocated base matrix; rows/columns follow the order of LIST.
dist_matrix <- matrix(0, length(LIST), length(LIST))
for (i in seq_along(LIST)) {
  for (j in seq_along(LIST)) {
    dist_matrix[i, j] <- as.numeric(DTW(trajectories[[i]], trajectories[[j]]))
  }
}

# Convert once at the end. The namespace-qualified call works even though the
# data.table package is never attached with library().
ddist <- data.table::data.table(dist_matrix)
ddist

Answer 3

对于 .tsv 输入文件，可以使用如下 awk 脚本：

#!/usr/bin/awk -f

# Print every data row in which at least half of the columns hold a value
# greater than the threshold x. Matching rows are printed unchanged.
#
# reads from stdin.
# Usage: $ ./bigcols.awk < input1.tsv
#        $ awk -v x=7 -f bigcols.awk < input1.tsv    (override the threshold)


# Run at start.
BEGIN {
        # TSV setting: the field separator is a tab.
        FS = "\t"
        # Threshold to compare each column against. Defaults to 5 unless a
        # value was already supplied on the command line with -v x=...
        if (x == "") {
                x = 5
        }
}

# main. Run for each record. This code uses newlines to denote records.
# Only rows whose first column starts like a number (optional sign, digits,
# optional decimal point) are examined; this skips headers and blank lines.
$1 ~ /^[-+]?[0-9]*\.?[0-9]/ {
        # Count the columns of this row whose numeric value exceeds x.
        big = 0
        for (i = 1; i <= NF; i++) {
                if ($i + 0 > x + 0) {
                        big++
                }
        }
        # "At least 50% of the columns" is big/NF >= 0.5, written here
        # without a division.
        if (2 * big >= NF) {
                print
        }
}

# run at end
#END { print "Stop" }

这段代码需要作为 awk 脚本来运行。我保留了带注释的脚本形式，方便您根据需要进行调整。

{{1}}

awk - 列的一半大于x的所有行

3 个答案: