Question

您好，我有两个文件

文件1：

chr5 20311169 20311244 5 20311177 20311251 K00230:40:HNWJLBBXX:4:1101:1002:35936 255 + - 20311210.00
chr5 26610220 26610295 5 26610221 26610296 K00230:40:HNWJLBBXX:4:1101:1022:24155 255 + - 26610258.00

文件2：

chr5     20311200    20311220   Nucleosome:1    110    5.0    39.9    MainPeak    1.43492858    0.68583064
chr5    801    861    Nucleosome:2    70    1.0    5.4    MainPeak    0.17076187    0.806538035
chr5    1021    1091    Nucleosome:3    80    2.0    14.4    MainPeak    0.42430331    0.481579895
chr5    1181    1251    Nucleosome:4    80    1.0    7.5    MainPeak    0.1362587    0.32626102999999995
chr5    1361    1441    Nucleosome:5    90    2.0    14.7    MainPeak    0.34212933    0.291726595
chr5    1621    1801    Nucleosome:6    190    2.0    26.1    MainPeak:doublet    0.37546564    0.353192625
chr5    2011    2071    Nucleosome:7    70    1.0    5.7    MainPeak    0.15091517    0.396369735
chr5    2161    2331    Nucleosome:8    180    1.0    17.2    MainPeak    0.08865312    0.42133046500000004
chr5    2441    2561    Nucleosome:9    130    2.5    25.3    MainPeak    0.7368501    0.48843276
chr5    2781    2851    Nucleosome:10    80    3.0    17.5    MainPeak    0.80818501    1.303005
chr5    3271    3431    Nucleosome:11    170    3.0    34.5    MainPeak+Shoulder    0.72967697    1.348257495
chr5    3521    3571    Nucleosome:12    60    1.0    5.8    MainPeak    0.1880739    0.504429705
chr5    3641    3791    Nucleosome:13    160    1.0    12.5    MainPeak:doublet    0.10098579    0.363148215

如果文件1中第11列的值落在第二个文件给出的起始和结束位置（第2列和第3列）的范围内，我希望用python代码把文件1中的对应行打印出来。由于该位置仅在特定染色体（chr）内才是唯一的，因此还必须检查chr是否相同……因此，我想要的输出是

chr5 20311169 20311244 5 20311177 20311251 K00230:40:HNWJLBBXX:4:1101:1002:35936 255 + - 20311210.00

我已经尝试过awk代码。它能正常工作，但是非常慢！

我正在处理的文件（即需要从中打印行的那个文件）大约有4 GB。

如果能提供一些python代码，我将不胜感激

谢谢！

Answer 1

用一个简单的函数从文本行中提取第N列，整个过程就相当简单了。我假设您说的“第11列”是指从1开始计数的第11列，而不是从0开始索引时下标为11的那一列。

伪代码：

Until there's no data left ~
    Read line1 from file1
    Read line2 from file2
    Extract Col11 from line1 as a real number
    Extract Col2 & Col3 from line2 as real numbers
    IF Col11 is within Col2 & Col3
        do something

Python代码：

import sys

def getNthColumn(row: str, N: int) -> float:
    """Return the Nth whitespace-separated column of *row* as a float.

    Columns are counted from 1. Runs of spaces and tabs count as a single
    separator, and leading/trailing whitespace (including the newline) is
    ignored.

    Raises:
        IndexError: if the row has fewer than N columns.
        ValueError: if the Nth column is not a valid number.
    """
    # str.split() with no argument already collapses tabs and repeated
    # spaces, so no separate single-spacing pass is needed.
    fields = row.split()
    return float(fields[N - 1])   # columns are 1-based, list indices 0-based

# Script body: walk file1 and file2 in lockstep (row i against row i) and
# report whether column 11 of the file1 row lies within columns 2..3 of the
# file2 row. Only those three columns are read.
if len(sys.argv) == 3:
    # TODO - handle file-not-found errors, etc.
    # The with-statement closes both files even if a row fails to parse.
    with open(sys.argv[1], "rt") as fin1, open(sys.argv[2], "rt") as fin2:
        # zip() stops as soon as either file runs out of lines.
        for line1, line2 in zip(fin1, fin2):
            # Get the columns from the two lines
            f1_col11 = getNthColumn(line1, 11)
            f2_col2 = getNthColumn(line2, 2)
            f2_col3 = getNthColumn(line2, 3)  # TODO handle errors
            # work out if it's a keeper (both range ends are inclusive)
            if f2_col2 <= f1_col11 <= f2_col3:
                print("MATCH: "+line1)
            else:
                print("NO-MATCH: "+line1)
else:
    print("Give 2 files as arguments")

说实话，如果速度真的很关键，那么最好用编译型语言（例如C/C++/Pascal等）来编写。

编辑：经过测试，可以正常工作，并添加了一些调试用的print()语句

EDIT2：针对file2中的所有行搜索file1行

import sys

# Holds one (column 2, column 3) tuple of floats per file2 row; it is filled
# in and then sorted by the script body further down.
file2_col23 = []

def getNthColumn(row: str, N: int) -> float:
    """Return the Nth whitespace-separated column of *row* as a float.

    Columns are counted from 1. Runs of spaces and tabs count as a single
    separator, and leading/trailing whitespace is ignored.

    If the row has fewer than N columns, or the Nth column is not a valid
    number, an error message is written to stderr and the program exits
    with status 1 (SystemExit).
    """
    # str.split() with no argument already collapses tabs and repeated
    # spaces, so no separate single-spacing pass is needed.
    fields = row.split()
    try:
        return float(fields[N - 1])   # columns are 1-based, indices 0-based
    except (IndexError, ValueError):
        # Catch only the two expected failures so that KeyboardInterrupt
        # and other unrelated errors are not swallowed. The row is reported
        # single-spaced.
        sys.stderr.write("Failed to fetch number column %d from [%s]\n"
                         % (N, ' '.join(fields)))
        sys.exit(1)

# Script body: print every file1 row, tagged MATCH when its column 11 lies
# inside any [column 2, column 3] range read from file2, NO-MATCH otherwise.
# NOTE: only columns 2, 3 and 11 are read; column 1 (the chromosome) is not
# compared here.
if len(sys.argv) == 3:
    # TODO - handle file-not-found errors, etc.
    # The with-statement closes both files even if a row fails to parse.
    with open(sys.argv[1], "rt") as fin1, open(sys.argv[2], "rt") as fin2:
        # Load in the whole of file2, but just the column2 & column3
        for line2 in fin2:
            # Skip blank lines instead of treating one as end-of-file,
            # which would silently drop every range after it.
            if not line2.strip():
                continue
            file2_col23.append((getNthColumn(line2, 2),
                                getNthColumn(line2, 3)))

        # Sort by column 2 so the search below can stop early.
        file2_col23.sort()

        # Note the min c2 and max c3 so we can quickly know if a search can
        # possibly produce a result. With no ranges at all nothing can
        # match, and the bounds are never consulted.
        have_ranges = bool(file2_col23)
        if have_ranges:
            min_c2 = file2_col23[0][0]
            max_c3 = max(col3 for _, col3 in file2_col23)

        for line1 in fin1:
            col11 = getNthColumn(line1, 11)

            matched = False
            # is col11 within any file2 row's col2..col3 (inclusive)?
            if have_ranges and min_c2 <= col11 <= max_c3:
                for col2, col3 in file2_col23:
                    if col2 > col11:
                        # Ranges are sorted by col2, so every later range
                        # starts after col11 as well.
                        break
                    if col11 <= col3:
                        matched = True
                        break

            if matched:
                print("MATCH: "+str(line1))
            else:
                print("NO-MATCH: "+str(line1))
else:
    print("Give 2 files as arguments")

如果第1列匹配，则提取第11列的值位于第二个文件第2列和第3列之间的行

1 个答案: