我有一些像这样格式化的大型(50k行)文件
chr1 35276 35481 NR_026820_exon_1_0_chr1_35277_r 0 - 0.526829 0.473171 54 37 60 54 0 0 205
chr1 35720 36081 NR_026818_exon_2_0_chr1_35721_r 0 - 0.398892 0.601108 73 116 101 71 0 0 361
chr1 35720 36081 NR_026820_exon_2_0_chr1_35721_r 0 - 0.398892 0.601108 73 116 101 71 0 0 361
chr1 69090 70008 NM_001005484_exon_0_0_chr1_69091_f 0 + 0.571895 0.428105 212 218 175 313 0 0 918
chr1 134772 139696 NR_039983_exon_0_0_chr1_134773_r 0 - 0.366775 0.633225 997 1194 1924 809 0 0 4924
chr1 139789 139847 NR_039983_exon_1_0_chr1_139790_r 0 - 0.551724 0.448276 13 12 14 19 0 0 58
chr1 140074 140566 NR_039983_exon_2_0_chr1_140075_r 0 - 0.475610 0.524390 126 144 114 108 0 0 492
chr1 323891 324060 NR_028322_exon_0_0_chr1_323892_f 0 + 0.426035 0.573964 37 41 56 35 0 0 169
chr1 323891 324060 NR_028325_exon_0_0_chr1_323892_f 0 + 0.426035 0.573964 37 41 56 35 0 0 169
chr1 323891 324060 NR_028327_exon_0_0_chr1_323892_f 0 + 0.426035 0.573964 37 41 56 35 0 0 169
chr1 324287 324345 NR_028322_exon_1_0_chr1_324288_f 0 + 0.551724 0.448276 19 15 11 13 0 0 58
chr1 324287 324345 NR_028325_exon_1_0_chr1_324288_f 0 + 0.551724 0.448276 19 15 11 13 0 0 58
chr1 324287 324345 NR_028327_exon_1_0_chr1_324288_f 0 + 0.551724 0.448276 19 15 11 13 0 0 58
chr1 324438 326938 NR_028327_exon_2_0_chr1_324439_f 0 + 0.375200 0.624800 400 1013 549 538 0 0 2500
chr1 324438 328581 NR_028322_exon_2_0_chr1_324439_f 0 + 0.378228 0.621772 678 1580 996 889 0 0 4143
chr1 324438 328581 NR_028325_exon_2_0_chr1_324439_f 0 + 0.378228 0.621772 678 1580 996 889 0 0 4143
第2和第3列是开始和结束位置。我想删除所有具有相同开始和结束位置的行(其余列的内容不重要),并且只保留第一次出现的那一行。例如,在样本数据中我想同时保留第14行和第15行,因为即使它们的起始位置相同,结束位置也不同;而第15行和第16行的开始和结束位置都相同,所以我想删除第16行。我试图用 Python 实现,但我不知道如何处理基于两列的唯一性要求。
有关最佳/最简单方法的任何想法吗?
答案 0 :(得分:2)
您可以使用 pandas 加载文件,然后根据这两列删除重复的行(下面是一个简化示例):
import pandas as pd

# Minimal reproduction of the file's layout: the two columns that matter
# for deduplication (start/end) plus a few filler columns.
raw_data = {'firstcolumn': ['chr1', 'chr1', 'chr1'],
            'start_position': [35276, 35720, 35720],
            'end_position': [35481, 36081, 36081],
            'A': [4, 4, 31],
            'B': [25, 25, 57]}
df = pd.DataFrame(raw_data, columns=['firstcolumn', 'start_position', 'end_position', 'A', 'B'])
# BUG FIX: drop_duplicates returns a NEW DataFrame; the original snippet
# discarded the result, leaving df unchanged.  Keep only the first
# occurrence of each (start, end) pair.
df = df.drop_duplicates(['start_position', 'end_position'])
答案 1 :(得分:1)
考虑使用优秀的pandas库加载和修改此类数据:
data_string = """
chr1 35276 35481 NR_026820_exon_1_0_chr1_35277_r 0 - 0.526829 0.473171 54 37 60 54 0 0 205
chr1 35720 36081 NR_026818_exon_2_0_chr1_35721_r 0 - 0.398892 0.601108 73 116 101 71 0 0 361
chr1 35720 36081 NR_026820_exon_2_0_chr1_35721_r 0 - 0.398892 0.601108 73 116 101 71 0 0 361
chr1 69090 70008 NM_001005484_exon_0_0_chr1_69091_f 0 + 0.571895 0.428105 212 218 175 313 0 0 918
chr1 134772 139696 NR_039983_exon_0_0_chr1_134773_r 0 - 0.366775 0.633225 997 1194 1924 809 0 0 4924
chr1 139789 139847 NR_039983_exon_1_0_chr1_139790_r 0 - 0.551724 0.448276 13 12 14 19 0 0 58
chr1 140074 140566 NR_039983_exon_2_0_chr1_140075_r 0 - 0.475610 0.524390 126 144 114 108 0 0 492
chr1 323891 324060 NR_028322_exon_0_0_chr1_323892_f 0 + 0.426035 0.573964 37 41 56 35 0 0 169
chr1 323891 324060 NR_028325_exon_0_0_chr1_323892_f 0 + 0.426035 0.573964 37 41 56 35 0 0 169
chr1 323891 324060 NR_028327_exon_0_0_chr1_323892_f 0 + 0.426035 0.573964 37 41 56 35 0 0 169
chr1 324287 324345 NR_028322_exon_1_0_chr1_324288_f 0 + 0.551724 0.448276 19 15 11 13 0 0 58
chr1 324287 324345 NR_028325_exon_1_0_chr1_324288_f 0 + 0.551724 0.448276 19 15 11 13 0 0 58
chr1 324287 324345 NR_028327_exon_1_0_chr1_324288_f 0 + 0.551724 0.448276 19 15 11 13 0 0 58
chr1 324438 326938 NR_028327_exon_2_0_chr1_324439_f 0 + 0.375200 0.624800 400 1013 549 538 0 0 2500
chr1 324438 328581 NR_028322_exon_2_0_chr1_324439_f 0 + 0.378228 0.621772 678 1580 996 889 0 0 4143
chr1 324438 328581 NR_028325_exon_2_0_chr1_324439_f 0 + 0.378228 0.621772 678 1580 996 889 0 0 4143
"""
# this looks suspiciously csv-like
import pandas
import io

# Py3 FIX: the Python 2 `StringIO` module no longer exists; io.StringIO
# is the modern equivalent.
buf = io.StringIO(data_string)
# this will create a DataFrame object with integer headers: 0, 1, 2, ...
# if you have the file path, you can use that instead of the StringIO buffer.
# (`delim_whitespace=True` is deprecated in modern pandas; `sep=r"\s+"`
# is the equivalent spelling.)
df = pandas.read_csv(buf, sep=r"\s+", header=None)
>>> print df
0 1 2 3 4 5 6 \
0 chr1 35276 35481 NR_026820_exon_1_0_chr1_35277_r 0 - 0.526829
1 chr1 35720 36081 NR_026818_exon_2_0_chr1_35721_r 0 - 0.398892
2 chr1 35720 36081 NR_026820_exon_2_0_chr1_35721_r 0 - 0.398892
3 chr1 69090 70008 NM_001005484_exon_0_0_chr1_69091_f 0 + 0.571895
4 chr1 134772 139696 NR_039983_exon_0_0_chr1_134773_r 0 - 0.366775
5 chr1 139789 139847 NR_039983_exon_1_0_chr1_139790_r 0 - 0.551724
6 chr1 140074 140566 NR_039983_exon_2_0_chr1_140075_r 0 - 0.475610
7 chr1 323891 324060 NR_028322_exon_0_0_chr1_323892_f 0 + 0.426035
8 chr1 323891 324060 NR_028325_exon_0_0_chr1_323892_f 0 + 0.426035
9 chr1 323891 324060 NR_028327_exon_0_0_chr1_323892_f 0 + 0.426035
10 chr1 324287 324345 NR_028322_exon_1_0_chr1_324288_f 0 + 0.551724
11 chr1 324287 324345 NR_028325_exon_1_0_chr1_324288_f 0 + 0.551724
12 chr1 324287 324345 NR_028327_exon_1_0_chr1_324288_f 0 + 0.551724
13 chr1 324438 326938 NR_028327_exon_2_0_chr1_324439_f 0 + 0.375200
14 chr1 324438 328581 NR_028322_exon_2_0_chr1_324439_f 0 + 0.378228
15 chr1 324438 328581 NR_028325_exon_2_0_chr1_324439_f 0 + 0.378228
# ... more data skipped...
现在,这非常简单:
# Keep only the first row seen for each distinct pair of values in
# columns 1 and 2 (the start/end positions).
no_dups = df.drop_duplicates(subset=[1, 2])
>>> print no_dups
0 1 2 3 4 5 6 \
0 chr1 35276 35481 NR_026820_exon_1_0_chr1_35277_r 0 - 0.526829
1 chr1 35720 36081 NR_026818_exon_2_0_chr1_35721_r 0 - 0.398892
3 chr1 69090 70008 NM_001005484_exon_0_0_chr1_69091_f 0 + 0.571895
4 chr1 134772 139696 NR_039983_exon_0_0_chr1_134773_r 0 - 0.366775
5 chr1 139789 139847 NR_039983_exon_1_0_chr1_139790_r 0 - 0.551724
6 chr1 140074 140566 NR_039983_exon_2_0_chr1_140075_r 0 - 0.475610
7 chr1 323891 324060 NR_028322_exon_0_0_chr1_323892_f 0 + 0.426035
10 chr1 324287 324345 NR_028322_exon_1_0_chr1_324288_f 0 + 0.551724
13 chr1 324438 326938 NR_028327_exon_2_0_chr1_324439_f 0 + 0.375200
14 chr1 324438 328581 NR_028322_exon_2_0_chr1_324439_f 0 + 0.378228
答案 2 :(得分:0)
你可以使用类似的东西:
# Stream the input once, remembering every (start, end) pair already
# written.  PERF FIX: the original tracked seen pairs in a *list*, making
# each membership test O(n) and the whole pass quadratic over the stated
# 50k-line file; a set gives O(1) lookups.
with open('out_lines.dat', 'w') as out_file:
    with open('in_lines.dat', 'r') as in_file:
        prev_start_end = set()
        for line in in_file:
            line_data = line.split()
            # skip blank or malformed lines: a valid row has 15 fields
            if line_data and len(line_data) == 15:
                start = line_data[1]
                end = line_data[2]
                if (start, end) not in prev_start_end:
                    out_file.write(line)
                    prev_start_end.add((start, end))
输入数据的输出将为:
chr1 35276 35481 NR_026820_exon_1_0_chr1_35277_r 0 - 0.526829 0.473171 54 37 60 54 0 0 205
chr1 35720 36081 NR_026818_exon_2_0_chr1_35721_r 0 - 0.398892 0.601108 73 116 101 71 0 0 361
chr1 69090 70008 NM_001005484_exon_0_0_chr1_69091_f 0 + 0.571895 0.428105 212 218 175 313 0 0 918
chr1 134772 139696 NR_039983_exon_0_0_chr1_134773_r 0 - 0.366775 0.633225 997 1194 1924 809 0 0 4924
chr1 139789 139847 NR_039983_exon_1_0_chr1_139790_r 0 - 0.551724 0.448276 13 12 14 19 0 0 58
chr1 140074 140566 NR_039983_exon_2_0_chr1_140075_r 0 - 0.475610 0.524390 126 144 114 108 0 0 492
chr1 323891 324060 NR_028322_exon_0_0_chr1_323892_f 0 + 0.426035 0.573964 37 41 56 35 0 0 169
chr1 324287 324345 NR_028322_exon_1_0_chr1_324288_f 0 + 0.551724 0.448276 19 15 11 13 0 0 58
chr1 324438 326938 NR_028327_exon_2_0_chr1_324439_f 0 + 0.375200 0.624800 400 1013 549 538 0 0 2500
chr1 324438 328581 NR_028322_exon_2_0_chr1_324439_f 0 + 0.378228 0.621772 678 1580 996 889 0 0 4143
答案 3 :(得分:0)
正如有人建议的那样,将"键"(起始和结束两个字段,即第2、3列)存储在一个集合中,如果该键以前见过,则跳过该行不打印:
# BUG FIX: the original referenced `s` before ever defining it, so the
# first duplicate check raised NameError.  Initialize the set up front.
s = set()
with open('datafile.tsv','r') as f:
    for line in f:
        fields = line.split('\t')
        # key on the (start, end) pair only -- fields 1 and 2
        key = tuple(fields[1:3])
        if key in s:
            continue
        s.add(key)
        # `line` keeps its trailing newline; suppress print's own newline
        # so the output is not double-spaced
        print(line, end='')
您可以将python的输出重定向到新文件。
答案 4 :(得分:0)
def unique_positions(filename):
    """Yield each line of *filename* whose (start, end) columns --
    whitespace-split fields 1 and 2 -- have not been seen before.

    Lines are yielded with their original trailing newline intact.
    """
    with open(filename) as lines:
        seen_positions = set()
        for line in lines:
            # key only on the start/end pair; other fields are irrelevant
            position = tuple(line.split()[1:3])
            if position not in seen_positions:
                seen_positions.add(position)
                yield line

# BUG FIX: the original driver had an unterminated string literal
# ('data.csv) -- a SyntaxError -- and used Py2 `print line`.  Guard it
# with __main__ so importing this module does not try to open data.csv.
if __name__ == '__main__':
    for line in unique_positions('data.csv'):
        print(line, end='')
答案 5 :(得分:0)
如果您只是想删除重复行并写入新文件,可以使用 groupby:按这两列(开始、结束)对相邻行分组,然后对每一组只取第一行——无论该组包含多行重复还是只有唯一的一行,效果都一样。这种做法几乎不占用内存:
from itertools import groupby

# Consecutive rows sharing whitespace-split fields 1 and 2 (start, end)
# form one group; writing only the first member of each group drops the
# duplicates.
# NOTE(review): groupby merges *adjacent* rows only, so this assumes
# duplicate (start, end) pairs appear consecutively, as in the sample.
def _start_end(row):
    return row.split()[1:3]

with open("in.csv") as f, open("out.csv", "w") as out:
    for _key, rows in groupby(f, key=_start_end):
        out.write(next(rows))
输出:
chr1 35276 35481 NR_026820_exon_1_0_chr1_35277_r 0 - 0.526829 0.473171 54 37 60 54 0 0 205
chr1 35720 36081 NR_026818_exon_2_0_chr1_35721_r 0 - 0.398892 0.601108 73 116 101 71 0 0 361
chr1 69090 70008 NM_001005484_exon_0_0_chr1_69091_f 0 + 0.571895 0.428105 212 218 175 313 0 0 918
chr1 134772 139696 NR_039983_exon_0_0_chr1_134773_r 0 - 0.366775 0.633225 997 1194 1924 809 0 0 4924
chr1 139789 139847 NR_039983_exon_1_0_chr1_139790_r 0 - 0.551724 0.448276 13 12 14 19 0 0 58
chr1 140074 140566 NR_039983_exon_2_0_chr1_140075_r 0 - 0.475610 0.524390 126 144 114 108 0 0 492
chr1 323891 324060 NR_028322_exon_0_0_chr1_323892_f 0 + 0.426035 0.573964 37 41 56 35 0 0 169
chr1 324287 324345 NR_028322_exon_1_0_chr1_324288_f 0 + 0.551724 0.448276 19 15 11 13 0 0 58
chr1 324438 326938 NR_028327_exon_2_0_chr1_324439_f 0 + 0.375200 0.624800 400 1013 549 538 0 0 2500
chr1 324438 328581 NR_028322_exon_2_0_chr1_324439_f 0 + 0.378228 0.621772 678 1580 996 889 0 0 4143
如果要更改原始文件,请使用带有shutil.move的临时文件:
from itertools import groupby
from tempfile import NamedTemporaryFile
from shutil import move

# Rewrite the file "in place": write the deduplicated rows to a temp file
# in the same directory, then move it over the target.
# BUG FIX: NamedTemporaryFile defaults to binary mode ("w+b"), so writing
# str lines fails with a TypeError on Python 3 -- open it in text mode.
with open("in.csv") as f, NamedTemporaryFile("w", dir=".", delete=False) as out:
    # groupby merges adjacent rows with equal (start, end) fields;
    # next(v) takes the first row of each group
    for _, v in groupby(f, key=lambda x: x.split()[1:3]):
        out.write(next(v))
move(out.name, "foo.csv")