我正在处理一个txt文件。我需要根据基因名称对数据进行分组,并统计每个基因名称下各列中有多少个非零值。
我现有的代码无法比较下划线之前的字符,因此无法判断各行是否属于同一个基因组(即同一个基因名称)。
非常感谢任何帮助或建议。
答案 0 :(得分:0)
如果您能够将整个数据集加载到内存中,最好的方法是使用字典按基因名称分组:
In [10]: import io
In [11]: from collections import defaultdict
In [12]: file = io.StringIO(s) # pretend I'm a file
In [13]: grouper = defaultdict(lambda: {'X1':[], 'X2':[], 'X3':[]})
In [14]: next(file) # skip header
Out[14]: 'Gene Name X1 X2 X3\n'
In [15]: for line in file:
...: row = line.split()
...: name, delim, seq = row[0].partition('_')
...: x1, x2, x3 = map(float, row[1:])
...: columns = grouper[name]
...: columns['X1'].append(x1)
...: columns['X2'].append(x2)
...: columns['X3'].append(x3)
...:
In [16]: grouper
Out[16]:
defaultdict(<function __main__.<lambda>>,
{'A1BG': {'X1': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
'X2': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
'X3': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]},
'A1CF': {'X1': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
'X2': [0.0, 0.0, 0.0, 0.0, 0.0, 3.2],
'X3': [0.0, 0.0, 0.0, 0.0, 0.0, 4.9]}})
然后您可以使用如下结果:
In [17]: sum(x != 0 for x in grouper['A1BG']['X1'])
Out[17]: 0
In [18]: sum(x != 0 for x in grouper['A1BG']['X2'])
Out[18]: 0
In [19]: sum(x != 0 for x in grouper['A1BG']['X3'])
Out[19]: 0
In [20]: sum(x != 0 for x in grouper['A1CF']['X1'])
Out[20]: 0
In [21]: sum(x != 0 for x in grouper['A1CF']['X2'])
Out[21]: 1
In [22]: sum(x != 0 for x in grouper['A1CF']['X3'])
Out[22]: 1
修改:如果您想使用pandas:
In [28]: import pandas as pd
In [29]: file = io.StringIO(s) # pretend I'm a file
In [30]: df = pd.read_csv(file, delim_whitespace=True, skiprows=[0], header=None, names=['Gene Name', 'X1','X2','X3'])
In [31]: df
Out[31]:
Gene Name X1 X2 X3
0 A1BG_AAGAGCGCCTCGGTCCCAGC 0 0.0 0.0
1 A1BG_CAAGAGAAAGACCACGAGCA 0 0.0 0.0
2 A1BG_CACCTTCGAGCTGCTGCGCG 0 0.0 0.0
3 A1BG_CACTGGCGCCATCGAGAGCC 0 0.0 0.0
4 A1BG_GCTCGGGCTTGTCCACAGGA 0 0.0 0.0
5 A1BG_TGGACTTCCAGCTACGGCGC 0 0.0 0.0
6 A1CF_CCAAGCTATATCCTGTGCGC 0 0.0 0.0
7 A1CF_CGTGGCTATTTGGCATACAC 0 0.0 0.0
8 A1CF_GACATGGTATTGCAGTAGAC 0 0.0 0.0
9 A1CF_GAGTCATCGAGCAGCTGCCA 0 0.0 0.0
10 A1CF_GGTATACTCTCCTTGCAGCA 0 0.0 0.0
11 A1CF_GGTGCAGCATCCCAACCAGG 0 3.2 4.9
In [32]: df['name'] = df['Gene Name'].str.extract(r'(.*)_.*')
In [33]: df
Out[33]:
Gene Name X1 X2 X3 name
0 A1BG_AAGAGCGCCTCGGTCCCAGC 0 0.0 0.0 A1BG
1 A1BG_CAAGAGAAAGACCACGAGCA 0 0.0 0.0 A1BG
2 A1BG_CACCTTCGAGCTGCTGCGCG 0 0.0 0.0 A1BG
3 A1BG_CACTGGCGCCATCGAGAGCC 0 0.0 0.0 A1BG
4 A1BG_GCTCGGGCTTGTCCACAGGA 0 0.0 0.0 A1BG
5 A1BG_TGGACTTCCAGCTACGGCGC 0 0.0 0.0 A1BG
6 A1CF_CCAAGCTATATCCTGTGCGC 0 0.0 0.0 A1CF
7 A1CF_CGTGGCTATTTGGCATACAC 0 0.0 0.0 A1CF
8 A1CF_GACATGGTATTGCAGTAGAC 0 0.0 0.0 A1CF
9 A1CF_GAGTCATCGAGCAGCTGCCA 0 0.0 0.0 A1CF
10 A1CF_GGTATACTCTCCTTGCAGCA 0 0.0 0.0 A1CF
11 A1CF_GGTGCAGCATCCCAACCAGG 0 3.2 4.9 A1CF
In [34]: template = "For gene {}, X1 count: {X1}, X2 count: {X2}, X3 count: {X3}"
...: for name, group in df.groupby('name'):
...: print(template.format(name, **group.apply(np.count_nonzero)))
...:
For gene A1BG, X1 count: 0, X2 count: 0, X3 count: 0
For gene A1CF, X1 count: 0, X2 count: 1, X3 count: 1
答案 1 :(得分:0)
您可以使用itertools模块中的groupby、ast模块中的literal_eval以及内置函数any(),例如:
from itertools import groupby
from ast import literal_eval as le


def _gene_name(row):
    """Return the gene name: the part of the first field before the first underscore."""
    return row[0].split('_')[0]


# I'm assuming your input file is called 'input.txt'
# which contains the data you gave in your question
with open('input.txt', 'r') as fp:
    # One list of whitespace-separated fields per line; blank lines are
    # dropped so they cannot break the grouping key below.
    data = [fields for fields in (line.split() for line in fp) if fields]

sub = {}
# groupby only merges adjacent rows, so the rows are sorted by the same key
# first. data[1:] skips the header row.
for k, v in groupby(sorted(data[1:], key=_gene_name), _gene_name):
    # Transpose the rows of one gene into columns; the first column (the
    # full row label) is not needed.
    # Remove the 'x3' field if you don't need their results in your code
    _, x1, x2, x3 = zip(*v)
    sub[k] = {'x1': x1, 'x2': x2, 'x3': x3}

for k in sub:
    for j in sub[k]:
        # Prints 1 if any value of the field 'x1', 'x2' or 'x3' is non-zero,
        # otherwise prints 0
        print("{}:{}: {}".format(k, j, 1 if any(le(m) for m in sub[k][j]) else 0))
输出:
A1BG:x1: 0
A1BG:x3: 0
A1BG:x2: 0
A1CF:x1: 0
A1CF:x3: 1
A1CF:x2: 1
答案 2 :(得分:0)
快速而肮脏:
>>> genes
[['A1BG_AAGAGCGCCTCGGTCCCAGC', '0', '0', '0'],
['A1BG_CAAGAGAAAGACCACGAGCA', '0', '0', '0'],
['A1BG_CACCTTCGAGCTGCTGCGCG', '0', '0', '0'],
['A1BG_CACTGGCGCCATCGAGAGCC', '0', '0', '0'],
['A1BG_GCTCGGGCTTGTCCACAGGA', '0', '0', '0'],
['A1BG_TGGACTTCCAGCTACGGCGC', '0', '0', '0'],
['A1CF_CCAAGCTATATCCTGTGCGC', '0', '0', '0'],
['A1CF_CGTGGCTATTTGGCATACAC', '0', '0', '0'],
['A1CF_GACATGGTATTGCAGTAGAC', '0', '0', '0'],
['A1CF_GAGTCATCGAGCAGCTGCCA', '0', '0', '0'],
['A1CF_GGTATACTCTCCTTGCAGCA', '0', '0', '0'],
['A1CF_GGTGCAGCATCCCAACCAGG', '0', '3.2', '4.9']]
>>> results = {}
>>> for gene in genes:
... if(gene[0][0:4] in results and (float(gene[1])!=0.0 or float(gene[2])!=0.0 or float(gene[3])!=0.0)):
... results[gene[0][0:4]]+=1
... elif(gene[0][0:4] not in results and (float(gene[1])!=0.0 or float(gene[2])!=0.0 or float(gene[3])!=0.0)):
... results[gene[0][0:4]]=1
... else:
... pass
>>> results
{'A1CF': 1}