我需要处理某个文件夹(Folder)的各个子文件夹中的 .txt 文件,目录结构如下:
New Folder>Folder 1 to 6>xx.txt & yy.txt(files present in each folder)
每个文件包含两列:
arg his
asp gln
glu his
和
arg his
glu arg
arg his
glu asp
现在我要做的是:
1)统计每个文件中每个单词的出现次数,再把该次数除以该文件的总行数(total no. of lines in that file),得到该文件的平均值。
2)然后把同一文件夹中各个文件在第一步得到的平均值相加,再除以该文件夹中的文件总数(本例中为2),得到文件夹的平均值。
我尝试的代码如下。第一步我已经成功实现,但第二步还没有做出来。
# Walk every folder below `path` and, for each .txt file found, compute the
# average number of occurrences per line of five residue names.
#
# NOTE(review): the residue names are compared in upper case and are read from
# whitespace-separated fields 4 and 6 of each line, whereas the sample data in
# the question is lower case with two columns - confirm against the real files.
for root, dirs, files in os.walk(path):
    # Folder-level variables from the original attempt.  Nothing below adds to
    # them yet; the *_count names are overwritten by the per-file averages.
    aspCount = 0
    glu_count = 0
    lys_count = 0
    arg_count = 0
    his_count = 0
    acid_count = 0
    base_count = 0
    count = 0  # number of .txt files seen in this folder
    listOfFile = glob.iglob(os.path.join(root, '*.txt'))
    for filename in listOfFile:
        lineCount = 0
        asp_count_col1 = 0
        asp_count_col2 = 0
        glu_count_col1 = 0
        glu_count_col2 = 0
        lys_count_col1 = 0
        lys_count_col2 = 0
        arg_count_col1 = 0
        arg_count_col2 = 0
        his_count_col1 = 0
        his_count_col2 = 0
        count += 1
        # Open the file explicitly so the handle is closed after reading.
        with open(filename) as inp:
            for line in map(str.split, inp):
                # Count lines in lineCount, the divisor used further down.
                lineCount += 1
                k = line[4]
                m = line[6]
                # For each residue the second column is only looked at when
                # the first column did not match, so a line counts at most
                # once per residue.
                if k == 'ASP':
                    asp_count_col1 += 1
                elif m == 'ASP':
                    asp_count_col2 += 1
                if k == 'GLU':
                    glu_count_col1 += 1
                elif m == 'GLU':
                    glu_count_col2 += 1
                if k == 'LYS':
                    lys_count_col1 += 1
                elif m == 'LYS':
                    lys_count_col2 += 1
                if k == 'ARG':
                    arg_count_col1 += 1
                elif m == 'ARG':
                    arg_count_col2 += 1
                if k == 'HIS':
                    his_count_col1 += 1
                elif m == 'HIS':
                    his_count_col2 += 1
        if not lineCount:
            # An empty file has no per-line average; skip it rather than
            # dividing by zero.
            continue
        # Per-file averages: occurrences in either column divided by the
        # number of lines in the file.
        asp_count = (float(asp_count_col1 + asp_count_col2)) / lineCount
        glu_count = (float(glu_count_col1 + glu_count_col2)) / lineCount
        lys_count = (float(lys_count_col1 + lys_count_col2)) / lineCount
        arg_count = (float(arg_count_col1 + arg_count_col2)) / lineCount
        his_count = (float(his_count_col1 + his_count_col2)) / lineCount
到这里,我可以得到每个文件的平均值。但是怎样才能得到每个子文件夹的平均值(即除以count,也就是文件总数)呢?问题在于第二部分,第一部分已经完成。上面的代码给出的是每个文件的平均值;我想把这些平均值相加,再除以子文件夹中的文件总数,得到一个新的平均值。
答案 0 :(得分:1)
import os
from collections import *
# The only tokens a data file is allowed to contain.
aminoAcids = set('asp glu lys arg his'.split())

# Maps the full path of every .txt file below `subfolderPath` to a Counter of
# the tokens it contains.  The full path (not the bare file name) is the key,
# because files with the same name in different subfolders would otherwise
# overwrite each other's counts.
filesToCounts = {}
for root, dirs, files in os.walk(subfolderPath):
    for file in files:
        if file.endswith('.txt'):
            path = os.path.join(root, file)
            with open(path) as f:
                # Both columns are pooled: every whitespace-separated token
                # in the file is counted.
                acidsInFile = f.read().split()
            assert all(a in aminoAcids for a in acidsInFile), \
                'unexpected token in %s' % path
            filesToCounts[path] = Counter(acidsInFile)
def averageOfCounts(counts):
    """Return each token's count divided by the number of token pairs.

    ``counts`` maps a token to the number of times it occurs.  The total
    number of tokens must be even; half of that total is used as the
    divisor.  An empty mapping yields an empty dict.

    The division is forced to floating point so the result is the same on
    Python 2, where ``/`` between two ints would truncate.

    Raises:
        AssertionError: if the total number of tokens is odd.
    """
    numberOfAcids = sum(counts.values())
    assert numberOfAcids % 2 == 0, 'odd number of tokens: %d' % numberOfAcids
    numberOfAcidPairs = numberOfAcids / 2.0
    return dict(
        (acid, acidCount / numberOfAcidPairs)
        for acid, acidCount in counts.items()
    )
# One entry per key of filesToCounts, holding the result of averageOfCounts
# for that file's counts.
filesToAverages = {
    countedFile: averageOfCounts(fileCounts)
    for countedFile, fileCounts in filesToCounts.items()
}
答案 1 :(得分:0)
您把glob.iglob和os.walk一起使用是没有意义的。二者选其一即可,不要同时使用。以下是我的做法:
import os, os.path, re, pprint, sys
#...
# For every directory below `path`, pool the lines of all its .txt files and
# print how often each whitespace-separated token occurs per line.
for root, dirs, files in os.walk(path):
    counts = {}
    nlines = 0
    for f in filter(lambda n: re.search(r'\.txt$', n), files):
        # os.walk yields bare file names, so join them with the directory
        # being visited before opening.
        with open(os.path.join(root, f), 'rt') as handle:
            for l in handle:
                nlines += 1
                for k in l.split():
                    counts[k] = counts.get(k, 0) + 1
    # Turn raw counts into per-line frequencies.  With no lines read, counts
    # is empty and no division takes place.
    counts = dict((k, float(v) / nlines) for k, v in counts.items())
    sys.stdout.write('Frequencies for directory %s:\n' % root)
    pprint.pprint(counts)
答案 2 :(得分:0)
我喜欢ninjagecko的回答,但我对这个问题的理解有所不同。以他的代码为起点,我建议:
import os
from collections import *
# The only tokens a data file is allowed to contain.
aminoAcids = set('asp glu lys arg his'.split())

# Maps each directory that holds at least one .txt file to a dict of
# {token: mean over that directory's files of the per-file frequency}.
subfolderFreqs = {}
for root, dirs, files in os.walk(subfolderPath):
    cumulativeFreqs = defaultdict(int)  # sum of per-file frequencies
    fileCount = 0
    for fileName in files:
        if fileName.endswith('.txt'):
            fileCount += 1
            path = os.path.join(root, fileName)
            with open(path) as f:
                acidsInFile = f.read().split()
            counts = Counter(acidsInFile)
            assert aminoAcids.issuperset(counts)
            # Two tokens per line are assumed, so half the token count stands
            # in for the number of lines.  Dividing by 2.0 keeps this a float
            # on Python 2 and avoids a zero divisor for a one-token file.
            numberOfAcidPairs = len(acidsInFile) / 2.0
            for acid, acidCount in counts.items():
                cumulativeFreqs[acid] += float(acidCount) / numberOfAcidPairs
    if fileCount:
        # Average the summed per-file frequencies over the files counted in
        # this directory.
        subfolderFreqs[root] = {
            acid: cumulative / fileCount
            for acid, cumulative in cumulativeFreqs.items()
        }
# The parenthesised form prints the same text on Python 2 and Python 3.
print(subfolderFreqs)