我在我的代码中遇到问题我正在尝试读取一个fasta文件,即“chr1.fa”,然后我有一个看起来像这样的突变文件
#!/usr/bin/python
# Filename: mutation.py
import sys , os
import numpy as np
import re
#declaring the variables
lst = ''
chr_name = ''
first_cord = ''
second_cord = ''
lstFirstCord = []
sequence = ''
human_genome = ''
seqList = ''
# Method to read the Genome file (file contains data for only one chromosome)
def ReadgenomeCharacter():
header = ' '
seq = ' '
try:
human_genome = raw_input("Enter UCSC fasta file of human genome:")
human_seq = open(human_genome, 'rw+')
line = human_seq.readline()
except:
print 'File cannot be opened, wrong format you forgot something:', human_genome
exit()
while line:
line = line.rstrip('\n')
if '>' in line:
header = line
else:
seq = seq + line
line = human_seq.readline()
print header
print "Length of the chromosome is:",len(seq)
print "No. of N in the chromosome are:", seq.count("N")
return seq
#Method to replace the characters in sequence string
def ReplaceSequence():
seqList = list(sequence)
for index, item in enumerate(lstFirstCord):
if seqList[index] != "N":
seqList[index] = "N"
newSequence = ''.join(seqList)
return newSequence
#Method to write to genome file
def WriteToGenomeFile(newSequence):
try:
with open("chr1.fa", 'rw+') as f:
old_human_seq = f.read()
f.seek(0)
f.write(newSequence)
print "Data modified in the genome file"
print "Length of the new chromosome is:",len(newSequence)
print "No. of N in the new chromosome are:", newSequence.count("N")
except:
print 'File cannot be opened, wrong format you forgot something:', human_genome
exit()
sequence = ReadgenomeCharacter()
print "Here is my mutaiton file data"
data = np.genfromtxt("CLL608.txt",delimiter ="\t", dtype=None,skip_header=0) #Reading the mutation file CLL608.txt
#Storing the mutation file data in a dictionary
subarray = ({'Chr1' : data[data[:,0] == 'chr1'],'Chr2': data[data[:,0] == 'chr2'],'Chr3': data[data[:,0] == 'chr3'],
'Chr4': data[data[:,0] == 'chr4'], 'Chr5': data[data[:,0] == 'chr5'],'Chr6': data[data[:,0] == 'chr6'],
'Chr7': data[data[:,0] == 'chr7'], 'Chr8': data[data[:,0] == 'chr8'],'Chr9': data[data[:,0] == 'chr9'],
'Chr10': data[data[:,0] == 'chr10'] , 'Chr11': data[data[:,0] == 'chr11'],'Chr12': data[data[:,0] == 'chr12'],
'Chr13': data[data[:,0] == 'chr13'], 'Chr14': data[data[:,0] == 'chr14'],'Chr15': data[data[:,0] == 'chr15'],
'Chr16': data[data[:,0] == 'chr16'],'Chr17': data[data[:,0] == 'chr17'],'Chr18': data[data[:,0] == 'chr18'],
'Chr19': data[data[:,0] == 'chr19'], 'Chr20': data[data[:,0] == 'chr20'],'Chr21': data[data[:,0] == 'chr21'],
'Chr22': data[data[:,0] == 'chr22'], 'ChrX': data[data[:,0] == 'chrX']})
#For each element in the dictionary, fetch the first cord and pass this value to the method to replace the character on first chord with N in the genome file
for the_key, the_value in subarray.iteritems():
cnt = len(the_value)
for lst in the_value:
chr_name = lst[0]
first_cord = int(lst[1])
second_cord = int(lst[2])
lstFirstCord.append(first_cord)
#Call the method to replace the sequence
newSeq = ReplaceSequence()
print "length :", len(newSeq)
#Call the method to write new data to genome file
WriteToGenomeFile(newSeq)
`
这是一个制表符分隔文件,其中chr1为第一列,+为最后一列。我想在chr1.fa文件中使用第二列中的位置插入一个N.我的代码看起来像这样
Enter UCSC fasta file of human genome:chr1.fa
chr1
Length of the chromosome is: 249250622
No. of N in the chromosome are: 23970000
Here is my mutaiton file data
length : 249250622
File cannot be opened, wrong format you forgot something:
我正在获得这样的输出
rsync -avzP
rsync://hgdownload.cse.ucsc.edu/goldenPath/hg19/chromosomes/chr1.fa.gz .
我们可以直接输入以下命令来下载chr1.fa
HTML
不知何故,我无法在序列中插入N,也无法对新序列进行分析。 如果有任何改进代码的宝贵建议,我将很高兴:)
答案 0 :(得分:0)
为了让您的生活更轻松,您可能需要考虑使用Biopython阅读您的法令并进行转换。
以下是一些帮助您入门的文档http://biopython.org/DIST/docs/tutorial/Tutorial.html#htoc16
这是一些初学者代码。
from Bio import SeqIO
handle = open("example.fasta", "rU")
output_handle = open("output.fasta", "w")
for record in SeqIO.parse(handle, "fasta"):
print record.seq
handle.close()
output_handle.close()
答案 1 :(得分:0)
您可能在查找文件目录和打开文件时遇到一些问题。话虽这么说,一旦你掌握了文件数据,你的工作就相对容易了。您想要读取fasta文件,删除标题并将其转换为列表,然后只需将变异文件中的索引替换为" N"并重新创建fasta。以下是步骤:
from collections import defaultdict
chromosome = input("what chromosome are you editing? ")
# have all your file paths in order
mutations = path/to/mutations/file
fasta = path/to/fasta/file
newfile = path/to/new/file
# (1) get the mutations out of the mutations file into a list for each chrom
mutdict = defaultdict(list)
with open(mutations, "r") as f1:
muts = f1.readlines() # read all lines into list
muts = [(x[0], int(x[1])) for x in muts] # get the two columns you want
# (2) convert these to a dict
for (ch, pos) in muts:
mutdict[ch].append(pos)
# (3) get your fasta and convert it to a list
with open(fasta, "r") as f2:
header = f2.readline() # the first line is a header like ">chr1"
bases = f2.read().replace("\n", "") # read all the bases and remove "\n"
bases = list(bases) # turn the string into a list
# (4) now you loop through your mutations and change them to N in the fasta list
for mut in mutdict[chromosome]:
bases[mut] = "N"
# (5) re-write the Fasta:
new_fasta = header
new_fasta = "\n".join("".join(bases[i:i + 50]) for i in xrange(len(bases)))
with open(newfile, "w") as out:
out.write(new_fasta)