我刚刚开始学习python和编程,所以这可能是一个非常幼稚的问题。但我会感激任何帮助。
以下代码可以运行,但有人告诉我,像这样使用多个输入和输出文件并不好,应该改用嵌套循环。但是,每次我尝试把这些步骤嵌套起来,最后得到的都是一个空文件。
所以我的问题是如何嵌套所有这些?
非常感谢和抱歉。
# 1) Run the Perl script that produces the input file.
# The script uses subprocess and sys, so they are imported here.
import subprocess
import sys

perl = "/usr/bin/perl"
perl_script = "geoFF.pl"
# NOTE(review): the leading space is kept from the original. With a list of
# arguments it is passed to the script verbatim as part of the option text;
# confirm geoFF.pl expects " --mount-doom-hot" rather than "--mount-doom-hot".
params = " --mount-doom-hot"
# The child's stdout is wired to this process's stdout; communicate() waits
# for the script to terminate before the rest of the pipeline continues.
pl_script = subprocess.Popen([perl, perl_script, params], stdout=sys.stdout)
pl_script.communicate()
## 2) Keep only the wanted lines from the Perl script's output.
# The input is a BIG file, so it is streamed line by line instead of being
# read into memory at once.
infile1 = "inputperl.txt"
outfile1 = "c1.txt"
# A line is kept when it contains at least one of these substrings.
words = ['Acc', 'title', 'orgn', 'date', 'GP']
# The with-block closes (and therefore flushes) both files when it ends, so
# c1.txt is complete on disk before any later step opens it for reading.
# NOTE(review): 'rU' is the Python 2 universal-newline mode; the 'U' flag was
# removed in Python 3.11, so use plain 'r' when porting this script.
with open(infile1, 'rU') as f1, open(outfile1, 'w+') as o1:
    for line in f1:
        if any(word in line for word in words):
            o1.write(line)
# From the selected lines, delete the symbols/words that are not wanted.
# The rest of each line is kept. Entries are removed in list order, so '>'
# and '<' are already gone by the time '</Item>' and '<DS>' are tried; those
# two entries can never match and are effectively covered by '/Item' and 'DS'
# leftovers being handled by the other entries.
del_list = ['>','title', 'orgn','date','<','GP','/Item','"','</Item>','<DS>','Name=','DocS','Acc']
# The with-block closes (and flushes) both files, so c2.txt is complete on
# disk before it is read again.
with open("c1.txt", 'rU') as input1, open("c2.txt", 'w') as output1:
    for line in input1:
        for word in del_list:
            line = line.replace(word, "")
        # Write once per line, after every unwanted substring has been removed.
        output1.write(line)
# The lines contain accession ids such as AB129, AB8877, AB0997, ... .
# Put a URL (on a new line) in front of each one so it becomes a hyperlink.
# Note that str.replace acts on every occurrence of the literal text 'AB',
# not only on those followed by digits.
with open("c2.txt", 'rU') as inp:
    filedata2 = inp.read()
newdata2 = filedata2.replace('AB', "\n"'http://www.whatever.com/g/qu/acc.cgi?acc=AB')
# Closing the file via the with-block flushes c3.txt to disk before it is
# read again for the email body.
with open("c3.txt", 'w') as out:
    out.write(newdata2)
# Each replaced id reads http://www.whatever.com/g/qu/acc.cgi?acc=AB(somenumber),
# for example http://www.whatever.com/g/qu/acc.cgi?acc=AB129
# and http://www.whatever.com/g/qu/acc.cgi?acc=AB8877 etc.
### Send the processed file by email.
import smtplib
# email.mime.* are the module paths available on both Python 2.5+ and
# Python 3; the older email.MIMEMultipart / email.MIMEText names do not
# exist in Python 3.
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText

fromaddr = "sender@gmail.com"
toaddr = "receiver@gmail.com"
msg = MIMEMultipart()
msg['From'] = fromaddr
msg['To'] = toaddr
msg['Subject'] = "RESULT"
# Put the contents of the text file into the email body.
with open("c3.txt", 'rU') as f6:
    results = MIMEText(f6.read(), 'plain')
msg.attach(results)
# Serialise the whole message to a string, which is what sendmail() takes.
text = msg.as_string()
server = smtplib.SMTP('smtp.gmail.com', 587)
server.ehlo()
# Upgrade the connection to TLS, then identify again over the secured channel.
server.starttls()
server.ehlo()
server.login("sender email", "password")
server.sendmail(fromaddr, toaddr, text)
# End the SMTP session and close the connection once the mail is handed over.
server.quit()
输入文件看起来像
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE>
<eSummaryResult>
<DS>
<Id>20006767</Id>
<Item Name="Acc" Type="String">AB64767</Item>
<Item Name="GDS" Type="String"></Item>
<Item Name="title" Type="String">word word title of this word...</Item>
<Item Name="summary" Type="String">word word word..word word word..</Item>
<Item Name="GP" Type="String">11002;13112</Item>
<Item Name="AB" Type="String">64767</Item>
<Item Name="orgn" Type="String">Mus musculus</Item>
<Item Name="entryType" Type="String">AB</Item>
<Item Name="gdsType" Type="String">word word word..word word word..word word word..</Item>
<Item Name="ptechType" Type="String"></Item>
<Item Name="valType" Type="String"></Item>
<Item Name="SSInfo" Type="String"></Item>
<Item Name="subsetInfo" Type="String"></Item>
<Item Name="date" Type="String">2015/12/09</Item>
<Item Name="suppFile" Type="String">WIG</Item>
<Item Name="Samples" Type="List">
</Item>
<Item Name="n_samples" Type="Integer">12</Item>
<Item Name="SeriesTitle" Type="String"></Item>
<Item Name="PlatformTitle" Type="String"></Item>
<Item Name="PlatformTaxa" Type="String"></Item>
<Item Name="SamplesTaxa" Type="String"></Item>
<Item Name="Ids" Type="List">
</Item>
<Id>200098567</Id>
<Item Name="Acc" Type="String">AB64789</Item>
<Item Name="GDS" Type="String"></Item>
<Item Name="title" Type="String">word word word...</Item>
<Item Name="summary" Type="String">word word word..word word word..</Item>
<Item Name="GP" Type="String">11002;13112</Item>
<Item Name="AB" Type="String">AB64789</Item>
<Item Name="orgn" Type="String">Mus musculus</Item>
<Item Name="entryType" Type="String">AB</Item>
<Item Name="gdsType" Type="String">word word word..word word word..word word word..</Item>
<Item Name="ptechType" Type="String"></Item>
<Item Name="valType" Type="String"></Item>
<Item Name="SSInfo" Type="String"></Item>
<Item Name="subsetInfo" Type="String"></Item>
<Item Name="date" Type="String">2015/12/09</Item>
<Item Name="suppFile" Type="String">WIG</Item>
<Item Name="Samples" Type="List">
</Item>
</Item>
<Id>200064997</Id>
<Item Name="Acc" Type="String">AB69957</Item>
<Item Name="GDS" Type="String"></Item>
<Item Name="title" Type="String">word word word...</Item>
<Item Name="summary" Type="String">word word word..word word word..</Item>
<Item Name="GP" Type="String">1100</Item>
<Item Name="AB" Type="String">69957</Item>
<Item Name="orgn" Type="String">Mus musculus</Item>
<Item Name="entryType" Type="String">AB</Item>
<Item Name="gdsType" Type="String">word word word..word word word..word word word..</Item>
<Item Name="ptechType" Type="String"></Item>
<Item Name="valType" Type="String"></Item>
<Item Name="SSInfo" Type="String"></Item>
<Item Name="subsetInfo" Type="String"></Item>
<Item Name="date" Type="String">2015/12/09</Item>
<Item Name="suppFile" Type="String">WIG</Item>
<Item Name="Samples" Type="List">
</Item>
<Item Name="n_samples" Type="Integer">12</Item>
<Item Name="SeriesTitle" Type="String"></Item>
<Item Name="PlatformTitle" Type="String"></Item>
<Item Name="PlatformTaxa" Type="String"></Item>
<Item Name="SamplesTaxa" Type="String"></Item>
<Item Name="Ids" Type="List">
<Item Name="int" Type="Integer">26476451</Item>
</Item>
<Item Name="Projects" Type="List"></Item>
<Item Name="G2R" Type="String">no</Item>
我只想要以下数据:
<Item Name="Acc" Type="String">AB64767</Item>
<Item Name="title" Type="String">word word title of this word...</Item>
<Item Name="AB" Type="String">64767</Item>
<Item Name="orgn" Type="String">Mus musculus</Item>
<Item Name="date" Type="String">2015/12/09</Item>
但显示为:
http://www.whatever.com/g/qu/acc.cgi?acc=AB64767
word word title of this word...
Mus musculus
2015/12/09
http://www.whatever.com/g/qu/acc.cgi?acc=AB64789
word word title of this word...
Mus musculus
2015/12/09
http://www.whatever.com/g/qu/acc.cgi?acc=AB69957
word word title of this word...
Mus musculus
2015/12/09
答案 0 :(得分:1)
虽然这还远不是一个完整的答案,但这里有一些提示:
磁盘 I/O 很慢,所以如果您只读取一次文件,在内存中完成所有处理,然后再生成输出,而不是让每个过滤步骤都经过一个中间文件,就能获得更好的性能。
例如,我们来看下面这一段:
# Pass 1: copy only the matching lines into the first intermediate file.
for line in f1:
    if any(words in line for words in words):
        o1.write(line)
# Pass 2: re-read that file from disk and delete the symbols/words I don't want.
input1 =open("c1.txt",'rU')
output1 = open("c2.txt",'w')
del_list = ['>','title', 'orgn','date','<','GP','/Item','"','</Item>','<DS>','Name=','DocS','Acc'] # I want to keep the rest of the line but not these words.
for line in input1:
    for word in del_list:
        line = line.replace(word, "")
    output1.write(line)
在第一个循环中,您只从输入文件中选出几行。在第二个循环中,您从所选的行中删除一些单词。而在这两步之间,您把全部数据写入了磁盘。
一个相当简单的优化是在写回磁盘之前直接进行单词替换,即:
# Substrings to delete from every line that is kept.
del_list = ['>','title', 'orgn','date','<','GP','/Item','"','</Item>','<DS>','Name=','DocS','Acc']
for line in f1:
    # Keep the line only if it contains one of the wanted substrings.
    if any(kw in line for kw in words):
        # Strip the unwanted substrings right away, before the line is
        # written, so no intermediate file is needed between the two steps.
        for word in del_list:
            line = line.replace(word, "")
        o1.write(line)
你能看出这样是如何省去一次磁盘往返的吗?另一种做法是把文件读入一个 list,
然后直接在该列表上操作,把数据保存在内存中,而不是每次都往返于磁盘。
我希望这能为你指明正确的方向;现在你应该可以想出如何同样去掉第三组文件,这样最终就只剩一个输入文件和一个输出文件。
答案 1 :(得分:1)
读取文件一次并使用正则表达式将是一种更好的方法:
import re

# Substrings to delete from the kept lines; the rest of each line is preserved.
del_list = ['>', 'title', 'orgn', 'date', '<', 'GP', '/Item', '"', '</Item>', '<DS>', 'Name=', 'DocS',
            'Acc']
# A line is kept only if it contains at least one of these substrings.
words = ['Acc', 'title', 'orgn', 'date', 'GP']
# re.escape makes every entry match as literal text, so adding an entry that
# contains a regex metacharacter later cannot change the pattern's meaning.
rep = re.compile('|'.join(map(re.escape, del_list)))
keep = re.compile('|'.join(map(re.escape, words)))
# Match "AB" only when a digit follows; the lookahead leaves the digit in place.
# A raw string keeps \d from being treated as a string escape.
r3 = re.compile(r"AB(?=\d)")
with open("test.txt") as f, open("out.txt", "w") as out:
    for line in f:
        # Only process lines that contain one of the wanted substrings.
        if keep.search(line):
            # Drop leading whitespace, then remove all unwanted substrings.
            line = rep.sub("", line.lstrip())
            # Put the URL (on a new line) in front of each accession id.
            line = r3.sub("\n"'http://www.whatever.com/g/qu/acc.cgi?acc=AB', line)
            out.write(line)
out.txt:
Item Type=String
http://www.whatever.com/g/qu/acc.cgi?acc=AB64767
Item Type=Stringword word of this word...
Item Type=String11002;13112
Item Type=StringMus musculus
Item Type=String2015/12/09
Item Type=String
http://www.whatever.com/g/qu/acc.cgi?acc=AB64789
Item Type=Stringword word word...
Item Type=String11002;13112
Item Type=StringMus musculus
Item Type=String2015/12/09
Item Type=String
http://www.whatever.com/g/qu/acc.cgi?acc=AB69957
Item Type=Stringword word word...
Item Type=String1100
Item Type=StringMus musculus
Item Type=String2015/12/09
如果您希望精确匹配某些单词,则需要在正则表达式中使用单词边界,否则最终会匹配到 "foobar" 中的 "foo"。
另外,如果您要做的只是把结果发送出去,那么根本不需要把它写到磁盘上。