Question

我刚刚开始学习python和编程，所以这可能是一个非常幼稚的问题。但我会感激任何帮助。

以下代码可以运行，但有人告诉我，使用这么多输入和输出文件是不好的做法，我应该改用嵌套循环。但是，每当我尝试把某些部分嵌套起来时，最后得到的都是一个空文件夹。

所以我的问题是如何嵌套所有这些？

非常感谢和抱歉。

 #1) I call a perl script and execute it to get the input file.
perl = "/usr/bin/perl"
perl_script = "geoFF.pl"
# NOTE(review): the command is a list, so this string (leading space
# included) reaches the Perl script as one verbatim argument -- confirm the
# space is intended.
params = " --mount-doom-hot"

# Run the Perl script, forwarding its standard output to ours, and block
# until it has finished so its output file is ready for the next step.
command = [perl, perl_script, params]
pl_script = subprocess.Popen(command, stdout=sys.stdout)
pl_script.communicate()

## 2) Read the output of the perl script, keeping only the wanted data.
# The input is a BIG file, so it is streamed line by line and only the lines
# of interest are copied to the output file.
infile1 = "inputperl.txt"
outfile1 = "c1.txt"

# A line is kept when it contains at least one of these substrings.
words = ['Acc', 'title', 'orgn', 'date', 'GP']

# The with-block closes both files when the loop ends, which flushes the
# output to disk; handles that are left open may still hold buffered data
# when the file is reopened by a later step.
# 'rU' is the universal-newline mode used by this (Python 2 era) script; the
# 'U' flag is no longer accepted by Python 3.11+.
with open(infile1, 'rU') as f1, open(outfile1, 'w+') as o1:
    for line in f1:
        # `word` (not `words`) so the loop variable does not shadow the list.
        if any(word in line for word in words):
            o1.write(line)

# From the selected lines, delete the symbols/words that are not wanted.
# The rest of each line is kept.
# Entries are removed in list order, so '</Item>' and '<DS>' can never match:
# '>' and '<' have already been stripped by the time they are tried.
del_list = ['>', 'title', 'orgn', 'date', '<', 'GP', '/Item', '"', '</Item>',
            '<DS>', 'Name=', 'DocS', 'Acc']

# The with-block closes both files when the loop ends, so c2.txt is fully
# flushed to disk before anything else opens it.
with open("c1.txt", 'rU') as input1, open("c2.txt", 'w') as output1:
    for line in input1:
        for word in del_list:
            line = line.replace(word, "")
        output1.write(line)

# The lines contain accession ids such as AB129, AB8877, AB0997, ...
# Prefix every "AB" with the lookup URL so each id becomes a hyperlink that
# starts on its own line.
with open("c2.txt", 'rU') as inp:
    filedata2 = inp.read()

newdata2 = filedata2.replace(
    'AB', '\nhttp://www.whatever.com/g/qu/acc.cgi?acc=AB')

# Leaving the with-block closes c3.txt, so the complete text is on disk
# before the file is read again.
with open("c3.txt", 'w') as out:
    out.write(newdata2)
# The ids now appear as http://www.whatever.com/g/qu/acc.cgi?acc=AB(somenumber),
# for example http://www.whatever.com/g/qu/acc.cgi?acc=AB129
# and http://www.whatever.com/g/qu/acc.cgi?acc=AB8877 etc.

### Finally, take the file with the changes and send it by e-mail.
import smtplib
# email.mime.* is importable on both Python 2 and Python 3; the older
# email.MIMEMultipart / email.MIMEText module names exist only on Python 2.
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText

fromaddr = "sender@gmail.com"
toaddr = "receiver@gmail.com"
msg = MIMEMultipart()
msg['From'] = fromaddr
msg['To'] = toaddr
msg['Subject'] = "RESULT"

# Send the text file as the e-mail body.
with open("c3.txt", 'rU') as f6:
    results = MIMEText(f6.read(), 'plain')
msg.attach(results)

# Serialise the message and hand it to the SMTP server over STARTTLS.
server = smtplib.SMTP('smtp.gmail.com', 587)
try:
    server.ehlo()
    server.starttls()
    server.ehlo()  # greet again once the connection is encrypted
    # Placeholder credentials -- do not commit real ones to source control.
    server.login("sender email", "password")
    text = msg.as_string()
    server.sendmail(fromaddr, toaddr, text)
    server.quit()  # end the SMTP session politely on success
finally:
    # Always release the socket, even when a step above failed; calling
    # close() after a successful quit() is harmless.
    server.close()

输入文件看起来像

<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE>
<eSummaryResult>
<DS>
    <Id>20006767</Id>
    <Item Name="Acc" Type="String">AB64767</Item>
    <Item Name="GDS" Type="String"></Item>
    <Item Name="title" Type="String">word word title of this word...</Item>
    <Item Name="summary" Type="String">word word word..word word word..</Item>
    <Item Name="GP" Type="String">11002;13112</Item>
    <Item Name="AB" Type="String">64767</Item>
    <Item Name="orgn" Type="String">Mus musculus</Item>
    <Item Name="entryType" Type="String">AB</Item>
    <Item Name="gdsType" Type="String">word word word..word word word..word word word..</Item>
    <Item Name="ptechType" Type="String"></Item>
    <Item Name="valType" Type="String"></Item>
    <Item Name="SSInfo" Type="String"></Item>
    <Item Name="subsetInfo" Type="String"></Item>
    <Item Name="date" Type="String">2015/12/09</Item>
    <Item Name="suppFile" Type="String">WIG</Item>
    <Item Name="Samples" Type="List">   
    </Item>
    <Item Name="n_samples" Type="Integer">12</Item>
    <Item Name="SeriesTitle" Type="String"></Item>
    <Item Name="PlatformTitle" Type="String"></Item>
    <Item Name="PlatformTaxa" Type="String"></Item>
    <Item Name="SamplesTaxa" Type="String"></Item>
    <Item Name="Ids" Type="List">
</Item>
    <Id>200098567</Id>
    <Item Name="Acc" Type="String">AB64789</Item>
    <Item Name="GDS" Type="String"></Item>
    <Item Name="title" Type="String">word word word...</Item>
    <Item Name="summary" Type="String">word word word..word word word..</Item>
    <Item Name="GP" Type="String">11002;13112</Item>
    <Item Name="AB" Type="String">AB64789</Item>
    <Item Name="orgn" Type="String">Mus musculus</Item>
    <Item Name="entryType" Type="String">AB</Item>
    <Item Name="gdsType" Type="String">word word word..word word word..word word word..</Item>
    <Item Name="ptechType" Type="String"></Item>
    <Item Name="valType" Type="String"></Item>
    <Item Name="SSInfo" Type="String"></Item>
    <Item Name="subsetInfo" Type="String"></Item>
    <Item Name="date" Type="String">2015/12/09</Item>
    <Item Name="suppFile" Type="String">WIG</Item>
    <Item Name="Samples" Type="List">
</Item>
  </Item>       
    <Id>200064997</Id>
    <Item Name="Acc" Type="String">AB69957</Item>
    <Item Name="GDS" Type="String"></Item>
    <Item Name="title" Type="String">word word word...</Item>
    <Item Name="summary" Type="String">word word word..word word word..</Item>
    <Item Name="GP" Type="String">1100</Item>
    <Item Name="AB" Type="String">69957</Item>
    <Item Name="orgn" Type="String">Mus musculus</Item>
    <Item Name="entryType" Type="String">AB</Item>
    <Item Name="gdsType" Type="String">word word word..word word word..word word word..</Item>
    <Item Name="ptechType" Type="String"></Item>
    <Item Name="valType" Type="String"></Item>
    <Item Name="SSInfo" Type="String"></Item>
    <Item Name="subsetInfo" Type="String"></Item>
    <Item Name="date" Type="String">2015/12/09</Item>
    <Item Name="suppFile" Type="String">WIG</Item>
    <Item Name="Samples" Type="List">   
    </Item>
    <Item Name="n_samples" Type="Integer">12</Item>
    <Item Name="SeriesTitle" Type="String"></Item>
    <Item Name="PlatformTitle" Type="String"></Item>
    <Item Name="PlatformTaxa" Type="String"></Item>
    <Item Name="SamplesTaxa" Type="String"></Item>
    <Item Name="Ids" Type="List">
    <Item Name="int" Type="Integer">26476451</Item>
    </Item>
    <Item Name="Projects" Type="List"></Item>
    <Item Name="G2R" Type="String">no</Item>

我只想要以下数据：

<Item Name="Acc" Type="String">AB64767</Item>
<Item Name="title" Type="String">word word title of this word...</Item>
<Item Name="AB" Type="String">64767</Item>
<Item Name="orgn" Type="String">Mus musculus</Item>
<Item Name="date" Type="String">2015/12/09</Item>

但显示为：

http://www.whatever.com/g/qu/acc.cgi?acc=AB64767
word word title of this word...
Mus musculus
2015/12/09

http://www.whatever.com/g/qu/acc.cgi?acc=AB64789
word word title of this word...
Mus musculus
2015/12/09

http://www.whatever.com/g/qu/acc.cgi?acc=AB69957
word word title of this word...
Mus musculus
2015/12/09

Answer 1

虽然这还远没有完成，但这里有一些指示：

磁盘 IO 很慢，因此如果只读取一次数据、完成所有处理后再生成输出，而不是让每个过滤步骤都经过一次文件，性能会更好。

例如，让我们解释一下：

# Pass 1: copy every line of f1 that contains one of `words` into o1.
for line in f1:
    if any(words in line for words in words):
        o1.write(line)

# Pass 2: from the selected lines, delete the symbols/words that are not wanted.

input1 =open("c1.txt",'rU')   
output1 = open("c2.txt",'w')
del_list = ['>','title', 'orgn','date','<','GP','/Item','"','</Item>','<DS>','Name=','DocS','Acc'] # Keep the rest of the line, but not these words.

for line in input1:
    # Remove each unwanted substring in turn, then write what is left.
    for word in del_list:
         line = line.replace(word, "")
    output1.write(line)

在第一个循环中，您只从输入文件中选出了几行。在第二个循环中，您从所选的行中删除了一些单词。而在这两个循环之间，您把全部数据写入了磁盘。

一个相当简单的优化是在写回磁盘之前直接进行单词替换，即：

# Substrings to strip from every kept line, applied in list order.
del_list = ['>', 'title', 'orgn', 'date', '<', 'GP', '/Item', '"', '</Item>',
            '<DS>', 'Name=', 'DocS', 'Acc']

# Select and clean in a single pass: each line is cleaned right after it is
# selected, so nothing is written to disk between the two steps.
for line in f1:
    # Skip lines that contain none of the wanted keywords.
    if not any(keyword in line for keyword in words):
        continue
    for word in del_list:
        line = line.replace(word, "")
    o1.write(line)

你能看出这样做如何省去了一次磁盘往返吗？另一种做法是把文件读入一个 list，然后在该列表上操作，这样数据就保存在内存中，而不必每次都往返磁盘。

希望这能为你指明正确的方向；现在你应该可以自己想出如何去掉第三组文件，从而最终只剩一个输入文件和一个输出文件。

Answer 2

读取文件一次并使用正则表达式将是一种更好的方法：

import re

# Substrings to delete from every kept line; the rest of the line is kept.
del_list = ['>', 'title', 'orgn', 'date', '<', 'GP', '/Item', '"', '</Item>',
            '<DS>', 'Name=', 'DocS', 'Acc']
# A line is kept only if it contains at least one of these substrings.
words = ['Acc', 'title', 'orgn', 'date', 'GP']

# re.escape keeps every entry a literal match even if one of them ever
# contains a regex metacharacter.
rep = re.compile('|'.join(map(re.escape, del_list)))
keep = re.compile('|'.join(map(re.escape, words)))
# Raw string: "\d" inside a normal string literal is an invalid string
# escape.  The lookahead matches "AB" only when a digit follows it.
r3 = re.compile(r"AB(?=\d)")

# Each accession id is moved to its own line and prefixed with the URL.
link_prefix = '\nhttp://www.whatever.com/g/qu/acc.cgi?acc=AB'

with open("test.txt") as f, open("out.txt", "w") as out:
    for line in f:
        # Skip lines that contain none of the wanted keywords.
        if not keep.search(line):
            continue
        # Drop the leading indentation, then every unwanted substring.
        line = rep.sub("", line.lstrip())
        line = r3.sub(link_prefix, line)
        out.write(line)

out.txt：

Item  Type=String
http://www.whatever.com/g/qu/acc.cgi?acc=AB64767
Item  Type=Stringword word  of this word...
Item  Type=String11002;13112
Item  Type=StringMus musculus
Item  Type=String2015/12/09
Item  Type=String
http://www.whatever.com/g/qu/acc.cgi?acc=AB64789
Item  Type=Stringword word word...
Item  Type=String11002;13112
Item  Type=StringMus musculus
Item  Type=String2015/12/09
Item  Type=String
http://www.whatever.com/g/qu/acc.cgi?acc=AB69957
Item  Type=Stringword word word...
Item  Type=String1100
Item  Type=StringMus musculus
Item  Type=String2015/12/09

如果您希望完全匹配某些单词，则需要在正则表达式中使用单词边界，否则最终会匹配到 "foobar" 中的 "foo"。如果您要做的只是发送该文件，则不必把它写到磁盘上。

Python：嵌套循环而不是创建多个输入和输出

2 个答案: