在尝试确定是否应该使用并行编程时,我一直坚持使用这段代码。
代码采用包含两列的文本文件:第一列包含一个单词,第二列包含一个URL。
在String_stripper_function()中,文本文件的每一行都以特定方式格式化(因此对replace()函数的所有调用。)。
然后我们在第一列和第二列之间进行比较,如果第一列中的单词包含在第二列的url中,那么该行将被写入一个新文件(称之为Result.txt
)
此外,如果第一列中的单词包含4个大写字母,而第二列中的URL包含数字,则将该行添加到同一个新文件(Result.txt
)。
现在这种情况有效,我已经多次检查,但是在具有16GB内存的 i7 计算机上需要花费很长时间,<100>行 几小时。
该文件包含1923014行(如果您愿意,则为行);它是97.9 MB。
所以我的问题是:表现明智我的代码有什么问题?
# -*- coding: utf-8 -*-
"""
Created on Sun Apr 12 16:44:35 2015
@author: Steve
"""
import re
import multiprocessing as mp
import numpy as np
def hasNumbers(inputString):
return any(char.isdigit() for char in inputString)
#This code strips the urls into their main domain
def url_stripper(url):
url=url.replace('http://','')
url=url.replace('https://','')
url=url.replace('http','')
url=url.replace('/',' ')
url=url.split()
return url
def String_stripper_function():
with open("homepages.txt") as infile:
i=0
l1=np.array([])
l2=np.array([])
l3=np.array([])
l4=np.array([])
for line in infile:
word_original=line.split()[0]
url_original=line.split()[1]
url=url_stripper(url_original)
if len(url)==0:
print 'lol no url fam'
else:
url=url[0]
word=word_original.replace('_',' ')
word=word.replace('-',' ')
word=word.replace('(','')
word=word.replace(')','')
regex = re.compile(".*?\((.*?)\)")
word_in_parenthesis = re.findall(regex, word)
for i in xrange(len(word_in_parenthesis)):
word=word.replace(word_in_parenthesis[i],'')
word=word.replace('The ','')
word=word.replace(' The ','')
word=word.replace(', The ','')
word=word.replace(' ,The ','')
word=word.replace(',The ','')
word=word.replace('...','')
word=word.replace('A ','')
word=word.replace(' A ','')
word=word.replace(', A ','')
word=word.replace(' ,A ','')
word=word.replace(',A ','')
word=word.replace('An ','')
word=word.replace(' An ','')
word=word.replace(', An ','')
word=word.replace(' ,An ','')
word=word.replace(',An ','')
word=word.replace(',','')
#condition 2&3
words=word.split()
# print word.lower().split()
# print url_original.lower()
Capital_Letters=sum(1 for c in word if c.isupper())
decision=hasNumbers(url)
for w in words:
#comment the following for
if w.lower() in url_original.lower():
if word_original not in l1:
l1=np.append(l1,word_original)
l2=np.append(l2,url_original)
else:
print ""
#Uncomment the following for Domain only
# if w.lower() in url.lower():
# l1=np.append(l1,word_original)
# l2=np.append(l2,url_original)
elif Capital_Letters==4 and decision==True:
if word_original not in l1:
l1=np.append(l1,word_original)
l2=np.append(l2,url_original)
else:
print ""
# if word_original not in l1:
# if word_original not in l3:
# l3=np.append(l3,word_original)
# l4=np.append(l4,url_original)
else:
print ""
file = open("results.txt", "w")
for index in xrange(len(l1)):
file.write( '%s \t %s\n' % (str(l1[index]),str(l2[index])))
file.close()
# file1 = open("results_failedConditions.txt", "w")
# for index in xrange(len(l3)):
# file1.write( '%s \t %s\n' % (str(l3[index]),str(l4[index])))
# file1.close()
if __name__=="__main__":
String_stripper_function()
答案 0 :(得分:0)
首先,这个问题应该在Code Review上提出。
我对您的代码进行了一些更改,以考虑一些假设。
str.replace()
通过在一个大字符串上运行而不是逐行迭代来运行得更快。urlStripper
我刚刚找到://
,因为我相信开始时间才会发生。.replace("/"," ")
然后使用.split()
,而.split("/")
应该更快。您可以使用自己的分隔符进行拆分。请参阅文档中的split。我做了一些其他的小改动。在我的测试中,我的版本运行500次1000次测试,这个版本需要0.054秒,而你的版本需要0.133秒。 这是代码:
# -*- coding: utf-8 -*-
"""
Created on Sun Apr 12 16:44:35 2015
@author: Steve
@edit: IronManMark20
"""
import timer
import re
def hasNumbers(inputString):
return any(char.isdigit() for char in inputString)
#This code strips the urls into their main domain
def url_stripper(url):
try:
index=url.index("://") #only happens at beginning
except:
return url.split("/") #you can set the splitter
url=url[:index]
return url.split("/")
def String_stripper_function():
with open("./homepages.txt") as infile:
i=0
l1=[]
l2=[]
#l3=[]
#l4=[]
lines_string="" #We'll use this later
uris=[]#needed for uris
for line in infile:
word_original=line.split()[0]
url_original=line.split()[1]
url=url_stripper(url_original)
if len(url)==0:
print 'lol no url fam'
else:
url=url[0]
lines_string+=word_original
lines_string+="/" #add a delimiter that we don't search for later
uris+=[url_original]
words=Mass_List(lines_string)
words=words[:len(words)-1]
for w in words:
lines=lines_string.split("/")#split for later use
Capital_Letters=sum(1 for c in w if c.isupper())
url_original=uris[words.index(w)] #get url for each line
decision=hasNumbers(url_original)
#comment the following for
if w.lower() in url_original.lower():
if word_original not in l1:
l1+=[lines[words.index(w)]]
l2+=[uris[words.index(w)]]
# else:
# print ""
#Uncomment the following for Domain only
# if w.lower() in url.lower():
# l1=np.append(l1,word_original)
# l2=np.append(l2,url_original)
elif Capital_Letters==4 and decision==True:
if word_original not in l1:
l1+=[lines[words.index(w)]]
l2+=[uris[words.index(w)]]
# else:
# print ""
# if word_original not in l1:
# if word_original not in l3:
# l3=np.append(l3,word_original)
# l4=np.append(l4,url_original)
#else:
# print ""
file = open("results.txt", "w")
for i in range(len(l1)):
file.write(l1[i]+" "+l2[i]+"\n")
file.close()
# file1 = open("results_failedConditions.txt", "w")
# for index in xrange(len(l3)):
# file1.write( '%s \t %s\n' % (str(l3[index]),str(l4[index])))
# file1.close()
def Mass_List(lines):
word=lines.replace('_',' ')
word=word.replace('-',' ')
word=word.replace('(','')
word=word.replace(')','')
regex = re.compile(".*?\((.*?)\)")
word_in_parenthesis = re.findall(regex, word)
for i in xrange(len(word_in_parenthesis)):
word=word.replace(word_in_parenthesis[i],'')
word=word.replace('The ','')
word=word.replace(' The ','')
word=word.replace(', The ','')
word=word.replace(' ,The ','')
word=word.replace(',The ','')
word=word.replace('...','')
word=word.replace('A ','')
word=word.replace(' A ','')
word=word.replace(', A ','')
word=word.replace(' ,A ','')
word=word.replace(',A ','')
word=word.replace('An ','')
word=word.replace(' An ','')
word=word.replace(', An ','')
word=word.replace(' ,An ','')
word=word.replace(',An ','')
word=word.replace(',','')
words=word.split('/') #changed to split in arbitrary delimiter
return words
if __name__=="__main__":
String_stripper_function()