I have a file with a few thousand lines. The format is basically:
text \t url1, url2, url3
Right now I have a pipeline that takes each line and fetches the content from the URLs mentioned in it. Since fetching and extracting the content takes some time, I want to use multiprocessing to speed it up.
import os, re, string, time, bz2, codecs
import multiprocessing as mp
import gensim
import mwparserfromhell
import WikiExtractor as wikiextractor
from boilerpipe.extract import Extractor
from gensim.utils import to_unicode, any2unicode
from collections import defaultdict
from itertools import izip_longest
def read_file(tot):
    # yields (content, url_list) pairs from the mapping file
    file_to_read = os.getcwd() + "/../wikidata/sequence-url-mapper.txt"
    f_r = open(file_to_read, 'r')
    for line in f_r:
        content, urls = line.strip().split("||")
        # the URL field is stored as a printed set, e.g. set([u'...']),
        # so strip that wrapping and split on commas
        urls = urls.replace('set([u\'', '').replace('\'])', '').split(",")
        yield content, urls
    f_r.close()
def aggregate_web_content(urls):
    # fetch every URL with boilerpipe and concatenate the usable lines,
    # separating the text of different URLs with '||'
    extracted_content = ''
    for url in urls:
        try:
            extractor = Extractor(extractor='ArticleExtractor', url=url)
            retrieved_text = extractor.getText()
            lines = retrieved_text.split('\n')
            if len(lines) <= 5:          # skip pages with hardly any text
                continue
            for line in lines:
                if len(line.strip()) > 10:
                    extracted_content = extracted_content + ' ' + re.sub(r'\s+', ' ', line)
            extracted_content = extracted_content + "||"
        except Exception:
            continue
    return extracted_content
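As far as I understand, a Pool worker has to be a module-level function that takes a single picklable argument, and aggregate_web_content already fits that (it takes one list of URL strings). So a quick standalone check like the following is what I would expect to work; the URLs are just placeholders, not my real data:

# throwaway sanity check, run as a separate script that also defines
# aggregate_web_content and the imports above; URLs are placeholders
if __name__ == "__main__":
    test_pool = mp.Pool(processes=2)
    sample_url_lists = [['http://example.com'], ['http://example.org']]
    # map() sends one URL list to each worker task and blocks until all are done
    print test_pool.map(aggregate_web_content, sample_url_lists)
    test_pool.close()
    test_pool.join()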
if __name__ == "__main__":
    pool = mp.Pool(processes=4)      # created, but never actually used below
    result_list = []
    f_seq = codecs.open(os.getcwd() + '/../wikidata/sequence-content-mapper.txt', 'w')
    k = 0
    # this loop still fetches everything serially in the main process
    for content, urllist in read_file(1):
        content = re.sub(r'\s+', ' ', content)
        ext_content = aggregate_web_content(urllist).encode('utf-8', 'ignore')
        if len(ext_content.strip()) > 10:
            f_seq.write(content + "\t" + ext_content + '\n')
            k += 1
        if k > 0 and k % 10 == 0:
            print "Done with", k, "lines.."
        if k == 10000:
            break
    f_seq.close()
I have not been able to get the pool to process the lines separately in different processes.
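This is the direction I have been trying to take the __main__ block, as a minimal sketch rather than something I have verified end to end: read all (content, urls) pairs in the parent, hand only the URL lists to the pool via imap (which streams results back in input order), and keep all the file writing in the parent process. It reuses read_file and aggregate_web_content as defined above; the chunksize value is an arbitrary guess, and I am not sure how the JPype-backed boilerpipe Extractor behaves inside forked worker processes.

from itertools import izip

if __name__ == "__main__":
    pool = mp.Pool(processes=4)
    f_seq = codecs.open(os.getcwd() + '/../wikidata/sequence-content-mapper.txt', 'w')

    # materialise the mapping once so contents and URL lists stay aligned
    items = list(read_file(1))                  # [(content, [url, ...]), ...]
    url_lists = [urls for _, urls in items]

    # imap returns results lazily but in the same order as url_lists,
    # so result k belongs to items[k]
    results = pool.imap(aggregate_web_content, url_lists, chunksize=10)

    k = 0
    for (content, _), ext_content in izip(items, results):
        ext_content = ext_content.encode('utf-8', 'ignore')
        if len(ext_content.strip()) > 10:
            f_seq.write(re.sub(r'\s+', ' ', content) + "\t" + ext_content + '\n')
            k += 1
        if k > 0 and k % 10 == 0:
            print "Done with", k, "lines.."

    pool.close()
    pool.join()
    f_seq.close()

Is this roughly the right way to wire the pool in, or is there a better pattern for feeding a few thousand URL lists through worker processes while writing the results from the parent?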