I'm trying to process a MediaWiki dump by opening every URL in the abstracts dump file and parsing the page with BS4. I have roughly 600,000 URLs, and by my estimate this will take about 200 hours.
import re
import pickle
import urllib.request
import urllib.error

import bs4 as bs
import nltk
import pyarabic.araby as araby
import vocab  # local helper module providing build_dictionary / save_dictionary

sentenceTokens = []
with open('arwiki-latest-abstract.txt', newline='', encoding='utf-8') as textFile:  # open the abstracts dump
    for line in textFile:
        if '<url>' in line:
            line = re.sub('<[^>]+>', '', line)  # remove <> and anything within
            line = re.sub('\n', '', line)
            print(line)
            requestURL = urllib.request.Request(line, headers={'User-Agent': 'Mozilla/5.0'})  # build the request
            try:
                scrapeURL = urllib.request.urlopen(requestURL)  # fetch the webpage
            except urllib.error.HTTPError as err:
                continue
            article = scrapeURL.read()
            parsedArticle = bs.BeautifulSoup(article, 'lxml')  # parse the webpage
            paragraphs = parsedArticle.find_all('p')  # split the article into paragraphs
            textFromURL = ""
            for paragraph in paragraphs:
                textFromURL += paragraph.text  # concatenate paragraphs
            textFromURL = re.sub(r'[\[].*?[\]]', ' ', textFromURL)  # remove [] and anything within
            textFromURL = re.sub(r'\s+', ' ', textFromURL)  # normalize whitespace
            textFromURL = araby.strip_tashkeel(textFromURL)  # strip Arabic diacritics
            sentenceTokens += nltk.sent_tokenize(textFromURL)

worddict, wordcount = vocab.build_dictionary(sentenceTokens)
vocab.save_dictionary(worddict, wordcount, 'D:\\Unsupervisedsummarization')

with open('listtext.txt', 'wb') as strlist:  # pickle needs binary mode, so no encoding argument
    pickle.dump(sentenceTokens, strlist)
If anyone could help parallelize this code, or offer some tips on how to make it run faster, I'd appreciate it. Thanks!
Answer 0 (score: 0)
As mentioned in the comments, you need to parallelize the URL requests and the BS4 parsing, which are the slowest parts of the process. Here is an example:
Code
#!/usr/bin/python3
# -*- coding: utf-8 -*-

from aiohttp import ClientSession, client_exceptions
from asyncio import Semaphore, ensure_future, gather, run
from json import dumps, loads

limit = 10       # maximum number of concurrent requests
http_ok = [200]  # status codes treated as success


async def scrape(url_list):
    tasks = list()
    sem = Semaphore(limit)
    async with ClientSession() as session:
        for url in url_list:
            task = ensure_future(scrape_bounded(url, sem, session))
            tasks.append(task)
        result = await gather(*tasks)
    return result


async def scrape_bounded(url, sem, session):
    # the semaphore caps how many requests run at once
    async with sem:
        return await scrape_one(url, session)


async def scrape_one(url, session):
    try:
        async with session.get(url) as response:
            content = await response.read()
    except client_exceptions.ClientConnectorError:
        print('Scraping %s failed due to a connection problem' % url)
        return False

    if response.status not in http_ok:
        print('Scraping %s failed due to the return code %s' % (url, response.status))
        return False

    content = loads(content.decode('UTF-8'))  # the demo endpoint returns JSON
    return content


if __name__ == '__main__':
    urls = ['http://demin.co:8080/echo1/', 'http://demin.co:8080/echo1/']
    res = run(scrape(urls))
    print(dumps(res, indent=4))
Output
[
    {
        "reply": "pong",
        "url": "GET /",
        "id": "echo1"
    },
    {
        "reply": "pong",
        "url": "GET /",
        "id": "echo1"
    }
]
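The demo endpoint above returns JSON, while your pages are HTML, so here is a rough sketch of how the same aiohttp/semaphore pattern could be adapted to your case. It assumes the same BeautifulSoup, nltk and araby steps from your loop; the helpers extract_sentences and scrape_article are hypothetical names, not part of your original code:

import re
import bs4 as bs
import nltk
import pyarabic.araby as araby
from aiohttp import ClientSession, client_exceptions
from asyncio import Semaphore, ensure_future, gather, run

limit = 10       # maximum number of concurrent requests
http_ok = [200]  # status codes treated as success


def extract_sentences(html):
    # same cleanup as in the sequential version
    parsed = bs.BeautifulSoup(html, 'lxml')
    text = ''.join(p.text for p in parsed.find_all('p'))
    text = re.sub(r'[\[].*?[\]]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = araby.strip_tashkeel(text)
    return nltk.sent_tokenize(text)


async def scrape_article(url, sem, session):
    # fetch one article, skipping it on any client error or bad status
    async with sem:
        try:
            async with session.get(url, headers={'User-Agent': 'Mozilla/5.0'}) as response:
                if response.status not in http_ok:
                    return []
                html = await response.read()
        except client_exceptions.ClientError:
            return []
    return extract_sentences(html)


async def scrape_all(url_list):
    sem = Semaphore(limit)
    async with ClientSession() as session:
        tasks = [ensure_future(scrape_article(url, sem, session)) for url in url_list]
        results = await gather(*tasks)
    # flatten the per-article sentence lists into one list
    return [sentence for sentences in results for sentence in sentences]


# sentenceTokens = run(scrape_all(urls))  # urls: the <url> lines extracted from the dump

One caveat: the BeautifulSoup/NLTK work is CPU-bound and still runs on the event loop, so if parsing turns out to dominate you could push extract_sentences into a process pool via loop.run_in_executor. Even overlapping only the network requests should cut the 200-hour estimate dramatically.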