I'm trying to build a web scraper that crawls Wikipedia and stores the fetched data (title, content, and link) in a MySQL database, using the BeautifulSoup and pymysql libraries. After many runs I managed to fix all the errors that were terminating my program, but now I'm hitting a MemoryError. Any idea how to avoid it? Everyone online says you cannot handle this error with an exception handler, or at least that doing so is bad practice, and all the solutions I've found are tailored to specific programs using other third-party libraries, so none of them made sense for my case. The program terminates with the error after about 2000 iterations.
import socket
import sys
import re
from urllib.error import HTTPError
from urllib.error import URLError
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pymysql
from requests import Timeout, RequestException

# Raise the recursion limit so the recursive crawl is not cut off early.
sys.setrecursionlimit(80000)
pages = set()
conn = pymysql.connect(host='127.0.0.1', user='root', passwd='somepsw', db='mysql', charset='utf8')
cur = conn.cursor()
cur.execute("USE searchenginedb2")

def store(title, content, link):
    # pymysql quotes and escapes the parameters itself, so the %s
    # placeholders must not be wrapped in quotes.
    cur.execute("INSERT INTO websites (title, content, link) VALUES (%s, %s, %s)",
                (title, content, link))
    cur.connection.commit()
def getLinks(pageUrl):
    global pages
    try:
        # timeout keeps a hung connection from blocking forever
        html = urlopen("http://en.wikipedia.org" + pageUrl, timeout=10)
    except HTTPError:
        print("HTTP ERROR")
    except URLError:
        print("URL ERROR")
    except socket.timeout:
        print("SOCKET TIMEOUT")
    try:
        bsObj = BeautifulSoup(html, "html.parser")
    except UnboundLocalError:
        # urlopen above failed, so html was never assigned
        print("UnboundLocalError")
    try:
        title = bsObj.find("h1").get_text()
        content = bsObj.find("div", {"id": "mw-content-text"}).find("p").get_text()
        store(title, content, "http://en.wikipedia.org" + pageUrl)
    except AttributeError:
        print("Attribute Error: Page missing!")
    except UnboundLocalError:
        print("UnboundLocalError")
    except HTTPError as err:
        print("URL that generated the error code: ", pageUrl)
        print("Error code:", err.code)
        print("Error description:", err.reason)
    except Timeout:
        print("URL that generated the error code: ", pageUrl)
        print("Error description: Session timed out.")
    except ConnectionError:
        print("URL that generated the error code: ", pageUrl)
        print("Error description: Socket error timed out.")
    except RequestException as e:
        print(e)
    # Avoid crawling the same page twice.
    try:
        for link in bsObj.findAll("a", href=re.compile("^(/wiki/)")):
            if 'href' in link.attrs:
                if link.attrs['href'] not in pages:
                    # We have encountered a new page
                    newPage = link.attrs['href']
                    print("http://en.wikipedia.org" + newPage)
                    pages.add(newPage)
                    getLinks(newPage)
    except UnboundLocalError:
        print("UnboundLocalError")

getLinks("")
ERROR:
Traceback (most recent call last):
File "C:/Users/paula/PycharmProjects/BookTests/StoringData/ownWikipedia.py", line 83, in getLinks
getLinks(newPage)
File "C:/Users/paula/PycharmProjects/BookTests/StoringData/ownWikipedia.py", line 83, in getLinks
getLinks(newPage)
File "C:/Users/paula/PycharmProjects/BookTests/StoringData/ownWikipedia.py", line 83, in getLinks
getLinks(newPage)
[Previous line repeated 984 more times]
File "C:/Users/paula/PycharmProjects/BookTests/StoringData/ownWikipedia.py", line 40, in getLinks
bsObj = BeautifulSoup(html,"html.parser")
File "C:\Users\paula\PycharmProjects\BookTests\venv\lib\site-packages\bs4\__init__.py", line 225, in __init__
markup, from_encoding, exclude_encodings=exclude_encodings)):
File "C:\Users\paula\PycharmProjects\BookTests\venv\lib\site-packages\bs4\builder\_htmlparser.py", line 205, in prepare_markup
exclude_encodings=exclude_encodings)
File "C:\Users\paula\PycharmProjects\BookTests\venv\lib\site-packages\bs4\dammit.py", line 366, in __init__
for encoding in self.detector.encodings:
File "C:\Users\paula\PycharmProjects\BookTests\venv\lib\site-packages\bs4\dammit.py", line 264, in encodings
self.chardet_encoding = chardet_dammit(self.markup)
File "C:\Users\paula\PycharmProjects\BookTests\venv\lib\site-packages\bs4\dammit.py", line 34, in chardet_dammit
return chardet.detect(s)['encoding']
File "C:\Users\paula\PycharmProjects\BookTests\venv\lib\site-packages\chardet\__init__.py", line 38, in detect
detector.feed(byte_str)
File "C:\Users\paula\PycharmProjects\BookTests\venv\lib\site-packages\chardet\universaldetector.py", line 211, in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
File "C:\Users\paula\PycharmProjects\BookTests\venv\lib\site-packages\chardet\charsetgroupprober.py", line 71, in feed
state = prober.feed(byte_str)
File "C:\Users\paula\PycharmProjects\BookTests\venv\lib\site-packages\chardet\hebrewprober.py", line 227, in feed
byte_str = self.filter_high_byte_only(byte_str)
File "C:\Users\paula\PycharmProjects\BookTests\venv\lib\site-packages\chardet\charsetprober.py", line 63, in filter_high_byte_only
buf = re.sub(b'([\x00-\x7F])+', b' ', buf)
File "C:\Users\paula\AppData\Local\Programs\Python\Python36-32\lib\re.py", line 191, in sub
return _compile(pattern, flags).sub(repl, string, count)
MemoryError
Process finished with exit code 1
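From the traceback, my suspicion is that the recursion itself is the problem: getLinks is nested roughly a thousand calls deep when the MemoryError fires, each live frame still holds its own bsObj parse tree, and the paths show I'm on 32-bit Python (Python36-32), so the process presumably runs out of address space long before the recursion limit of 80000 is reached. Would flattening the recursion into an explicit queue, something like the sketch below, be the right way to avoid this? This is only a rough sketch of what I have in mind, not tested code; crawl is my own name for the function, and store is the same function as in my code above.

from collections import deque
import re
from urllib.request import urlopen
from bs4 import BeautifulSoup

def crawl(start_url=""):
    # Explicit queue instead of recursion: only one page's parse tree is
    # alive at a time, and crawl depth no longer grows the call stack.
    seen = set()
    queue = deque([start_url])
    while queue:
        page_url = queue.popleft()
        try:
            html = urlopen("http://en.wikipedia.org" + page_url, timeout=10)
        except Exception as e:
            print("Fetch failed for", page_url, ":", e)
            continue
        bsObj = BeautifulSoup(html, "html.parser")
        try:
            title = bsObj.find("h1").get_text()
            content = bsObj.find("div", {"id": "mw-content-text"}).find("p").get_text()
            store(title, content, "http://en.wikipedia.org" + page_url)  # store() from above
        except AttributeError:
            print("Attribute Error: Page missing!")
        for link in bsObj.findAll("a", href=re.compile("^(/wiki/)")):
            href = link.attrs.get('href')
            if href and href not in seen:
                seen.add(href)
                queue.append(href)
        # bsObj goes out of scope on the next iteration, so its tree can be freed

crawl("")

I realize the seen set still grows with every discovered link, so for a really long crawl I'd probably have to track visited URLs in the database instead, but at least the parse trees and stack frames would stop piling up.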