I'm trying to build a web scraper that crawls Wikipedia and stores the fetched data (title, content, and link) in a MySQL database, using the BeautifulSoup and pymysql libraries. After many runs I managed to fix all the errors that were terminating my program, but now I'm hitting a MemoryError. Any idea how to avoid it? Everyone online says you cannot handle this error with an exception handler, or at least that doing so is bad practice, and all the solutions I've found are tailored to specific programs using other third-party libraries, so none of them made sense for my case. The program terminates with the error after about 2000 iterations.
import socket
import sys
import re
from urllib.error import HTTPError
from urllib.error import URLError
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pymysql
from requests import Timeout, RequestException

# Raise the recursion limit so the recursive crawl is not cut off early.
sys.setrecursionlimit(80000)
pages = set()
conn = pymysql.connect(host='127.0.0.1', user='root', passwd='somepsw', db='mysql', charset='utf8')
cur = conn.cursor()
cur.execute("USE searchenginedb2")

def store(title, content, link):
    # pymysql quotes and escapes the parameters itself, so the %s
    # placeholders must not be wrapped in quotes.
    cur.execute("INSERT INTO websites (title, content, link) VALUES (%s, %s, %s)",
                (title, content, link))
    cur.connection.commit()
def getLinks(pageUrl):
    global pages
    try:
        # timeout keeps a hung connection from blocking forever
        html = urlopen("http://en.wikipedia.org" + pageUrl, timeout=10)
    except HTTPError:
        print("HTTP ERROR")
    except URLError:
        print("URL ERROR")
    except socket.timeout:
        print("SOCKET TIMEOUT")
    try:
        bsObj = BeautifulSoup(html, "html.parser")
    except UnboundLocalError:
        # urlopen above failed, so html was never assigned
        print("UnboundLocalError")
    try:
        title = bsObj.find("h1").get_text()
        content = bsObj.find("div", {"id": "mw-content-text"}).find("p").get_text()
        store(title, content, "http://en.wikipedia.org" + pageUrl)
    except AttributeError:
        print("Attribute Error: Page missing!")
    except UnboundLocalError:
        print("UnboundLocalError")
    except HTTPError as err:
        print("URL that generated the error code: ", pageUrl)
        print("Error code:", err.code)
        print("Error description:", err.reason)
    except Timeout:
        print("URL that generated the error code: ", pageUrl)
        print("Error description: Session timed out.")
    except ConnectionError:
        print("URL that generated the error code: ", pageUrl)
        print("Error description: Socket error timed out.")
    except RequestException as e:
        print(e)
    # Avoid crawling the same page twice.
    try:
        for link in bsObj.findAll("a", href=re.compile("^(/wiki/)")):
            if 'href' in link.attrs:
                if link.attrs['href'] not in pages:
                    # We have encountered a new page
                    newPage = link.attrs['href']
                    print("http://en.wikipedia.org" + newPage)
                    pages.add(newPage)
                    getLinks(newPage)
    except UnboundLocalError:
        print("UnboundLocalError")

getLinks("")
ERROR:
Traceback (most recent call last):
File "C:/Users/paula/PycharmProjects/BookTests/StoringData/ownWikipedia.py", line 83, in getLinks
getLinks(newPage)
File "C:/Users/paula/PycharmProjects/BookTests/StoringData/ownWikipedia.py", line 83, in getLinks
getLinks(newPage)
File "C:/Users/paula/PycharmProjects/BookTests/StoringData/ownWikipedia.py", line 83, in getLinks
getLinks(newPage)
[Previous line repeated 984 more times]
File "C:/Users/paula/PycharmProjects/BookTests/StoringData/ownWikipedia.py", line 40, in getLinks
bsObj = BeautifulSoup(html,"html.parser")
File "C:\Users\paula\PycharmProjects\BookTests\venv\lib\site-packages\bs4\__init__.py", line 225, in __init__
markup, from_encoding, exclude_encodings=exclude_encodings)):
File "C:\Users\paula\PycharmProjects\BookTests\venv\lib\site-packages\bs4\builder\_htmlparser.py", line 205, in prepare_markup
exclude_encodings=exclude_encodings)
File "C:\Users\paula\PycharmProjects\BookTests\venv\lib\site-packages\bs4\dammit.py", line 366, in __init__
for encoding in self.detector.encodings:
File "C:\Users\paula\PycharmProjects\BookTests\venv\lib\site-packages\bs4\dammit.py", line 264, in encodings
self.chardet_encoding = chardet_dammit(self.markup)
File "C:\Users\paula\PycharmProjects\BookTests\venv\lib\site-packages\bs4\dammit.py", line 34, in chardet_dammit
return chardet.detect(s)['encoding']
File "C:\Users\paula\PycharmProjects\BookTests\venv\lib\site-packages\chardet\__init__.py", line 38, in detect
detector.feed(byte_str)
File "C:\Users\paula\PycharmProjects\BookTests\venv\lib\site-packages\chardet\universaldetector.py", line 211, in feed
if prober.feed(byte_str) == ProbingState.FOUND_IT:
File "C:\Users\paula\PycharmProjects\BookTests\venv\lib\site-packages\chardet\charsetgroupprober.py", line 71, in feed
state = prober.feed(byte_str)
File "C:\Users\paula\PycharmProjects\BookTests\venv\lib\site-packages\chardet\hebrewprober.py", line 227, in feed
byte_str = self.filter_high_byte_only(byte_str)
File "C:\Users\paula\PycharmProjects\BookTests\venv\lib\site-packages\chardet\charsetprober.py", line 63, in filter_high_byte_only
buf = re.sub(b'([\x00-\x7F])+', b' ', buf)
File "C:\Users\paula\AppData\Local\Programs\Python\Python36-32\lib\re.py", line 191, in sub
return _compile(pattern, flags).sub(repl, string, count)
MemoryError
Process finished with exit code 1
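From the traceback, my suspicion is that the recursion itself is the problem: getLinks is nested roughly a thousand calls deep when the MemoryError fires, each live frame still holds its own bsObj parse tree, and the paths show I'm on 32-bit Python (Python36-32), so the process presumably runs out of address space long before the recursion limit of 80000 is reached. Would flattening the recursion into an explicit queue, something like the sketch below, be the right way to avoid this? This is only a rough sketch of what I have in mind, not tested code; crawl is my own name for the function, and store is the same function as in my code above.

from collections import deque
import re
from urllib.request import urlopen
from bs4 import BeautifulSoup

def crawl(start_url=""):
    # Explicit queue instead of recursion: only one page's parse tree is
    # alive at a time, and crawl depth no longer grows the call stack.
    seen = set()
    queue = deque([start_url])
    while queue:
        page_url = queue.popleft()
        try:
            html = urlopen("http://en.wikipedia.org" + page_url, timeout=10)
        except Exception as e:
            print("Fetch failed for", page_url, ":", e)
            continue
        bsObj = BeautifulSoup(html, "html.parser")
        try:
            title = bsObj.find("h1").get_text()
            content = bsObj.find("div", {"id": "mw-content-text"}).find("p").get_text()
            store(title, content, "http://en.wikipedia.org" + page_url)  # store() from above
        except AttributeError:
            print("Attribute Error: Page missing!")
        for link in bsObj.findAll("a", href=re.compile("^(/wiki/)")):
            href = link.attrs.get('href')
            if href and href not in seen:
                seen.add(href)
                queue.append(href)
        # bsObj goes out of scope on the next iteration, so its tree can be freed

crawl("")

I realize the seen set still grows with every discovered link, so for a really long crawl I'd probably have to track visited URLs in the database instead, but at least the parse trees and stack frames would stop piling up.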