Need help debugging a Python web crawler

Asked: 2014-01-24 10:36:55

Tags: python beautifulsoup web-crawler

Despite my best efforts over the past few hours, I can't get my crawler (named searchengine.py) to run. It never manages to index a page successfully. I'm including the complete crawler code below. The error I keep getting looks like this:

Indexing http://www.4futureengineers.com/company.html
Could not parse page http://www.4futureengineers.com/company.html

I invoke searchengine.py by typing the following commands in an interactive Python session (shell):

>>> import searchengine
>>> crawler=searchengine.crawler('searchindex.db')
>>> pages= \
... ['http://www.4futureengineers.com/company.html']
>>> crawler.crawl(pages)

It reports the parsing failure immediately after the crawler.crawl(pages) command.

Here is the complete source code of searchengine.py:

import urllib2
from BeautifulSoup import *
from urlparse import urljoin
from pysqlite2 import dbapi2 as sqlite


# Create a list of words to ignore
ignorewords={'the':1,'of':1,'to':1,'and':1,'a':1,'in':1,'is':1,'it':1}


class crawler:
  # Initialize the crawler with the name of the database
  def __init__(self,dbname):
    self.con=sqlite.connect(dbname)

  def __del__(self):
    self.con.close()

  def dbcommit(self):
    self.con.commit()


  # Auxiliary function for getting an entry id and adding
  # it if it's not present
  def getentryid(self,table,field,value,createnew=True):
    cur=self.con.execute(
    "select rowid from %s where %s='%s'" % (table,field,value))
    res=cur.fetchone()
    if res==None:
      cur=self.con.execute(
      "insert into %s (%s) values ('%s')" % (table,field,value))
      return cur.lastrowid
    else:
      return res[0]


  # Index an individual page
  def addtoindex(self,url,soup):
    if self.isindexed(url): return
    print 'Indexing '+url

    # Get the individual words
    text=self.gettextonly(soup)
    words=self.separatewords(text)

    # Get the URL id
    urlid=self.getentryid('urllist','url',url)

    # Link each word to this url
    for i in range(len(words)):
      word=words[i]
      if word in ignorewords: continue
      wordid=self.getentryid('wordlist','word',word)
      self.con.execute("insert into wordlocation(urlid,wordid,location) values (%d,%d,%d)" % (urlid,wordid,i))


  # Extract the text from an HTML page (no tags)
  def gettextonly(self,soup):
    v=soup.string
    if v==Null:   
      c=soup.contents
      resulttext=''
      for t in c:
        subtext=self.gettextonly(t)
        resulttext+=subtext+'\n'
      return resulttext
    else:
      return v.strip()

  # Separate the words on any non-word character
  def separatewords(self,text):
    splitter=re.compile('\\W*')
    return [s.lower() for s in splitter.split(text) if s!='']



  def isindexed(self,url):
    u=self.con.execute \
      ("select rowid from urllist where url='%s'" % url).fetchone()
    if u!=None:
      #Check if it has actually been crawled
      v=self.con.execute(
      'select * from wordlocation where urlid=%d' % u[0]).fetchone()
      if v!=None: return True
    return False



  def crawl(self,pages,depth=2):
    for i in range(depth):
      newpages={}
      for page in pages:
        try:
          c=urllib2.urlopen(page)
        except:
          print "Could not open %s" % page
          continue

        try:
          soup=BeautifulSoup(c.read())
          self.addtoindex(page,soup)

          links=soup('a')
          for link in links:
            if ('href' in dict(link.attrs)):
              url=urljoin(page,link['href'])
              if url.find("'")!=-1: continue
              url=url.split('#')[0]  # remove location portion
              if url[0:4]=='http' and not self.isindexed(url):
                newpages[url]=1
              linkText=self.gettextonly(link)
              self.addlinkref(page,url,linkText)

          self.dbcommit()
        except:
          print "Could not parse page %s" % page


      pages=newpages



  # Create the database tables
  def createindextables(self): 
    self.con.execute('create table urllist(url)')
    self.con.execute('create table wordlist(word)')
    self.con.execute('create table wordlocation(urlid,wordid,location)')
    self.con.execute('create table link(fromid integer,toid integer)')
    self.con.execute('create table linkwords(wordid,linkid)')
    self.con.execute('create index wordidx on wordlist(word)')
    self.con.execute('create index urlidx on urllist(url)')
    self.con.execute('create index wordurlidx on wordlocation(wordid)')
    self.con.execute('create index urltoidx on link(toid)')
    self.con.execute('create index urlfromidx on link(fromid)')
    self.dbcommit()

1 Answer:

Answer 0 (score: 1)

The error handling in crawl makes debugging very difficult:

try:
    # too much stuff here
except: # bare except
    print "Could not parse page %s" % page # generic message

While this is very robust (i.e. the program keeps running no matter what goes wrong), it makes it impossible to figure out what actually went wrong; all you know is that one of the 13 lines in the try block raised something. Refactor this part of the code with shorter try blocks and test for specific errors (see "the evils of except").
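For example, crawl could be split so that each failure mode gets its own narrow handler, and everything else is left to fail loudly. A rough sketch under the question's setup (Python 2, BeautifulSoup 3); the HTMLParseError import is my assumption about how BeautifulSoup 3 reports bad markup, so treat it as illustrative rather than definitive:

# at the top of searchengine.py:
from HTMLParser import HTMLParseError

# inside class crawler:
  def crawl(self,pages,depth=2):
    for i in range(depth):
      newpages={}
      for page in pages:
        try:
          c=urllib2.urlopen(page)
        except urllib2.URLError as e:     # network/HTTP failures only
          print "Could not open %s: %s" % (page,e)
          continue

        try:
          soup=BeautifulSoup(c.read())
        except HTMLParseError as e:       # genuine parse failures only
          print "Could not parse page %s: %s" % (page,e)
          continue

        # No try/except around indexing and link extraction: if any of
        # this breaks, the traceback now points at the real line.
        self.addtoindex(page,soup)
        for link in soup('a'):
          if ('href' in dict(link.attrs)):
            url=urljoin(page,link['href'])
            if url.find("'")!=-1: continue
            url=url.split('#')[0]  # remove location portion
            if url[0:4]=='http' and not self.isindexed(url):
              newpages[url]=1
            linkText=self.gettextonly(link)
            self.addlinkref(page,url,linkText)
        self.dbcommit()

      pages=newpages

This keeps the "keep crawling past bad pages" behaviour for the two failures you actually expect, while any other exception stops the program with a full traceback.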

You could also try running without any error handling at all: comment out the try:, except:, and print ... lines, dedent the lines currently inside the try block, and read the specific error traceback. In the code as posted, for instance, gettextonly compares v against Null, which Python does not define, and separatewords calls re.compile without ever importing re; either would raise a NameError that the bare except currently reduces to "Could not parse page". Once you understand the failure, you can put proper, specific error handling back in.
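For what it's worth, once the tracebacks are visible the fixes are one-liners. A sketch of the two affected methods with those NameErrors corrected (same Python 2 / BeautifulSoup 3 setup as the question):

# at the top of searchengine.py:
import re                      # the posted code calls re.compile but never imports re

# inside class crawler:
  def gettextonly(self,soup):
    v=soup.string
    if v==None:                # was 'v==Null'; Python defines None, not Null
      c=soup.contents
      resulttext=''
      for t in c:
        subtext=self.gettextonly(t)
        resulttext+=subtext+'\n'
      return resulttext
    else:
      return v.strip()

  def separatewords(self,text):
    splitter=re.compile(r'\W*')  # raw string, same pattern as '\\W*'
    return [s.lower() for s in splitter.split(text) if s!='']

Note that crawl also calls self.addlinkref, which is never defined in the posted class, so with the bare except in place that AttributeError would be the next error silently reported as "Could not parse page". Define the method (even a stub that just passes) or drop the call.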