Despite my best efforts over the past few hours, I cannot get the crawler (named searchengine.py) to run. It does not seem to index pages successfully. I am including the full crawler code below. The kind of error I receive looks like this:
Indexing http://www.4futureengineers.com/company.html
Could not parse page http://www.4futureengineers.com/company.html
I invoke searchengine.py by entering the following commands in a Python interactive session (shell):
>>> import searchengine
>>> crawler=searchengine.crawler('searchindex.db')
>>> pages= \
... ['http://www.4futureengineers.com/company.html']
>>> crawler.crawl(pages)
The error occurs at the last command, crawler.crawl(pages).
Here is the complete source code of searchengine.py:

import urllib2
from BeautifulSoup import *
from urlparse import urljoin
from pysqlite2 import dbapi2 as sqlite
# Create a list of words to ignore
ignorewords={'the':1,'of':1,'to':1,'and':1,'a':1,'in':1,'is':1,'it':1}
class crawler:
    # Initialize the crawler with the name of database
    def __init__(self,dbname):
        self.con=sqlite.connect(dbname)

    def __del__(self):
        self.con.close()

    def dbcommit(self):
        self.con.commit()

    # Auxilliary function for getting an entry id and adding
    # it if it's not present
    def getentryid(self,table,field,value,createnew=True):
        cur=self.con.execute(
            "select rowid from %s where %s='%s'" % (table,field,value))
        res=cur.fetchone()
        if res==None:
            cur=self.con.execute(
                "insert into %s (%s) values ('%s')" % (table,field,value))
            return cur.lastrowid
        else:
            return res[0]

    # Index an individual page
    def addtoindex(self,url,soup):
        if self.isindexed(url): return
        print 'Indexing '+url
        # Get the individual words
        text=self.gettextonly(soup)
        words=self.separatewords(text)
        # Get the URL id
        urlid=self.getentryid('urllist','url',url)
        # Link each word to this url
        for i in range(len(words)):
            word=words[i]
            if word in ignorewords: continue
            wordid=self.getentryid('wordlist','word',word)
            self.con.execute("insert into wordlocation(urlid,wordid,location) values (%d,%d,%d)" % (urlid,wordid,i))

    # Extract the text from an HTML page (no tags)
    def gettextonly(self,soup):
        v=soup.string
        if v==Null:
            c=soup.contents
            resulttext=''
            for t in c:
                subtext=self.gettextonly(t)
                resulttext+=subtext+'\n'
            return resulttext
        else:
            return v.strip()

    # Seperate the words by any non-whitespace character
    def separatewords(self,text):
        splitter=re.compile('\\W*')
        return [s.lower() for s in splitter.split(text) if s!='']

    def isindexed(self,url):
        u=self.con.execute \
            ("select rowid from urllist where url='%s'" % url).fetchone()
        if u!=None:
            #Check if it has actually been crawled
            v=self.con.execute(
                'select * from wordlocation where urlid=%d' % u[0]).fetchone()
            if v!=None: return True
        return False

    def crawl(self,pages,depth=2):
        for i in range(depth):
            newpages={}
            for page in pages:
                try:
                    c=urllib2.urlopen(page)
                except:
                    print "Could not open %s" % page
                    continue
                try:
                    soup=BeautifulSoup(c.read())
                    self.addtoindex(page,soup)
                    links=soup('a')
                    for link in links:
                        if ('href' in dict(link.attrs)):
                            url=urljoin(page,link['href'])
                            if url.find("'")!=-1: continue
                            url=url.split('#')[0] # remove location portion
                            if url[0:4]=='http' and not self.isindexed(url):
                                newpages[url]=1
                            linkText=self.gettextonly(link)
                            self.addlinkref(page,url,linkText)
                    self.dbcommit()
                except:
                    print "Could not parse page %s" % page
            pages=newpages

    # Create the database tables
    def createindextables(self):
        self.con.execute('create table urllist(url)')
        self.con.execute('create table wordlist(word)')
        self.con.execute('create table wordlocation(urlid,wordid,location)')
        self.con.execute('create table link(fromid integer,toid integer)')
        self.con.execute('create table linkwords(wordid,linkid)')
        self.con.execute('create index wordidx on wordlist(word)')
        self.con.execute('create index urlidx on urllist(url)')
        self.con.execute('create index wordurlidx on wordlocation(wordid)')
        self.con.execute('create index urltoidx on link(toid)')
        self.con.execute('create index urlfromidx on link(fromid)')
        self.dbcommit()
Answer (score: 1)
The error handling in crawl is making this very difficult to debug:
try:
    # too much stuff here
except:  # bare except
    print "Could not parse page %s" % page  # generic message
While this is very robust (i.e. the program keeps running no matter what goes wrong), it makes it impossible to work out what actually went wrong; all you know is that one of the thirteen lines inside the try block failed somehow. Refactor this part of the code into shorter try blocks and test for specific errors (see "the evils of except"), for example along the lines of the sketch below.
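For illustration only, here is a minimal sketch (my own rearrangement, not code from the book) of how the body of the for page in pages: loop could be split so that each risky step gets its own narrow try block with a specific exception; the exception choices are assumptions and not exhaustive:

    # Sketch: narrower error handling inside crawl's "for page in pages:" loop.
    # Assumes Python 2 with urllib2/BeautifulSoup as in the question.
    try:
        c = urllib2.urlopen(page)
    except urllib2.URLError, e:      # network/HTTP problems only
        print "Could not open %s: %s" % (page, e)
        continue

    try:
        soup = BeautifulSoup(c.read())
    except Exception, e:             # still broad, but now covers only parsing
        print "Could not parse page %s: %s" % (page, e)
        continue

    # addtoindex and the link-extraction code stay outside any try block,
    # so a bug there now produces a full traceback instead of being swallowed.
    self.addtoindex(page, soup)

With the indexing calls outside the try, a bug in addtoindex or gettextonly surfaces as a normal traceback rather than as "Could not parse page".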
You could also try running without any error handling at all: comment out the try:, except: and print ... lines, dedent the lines currently inside the try block, and read the specific error traceback that appears. That will tell you exactly what is failing, and you can add appropriate error handling back in later.
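If you would rather keep the current structure while you investigate, another option (my suggestion, not something from the book) is to temporarily print the full traceback inside the existing bare except, so the generic message is accompanied by the real error and line number:

    import traceback

    try:
        soup = BeautifulSoup(c.read())
        self.addtoindex(page, soup)
        # ... rest of the original try block unchanged ...
    except:
        print "Could not parse page %s" % page
        # Temporary debugging aid: show which exception was raised and where.
        traceback.print_exc()

Once the underlying exception is visible you can decide which specific exception types, if any, are worth catching permanently.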