我是python的新手,我正在开发一个网络爬虫,下面是从给定网址获取链接的程序,但问题是我不希望它访问已经访问过的同一个网址。请帮帮我。
import re
import urllib.request
import sqlite3
# Open (or create) the crawl database; sqlite3.Row lets rows be read by
# column name (row['url']) as well as by index.
db = sqlite3.connect('test2.db')
db.row_factory = sqlite3.Row
# Every run starts from an empty link table.
db.execute('drop table if exists test')
# url is declared UNIQUE so that 'insert or ignore' really does skip a URL
# that is already stored. Without a uniqueness constraint no insert can ever
# conflict, so the same URL would be stored (and crawled) again and again.
db.execute('create table test(id INTEGER PRIMARY KEY,url text UNIQUE)')
#linksList = []
#module to vsit the given url and get the all links in that page
def get_links(urlparse):
    """Fetch the page at *urlparse* and store its not-yet-seen links in ``test``.

    Every ``<a href=...>`` target found on the page is resolved against
    *urlparse*, stripped of its ``#fragment`` and inserted into the ``test``
    table of the module-level ``db`` connection unless the same URL is
    already stored there.

    Returns:
        True when *urlparse* contains ``.msi`` (the URL is skipped without
        being fetched), otherwise None.

    Fetch errors are reported by printing "failed" and database errors by
    printing "insertion failed"; neither is raised to the caller.
    """
    # Local import: keeps this function self-contained without touching the
    # file's top-level import block.
    from urllib.parse import urldefrag, urljoin

    # Installer downloads are not HTML pages, so they are never fetched.
    if '.msi' in urlparse:
        return True

    try:
        htmlSource = urllib.request.urlopen(urlparse).read().decode("iso-8859-1")
    except Exception:
        # Best-effort crawl: one unreachable page must not stop the run.
        # Exception (not a bare except) so Ctrl-C still interrupts the crawl.
        print("failed")
        return None

    # Each match is the raw text between '<a href=' and the closing '>'.
    for link in re.findall('<a href=(.*?)>.*?</a>', htmlSource):
        try:
            # urljoin resolves '/about', 'about' and '//host/x' correctly and
            # leaves absolute http:// and https:// links alone.
            url = urljoin(urlparse, _extract_href(link))
        except ValueError:
            # Malformed link (e.g. a broken IPv6 host); skip just this one.
            continue
        # 'page#top' and 'page' are the same document; drop the fragment so
        # the page is not queued twice.
        url = urldefrag(url)[0]
        # mailto:, javascript:, ftp: ... cannot be crawled.
        if not url.startswith(('http://', 'https://')):
            continue
        # Skip links that point at source tarballs / download archives.
        if '.tar.bz' in url:
            continue
        try:
            # Explicit look-up so duplicates are skipped even when the url
            # column carries no UNIQUE constraint.
            already_stored = db.execute(
                'select 1 from test where url = ?', [url]).fetchone()
            if already_stored is None:
                db.execute('insert or ignore into test(url) values (?)', [url])
        except sqlite3.Error:
            print("insertion failed")
    return None


def _extract_href(attr_text):
    """Return the href value from the text that follows ``<a href=``."""
    attr_text = attr_text.strip()
    if attr_text[:1] in ('"', "'"):
        # Quoted value: runs up to the matching quote character.
        closing = attr_text.find(attr_text[0], 1)
        return attr_text[1:closing] if closing != -1 else attr_text[1:]
    # Unquoted value: ends at the first whitespace.
    parts = attr_text.split(None, 1)
    return parts[0] if parts else ''
# Seed the table with the links found on the start page.
get_links('http://www.python.org')

# Walk the stored links in id order, fetching one row per query.
# get_links() inserts into this same table on this same connection, and
# SQLite leaves it undefined whether a SELECT that is still being stepped
# sees rows inserted after it started. Re-querying for "the next id after
# the last one handled" makes the crawl order well defined and guarantees
# that links discovered along the way are picked up.
last_id = 0
while True:
    row = db.execute(
        'select id, url from test where id > ? order by id limit 1',
        (last_id,)).fetchone()
    if row is None:
        break  # every stored link has been visited
    last_id = row['id']
    print (row['id'],row['url'])
    try:
        get_links(row['url'])  # parse the stored link in turn
    except Exception:
        # Exception rather than a bare except, so Ctrl-C can stop the crawl.
        print ("url error")
请告诉我如何解决问题的方法。
答案 0 :(得分:1)
您应该维护一个“已访问”页面列表。在请求下一个URL之前,先检查该列表是否已包含这个URL;如果已包含,就跳过它。我不是python程序员,所以这里给出一些伪代码(pseudo-code):
Create listOfVisitedUrls
...
Start Loop
    Get nextUrl
    If nextUrl IsNotIn listOfVisitedUrls Then
        Request nextUrl
        Add nextUrl to listOfVisitedUrls
    End If
Loop
答案 1 :(得分:0)
您可以使用以下代码:
import re
from urllib import urlopen
# Since few href may return only /contact or /about, concatenate to baseurl.
def concat(url, baseurl):
    """Return *url* as an absolute URL, prefixing *baseurl* when needed.

    A link that already starts with ``http://`` or ``https://`` is returned
    unchanged. Anything else (``/contact``, ``#top``, ...) is appended to
    *baseurl* by plain string concatenation.
    """
    # startswith() states the intent directly. The earlier
    # ``if url.find('http://'):`` only worked because find() returns 0
    # (falsy) for a match at position 0 and -1 (truthy) for no match, and it
    # glued https:// links onto baseurl.
    if url.startswith(('http://', 'https://')):
        return url
    return baseurl + url
def get_links(baseurl):
    """Return the set of unique link URLs found on the page at *baseurl*.

    The page is fetched, every ``<a href=...>`` target is extracted and
    passed through ``concat(url, baseurl)``, and the results are collected
    in a set so each URL appears once.

    Returns:
        A set of URL strings. If fetching or parsing fails, "failed" is
        printed and the URLs collected so far (possibly none) are returned.
        When *baseurl* contains ``.msi`` the page is not fetched and True is
        returned instead of a set (kept for backward compatibility).
    """
    # Installer downloads are not HTML pages; skip them without fetching.
    if baseurl.find('.msi') != -1:
        return True

    resulting_urls = set()
    try:
        htmlSource = urlopen(baseurl).read()
        htmlSource = htmlSource.decode("iso-8859-1")
        # Each match is the raw text between '<a href=' and the closing '>'.
        for link in re.findall('<a href=(.*?)>.*?</a>', htmlSource):
            resulting_urls.add(concat(_href_value(link), baseurl))
    except Exception:
        # Best-effort: report and return what was collected. Exception (not a
        # bare except) so KeyboardInterrupt / SystemExit are not swallowed.
        print("failed")
    return resulting_urls


def _href_value(attr_text):
    """Return the href value from the text that follows ``<a href=``."""
    attr_text = attr_text.strip()
    if attr_text[:1] in ('"', "'"):
        # Quoted value, with either quote character: runs up to the matching
        # quote. Looking only for '"' turned href='/psf/' into "'/psf/".
        closing = attr_text.find(attr_text[0], 1)
        return attr_text[1:closing] if closing != -1 else attr_text[1:]
    # Unquoted value: ends at the first whitespace.
    parts = attr_text.split(None, 1)
    return parts[0] if parts else ''
# Example call: collect the links found on the python.org home page.
get_links('http://www.python.org')
它会返回一个set(),其中包含在baseurl页面上找到的唯一网址;对于`http://www.python.org`,你应该得到:
set([u'http://www.python.org/download/',
u'http://docs.python.org/',
u'http://www.python.org#left-hand-navigation',
u'http://wiki.python.org/moin/PyQt',
u'http://wiki.python.org/moin/DatabaseProgramming/',
u'http://roundup.sourceforge.net/',
u'http://www.python.org/ftp/python/3.2.3/Python-3.2.3.tar.bz2',
u'http://www.python.org/about/website',
u'http://www.python.org/about/quotes',
u'http://www.python.org/community/jobs/',
u'http://www.python.org/psf/donations/',
u'http://www.python.org/about/help/',
u'http://wiki.python.org/moin/CgiScripts',
u'http://www.zope.org/',
u'http://www.pygame.org/news.html',
u'http://pypi.python.org/pypi',
u'http://wiki.python.org/moin/Python2orPython3',
u'http://www.python.org/download/releases/2.7.3/',
u'http://www.python.org/ftp/python/3.2.3/python-3.2.3.msi',
u'http://www.python.org/community/',
u'http://www.python.org/ftp/python/2.7.3/Python-2.7.3.tar.bz2',
u'http://wiki.python.org/moin/WebProgramming',
u'http://www.openbookproject.net/pybiblio/',
u'http://twistedmatrix.com/trac/',
u'http://wiki.python.org/moin/IntegratedDevelopmentEnvironments',
u'http://www.pentangle.net/python/handbook/',
u'http://wiki.python.org/moin/TkInter',
u'http://www.vrplumber.com/py3d.py',
u'http://sourceforge.net/projects/mysql-python',
u'http://wiki.python.org/moin/GuiProgramming',
u'http://www.python.org/about/',
u'http://www.edgewall.com/trac/',
u'http://osl.iu.edu/~lums/swc/',
u'http://www.python.org/community/merchandise/',
u"http://www.python.org'/psf/",
u'http://wiki.python.org/moin/WxPython',
u'http://docs.python.org/3.2/',
u'http://www.python.org#content-body',
u'http://www.python.org/getit/',
u'http://www.python.org/news/',
u'http://www.python.org/search',
u'http://www.python.org/community/sigs/current/edu-sig',
u'http://www.python.org/about/legal',
u'http://www.timparkin.co.uk/',
u'http://www.python.org/about/apps',
u'http://www.turbogears.org/',
u'http://www.egenix.com/files/python/mxODBC.html',
u'http://docs.python.org/devguide/',
u'http://docs.python.org/howto/sockets.html',
u'http://www.djangoproject.com/',
u'http://buildbot.net/trac',
u'http://www.python.org/psf/',
u'http://www.python.org/doc/',
u'http://wiki.python.org/moin/Languages',
u'http://www.xs4all.com/',
u'http://www.python.org/',
u'http://wiki.python.org/moin/NumericAndScientific',
u'http://www.python.org/channews.rdf',
u'http://www.alobbs.com/pykyra',
u'http://wiki.python.org/moin/PythonXml',
u'http://wiki.python.org/moin/PyGtk',
u'http://www.python.org/ftp/python/2.7.3/python-2.7.3.msi',
u'http://www.python.org/download/releases/3.2.3/',
u'http://www.python.org/3kpoll'])
希望有所帮助。