无效的网址引发异常 - python

时间:2011-06-20 16:25:00

标签: python web-crawler

import httplib
import urlparse

def getUrl(url):
    """Send a HEAD request for ``url`` and print whether it answered 200.

    Prints 'Awesome' when the server replies with status 200 and 'Error'
    for any other status.  Any ``Exception`` raised along the way (for
    example a name-resolution failure or a malformed port) is caught and
    printed instead of propagating.
    """
    conn = None
    try:
        parts = urlparse.urlsplit(url)
        # Use the parsed hostname/port rather than the raw netloc: when an
        # explicit port is passed, httplib does not split "host:8080", so
        # the whole netloc would be looked up as a host name.
        host = parts.hostname or ''
        port = parts.port or 80
        # Request the path together with its query string; fall back to
        # '/' so that a bare "?query" is never sent as the request target.
        target = parts[2] or '/'
        if parts[3]:
            target = '%s?%s' % (target, parts[3])

        conn = httplib.HTTPConnection(host, port)
        conn.connect()
        conn.putrequest('HEAD', target)
        conn.putheader('Accept', '*/*')
        conn.endheaders()
        response = conn.getresponse()
        if response.status != 200:
            print('Error')
        else:
            print('Awesome')
    except Exception as e:
        print(e)
    finally:
        # Release the socket on the failure paths too, not only on success.
        if conn is not None:
            conn.close()

我编写了上面的代码来检查给定的 URL 是否有效。但不知何故,在测试时,每个无效的 URL 都会触发异常(异常信息被直接打印出来),而不是输出 'Error'。

>>> getUrl('http://www.idlebrfffain.com')
[Errno -2] Name or service not known

Python版本:

chaitu@ubuntu:~$ python -V
Python 2.6.4

有人能帮我找出问题究竟出在哪里吗?

3 个答案:

答案 0 :(得分:2)

出现这种情况是正常的。抛出异常是因为该 URL 中的主机名无法解析。异常在执行到 if response.status != 200 这一行之前就已经抛出,因此控制权会直接转移到 except 块。

您需要花些时间了解异常(Exception)的工作原理。下面是一个可以尝试的例子。

def getUrl(url):
    """Skeleton: report the outcome even when the request itself fails.

    ``status`` starts as ``None`` and is only overwritten once the elided
    request code has produced a ``response``; if anything raises before
    that point, the report falls through to "Error".
    """
    status = None
    try:
        # do your normal stuff...
        status = response.status
    except Exception:
        # do whatever you want here...
        pass
    finally:
        # Runs whether or not the try body raised.
        print("Awesome" if status == 200 else "Error")

答案 1 :(得分:2)

你必须捕获 socket.error。

import httplib, socket
import urlparse

def getUrl(url):
    """Send a HEAD request for ``url`` and print how the attempt went.

    Prints one of:
      * "Invalid port in <netloc>" when the URL's port is not a number,
      * "Host <host> does not exist" when the host name cannot be resolved,
      * "Cannot connect to <host>:<port>." when the connection fails,
      * 'Awesome' for a 200 response, 'Error' for any other status.

    Only the connection attempt is guarded; exceptions raised while sending
    the request or reading the response propagate to the caller.
    """
    parts = urlparse.urlsplit(url)
    # hostname is None when the URL has no network location; fall back to
    # the empty string, which is what the raw netloc would be in that case.
    server = parts.hostname or ''
    try:
        # Honour an explicit port in the URL; default to 80 otherwise.
        port = parts.port or 80
    except ValueError:
        print("Invalid port in %s" % parts[1])
        return
    # Request the path together with its query string; fall back to '/'
    # so that a bare "?query" is never sent as the request target.
    path = parts[2] or '/'
    if parts[3]:
        path = '%s?%s' % (path, parts[3])

    obj = httplib.HTTPConnection(server, port)

    try:
        obj.connect()
    except socket.gaierror:
        print("Host %s does not exist" % server)
        return
    except socket.error:
        print("Cannot connect to %s:%s." % (server, port))
        return

    try:
        obj.putrequest('HEAD', path)
        obj.putheader('Accept', '*/*')
        obj.endheaders()
        response = obj.getresponse()
    finally:
        # Close the socket even if sending or reading the response raised.
        obj.close()

    if response.status != 200:
        print('Error')
    else:
        print('Awesome')


# Example calls, one per connection-failure branch handled by getUrl.
getUrl('http://www.idlebrfffain.com') # not a registered domain, so name resolution is expected to fail
getUrl('http://8.8.8.8') # not an HTTP server, so the connection attempt is expected to fail

只在特定的代码行周围使用 try: except:,并且只捕获你清楚其成因的异常。对于未捕获的异常,Python 会显示回溯(traceback),因此你可以轻松找到问题所在。

答案 2 :(得分:1)

# The following code validates a URL in two steps: first it validates the domain, then the path attached to that domain.
from urlparse import urlparse
import urllib2
import socket
class ValidateURL:
    """Check a URL in two steps: resolve its domain, then open the URL.

    Construct with the URL to check and call ``startActivity()`` to print
    the verdict.
    """

    def __init__(self, url):
        self._url = url

    def startActivity(self):
        """Run both checks on the stored URL and print the outcome.

        Prints "<netloc>  domain does not exist" when the host name does
        not resolve, "The path  <path>  is not valid" when the host
        resolves but the URL cannot be opened, and "<url>  is valid"
        otherwise.
        """
        self._parts = urlparse(self._url)
        # Resolve the bare host name: the raw netloc may carry a port or
        # user info, which would make the lookup fail for a valid host.
        host = self._parts.hostname or self._parts[1]
        # Each message is built as a single string so the output (including
        # the doubled spaces) is identical whether print is a statement or
        # a function.
        if not self._checkDomain(host):
            print('%s  domain does not exist' % (self._parts[1],))
        elif self._checkUrl(self._url) == 1:
            print('%s  is valid' % (self._url,))
        else:
            print('The path  %s  is not valid' % (self._parts[2],))

    def _checkDomain(self, domain):
        """Return 1 if ``domain`` resolves to an address, otherwise 0.

        Unexpected exceptions propagate; they are no longer swallowed by
        a ``return`` inside ``finally``.
        """
        try:
            socket.gethostbyname_ex(domain)
        except (socket.error, UnicodeError):
            # socket.gaierror and socket.herror are subclasses of
            # socket.error; UnicodeError covers names that cannot be
            # encoded for the lookup.
            return 0
        return 1

    def _checkUrl(self, url):
        """Return 1 if ``url`` can be opened, otherwise 0.

        A URL that cannot be fetched (``URLError``, which includes HTTP
        error statuses) or that is malformed (``ValueError``, e.g. a
        missing scheme) counts as invalid.  Other exceptions propagate.
        """
        try:
            self._req = urllib2.Request(url)
            response = urllib2.urlopen(self._req)
        except (urllib2.URLError, ValueError):
            return 0
        # Only reachability matters; release the connection right away.
        response.close()
        return 1

if __name__ == "__main__":
    # Demo run: the first URL omits the numeric question id and the second
    # includes it, so presumably the two exercise the "path is not valid"
    # and "is valid" outcomes respectively.
    for candidate in (
        'http://stackoverflow.com/questions/invalid-urls-throw-an-exception-python',
        'http://stackoverflow.com/questions/6414417/invalid-urls-throw-an-exception-python',
    ):
        ValidateURL(candidate).startActivity()

希望我给出的解决方案是合理的。