无法使用urllib.urlopen获取以“https://”开头的Url

时间:2015-10-02 07:34:20

标签: python beautifulsoup urllib

我的程序可以处理以 http:// 开头的网址,但当与正则表达式匹配的网址以 https:// 开头时,程序会报错:Name or service not known。urllib 能否打开 https:// 网址?如果可以,应该怎么做?如果不行,我该如何修改代码,应该改用什么?

import sys
import urllib
import urlparse
import re
from bs4 import BeautifulSoup
def process(url):
    proxies = {"http":"http://proxy4.nehu.ac.in:3128"}
    page  = urllib.urlopen(url)
    text = page.read()
    page.close()
    soup = BeautifulSoup(text)
    file=open('s.txt','w') 
    for tag in soup.findAll('a', href=True):
        tag['href'] = urlparse.urljoin(url, tag['href'])
        print tag['href']
        file.write('\n')
        file.write(tag['href'])
        file.close()
        file=open('s.txt','r')
    for line in file:
        if re.match(ur'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?\xab\xbb\u201c\u201d\u2018\u2019]))',line):
        print line
        page = urllib.urlopen(line)
        text = page.read()
        page.close()
        soup = BeautifulSoup(text)
    with open('newfile.txt','a') as file2:
            file2.write('\n ----------------- \n')
            file2.write(line)
            file2.write('\n ----------------- \n')
            for tag in soup.findAll('a', href=True):
                tag['href'] = urlparse.urljoin(line, tag['href'])
                print tag['href']

                file2.write('\n')
                file2.write(tag['href'])

       file.close()


       def main():
           if len(sys.argv) == 1:
              print 'No url !!'      
              sys.exit(1)
           for url in sys.argv[1:]:
              process(url)

      main()

这是我的txt文件中的网址:

https://support.makemytrip.com/MyAccount/MyTripReward/DashBoard

显示此错误:

Traceback (most recent call last):
File "myurl.py", line 67, in <module>
main()
File "myurl.py", line 65, in main
process(url)
File "myurl.py", line 40, in process
page = urllib.urlopen(line)
File "/usr/lib/python2.7/urllib.py", line 84, in urlopen
return opener.open(url)
File "/usr/lib/python2.7/urllib.py", line 205, in open
return getattr(self, name)(url)
File "/usr/lib/python2.7/urllib.py", line 342, in open_http
h.endheaders(data)
File "/usr/lib/python2.7/httplib.py", line 940, in endheaders
self._send_output(message_body)
File "/usr/lib/python2.7/httplib.py", line 803, in _send_output
self.send(msg)
File "/usr/lib/python2.7/httplib.py", line 755, in send
self.connect()
File "/usr/lib/python2.7/httplib.py", line 736, in connect
self.timeout, self.source_address)
File "/usr/lib/python2.7/socket.py", line 551, in create_connection
for res in getaddrinfo(host, port, 0, SOCK_STREAM):
IOError: [Errno socket error] [Errno -2] Name or service not known

0 个答案:

没有答案