Image download gets stuck on a link that no longer exists

Asked: 2016-10-05 19:43:54

Tags: python http urllib information-retrieval

I don't know how to handle this situation. The script handles most other broken links fine, but not this one:

import datetime
import praw
import re
import urllib
import requests
from bs4 import BeautifulSoup




sub = 'dog'
imgurUrlPattern = re.compile(r'(http://i.imgur.com/(.*))(\?.*)?')
r = praw.Reddit(user_agent="download all images from a subreddit",
                user_site="lamiastella")
already_done = []
#check_words = ['i.imgur.com', 'jpg', 'png']
check_words = ['jpg', 'jpeg', 'png']

subreddit = r.get_subreddit(sub)
for submission in subreddit.get_top_from_all(limit=10000):
#for submission in subreddit.get_hot(limit=10000):
    is_image = any(string in submission.url for string in check_words)
    print '[LOG] Getting url:  ' + submission.url
    if submission.id not in already_done and is_image:
        if submission.url.endswith('/'):
            modified_url = submission.url[:len(submission.url) - 1]
            try:
                urllib.urlretrieve(modified_url, '/home/jalal/computer_vision/image_retrieval/images/' + datetime.datetime.now().strftime('%y-%m-%d-%s') + modified_url[-5:])
            except Exception as e:
                print(e)
                continue
        else:
            try:
                urllib.urlretrieve(submission.url, '/home/jalal/computer_vision/image_retrieval/images/' + datetime.datetime.now().strftime('%y-%m-%d-%s') + submission.url[-5:])
            except Exception as e:
                print(e)
                continue

        already_done.append(submission.id)
        print '[LOG] Done Getting ' + submission.url
        print('{0}: {1}'.format('submission id is', submission.id))
    elif 'imgur.com' in submission.url and not (submission.url.endswith('gif')
                                                or submission.url.endswith('webm')
                                                or submission.url.endswith('mp4')
                                                or submission.url.endswith('all')
                                                or '#' in submission.url
                                                or '/a/' in submission.url):
        # This is an Imgur page with a single image.
        html_source = requests.get(submission.url).text  # download the image's page
        soup = BeautifulSoup(html_source, "lxml")
        image_url = soup.select('img')[0]['src']
        if image_url.startswith('//'):
            # if no schema is supplied in the url, prepend 'http:' to it
            image_url = 'http:' + image_url
        image_id = image_url[image_url.rfind('/') + 1:image_url.rfind('.')]
        urllib.urlretrieve(image_url, '/home/jalal/computer_vision/image_retrieval/images/' + 'imgur_' + datetime.datetime.now().strftime('%y-%m-%d-%s') + submission.url[-9:])
    elif 'instagram.com' in submission.url:
        html_source = requests.get(submission.url).text
        soup = BeautifulSoup(html_source, "lxml")
        instagram_url = soup.find('meta', {"property": "og:image"})['content']
        urllib.urlretrieve(instagram_url, '/home/jalal/computer_vision/image_retrieval/images/' + 'instagram_' + datetime.datetime.now().strftime('%y-%m-%d-%s') + '.jpg')
    else:
        continue

It gets stuck on the link http://cutearoo.com/wp-content/uploads/2011/04/Pomsky.png and I have to hit CTRL+C:

[LOG] Done Getting http://i.imgur.com/Vc9P9QC.jpg
submission id is: 1fv70j
[LOG] Getting url:  http://i.imgur.com/iOBi0qx.jpg
[LOG] Done Getting http://i.imgur.com/iOBi0qx.jpg
submission id is: 1dof3o
[LOG] Getting url:  http://cutearoo.com/wp-content/uploads/2011/04/Pomsky.png
^CTraceback (most recent call last):
  File "download_images.py", line 35, in <module>
    urllib.urlretrieve(submission.url, '/home/jalal/computer_vision/image_retrieval/images/' + datetime.datetime.now().strftime('%y-%m-%d-%s') + submission.url[-5:])
  File "/usr/lib/python2.7/urllib.py", line 98, in urlretrieve
    return opener.retrieve(url, filename, reporthook, data)
  File "/usr/lib/python2.7/urllib.py", line 245, in retrieve
    fp = self.open(url, data)
  File "/usr/lib/python2.7/urllib.py", line 213, in open
    return getattr(self, name)(url)
  File "/usr/lib/python2.7/urllib.py", line 350, in open_http
    h.endheaders(data)
  File "/usr/lib/python2.7/httplib.py", line 1053, in endheaders
    self._send_output(message_body)
  File "/usr/lib/python2.7/httplib.py", line 897, in _send_output
    self.send(msg)
  File "/usr/lib/python2.7/httplib.py", line 859, in send
    self.connect()
  File "/usr/lib/python2.7/httplib.py", line 836, in connect
    self.timeout, self.source_address)
  File "/usr/lib/python2.7/socket.py", line 566, in create_connection
    sock.connect(sa)
  File "/usr/lib/python2.7/socket.py", line 228, in meth
    return getattr(self._sock,name)(*args)
KeyboardInterrupt

The traceback shows the process blocked inside sock.connect(sa): with no timeout set, the connection to the dead host never returns. Please suggest a fix for this.

UPDATE: I used something like this:

image_file = urllib2.urlopen(modified_url)
with open('/home/jalal/computer_vision/image_retrieval/images/' + datetime.datetime.now().strftime('%y-%m-%d-%s') + modified_url[-5:], 'wb') as output_image:
    output_image.write(image_file.read())

and it still got stuck on that particular link, since urlopen without a timeout blocks in just the same way.

1 Answer:

Answer 0 (score: 1):

Use urlopen with the timeout parameter:

>>> import urllib2
>>> modified_url = 'http://cutearoo.com/wp-content/uploads/2011/04/Pomsky.png'
>>> try:
...     image_file = urllib2.urlopen(modified_url, timeout=5)
... except urllib2.URLError:
...     print 'could not download :('
...
could not download :(
>>>
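
Note that urllib.urlretrieve, which the question's main loop uses, accepts no timeout parameter at all. One option, sketched here under the assumption that a 5-second limit is acceptable, is to set a process-wide default socket timeout; urlretrieve picks it up because it opens ordinary sockets under the hood:

import socket
import urllib

# Every socket created from here on times out after 5 seconds
# instead of blocking indefinitely on an unresponsive host.
socket.setdefaulttimeout(5)

try:
    # urlretrieve now raises IOError on a timed-out connection
    # rather than hanging until CTRL+C.
    urllib.urlretrieve('http://cutearoo.com/wp-content/uploads/2011/04/Pomsky.png',
                       '/tmp/Pomsky.png')  # hypothetical destination path
except IOError as e:
    print(e)

With this in place, the existing except Exception handlers in the loop catch the timeout and move on to the next submission.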

The answer above is correct :) Just adding my own version based on it:

try:
    image_file = urllib2.urlopen(modified_url, timeout=5)
    with open('/home/jalal/computer_vision/image_retrieval/' + category + '/' + datetime.datetime.now().strftime('%y-%m-%d-%s') + modified_url[-5:], 'wb') as output_image:
        output_image.write(image_file.read())
except urllib2.URLError as e:
    print(e)
    continue
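
For completeness, the same guard can also be written with requests, which the script already imports. This is only a sketch: dest_path is a hypothetical destination and the 5-second timeout is an assumption:

import requests

dest_path = '/tmp/image.png'  # hypothetical destination path
try:
    # timeout covers connecting and each read; stream=True writes the
    # file in chunks instead of holding it all in memory.
    response = requests.get(modified_url, timeout=5, stream=True)
    response.raise_for_status()
    with open(dest_path, 'wb') as output_image:
        for chunk in response.iter_content(chunk_size=8192):
            output_image.write(chunk)
except requests.exceptions.RequestException as e:
    print(e)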