在Python中下载多个文件时出错

时间:2012-12-08 13:00:43

标签: python networking download urllib2

有谁能告诉我,我做错了什么?我一直在使用此代码收到错误。

我正试图从 primaryschoolgames 下载所有swf,就像一个实验,但我似乎无法做到:

#!/usr/bin/env python
# encoding: utf-8

import sys, getopt
import os, urllib, urllib2, re, string, math

help_message = '''
'''
no_param = '''
'''

verbose = False
fakeMode = False
curPath = os.getcwd() + "/"

urlRegex = ''
FileRegex = ''
outputPath = ''
currentFile = ''

def removeDuplicates(seq):
# Not order preserving
keys = {}
for e in seq:
    keys[e] = 1
return keys.keys()

def go(filename):
print "Having a look at " + string.capwords(filename)

global urlRegex, FileRegex, outputPath, currentFile

url = 'http://cdn.primarygames.com' + filename

urlRegex = '/'+filename+'/.+/download'
FileRegex = '/'+filename+'/(.*?)/download'
outputPath = curPath+"Swfs"+"/"

if not os.path.exists(outputPath):
    os.makedirs(outputPath)

filelist = []

while(len(url)):
    # looping system
    newlist, url = scrapePage(url, filename)
    filelist.extend(newlist)

print 'Found %s Files.' % len(filelist)

for swf in filelist:
    swfurl = swf['url']
    name = swf['name']
    currentFile = name
    #print 'Downloading '+name,
    if not fakeMode:
        #print ''
        urllib.urlretrieve('http://cdn.primarygames.com' + swfurl, outputPath+name)
    else:
        print 'Not downloading %s.' % name
print "All done with %s!" % filename

def scrapePage(url, filename):
print 'Looking through '+url
html = urllib2.urlopen(url).read()
swflist = re.findall(urlRegex, html)
swflist = removeDuplicates(swflist)

swfs = []

for swfurl in swflist:
    r = re.compile(FileRegex)
    swfname = r.search(swfurl).group(1)
    swfname = swfname.replace('-', ' ')
    name = filename + "/" + swfname + ".swf"
    name = string.capwords(name)
    swf.append({'name':name,'url':swfurl})

r = re.compile(nextRegex)
result = r.search(html)
if result:
    nextUrl = 'http://cdn.primarygames.com' + result.group(1)
else:
    nextUrl = ''

return swfs, nextUrl


def main(argv=None):
global verbose, fakeMode

if argv is None:
    argv = sys.argv
try:
    try:
        opts, args = getopt.getopt(argv[1:], "ho:vf", ["help", "output="])
    except getopt.error, msg:
        raise Usage(msg)

    # option processing
    for option, value in opts:
        if option == "-v":
            verbose = True
        if option in ("-f", "--fake"):
            fakeMode = True
        if option in ("-h", "--help"):
            raise Usage(help_message)
        if option in ("-o", "--output"):
            output = value

    if len(args):
        swfs = args
    else:
        raise Usage(no_param)

except Usage, err:
    print >> sys.stderr, sys.argv[0].split("/")[-1] + ": " + str(err.msg)
    if err.msg != help_message:
        print >> sys.stderr, "\t for help use --help"
    return 2

for swf in swfs:
    go(swf)


if __name__ == "__main__":
sys.exit(main())

这是我不断得到的错误:

Having a look at *
Looking through http://cdn.primarygames.com/*
Traceback (most recent call last):
File "C:\PrimarySchoolGames Swf Downloader.py"
, line 129, in <module>
sys.exit(main())
File "C:\PrimarySchoolGames Swf Downloader.py"
, line 125, in main
go(swf)
File "C:\PrimarySchoolGames Swf Downloader.py"
, line 48, in go
newlist, url = scrapePage(url, filename)
File "C:\Users\Terrii\Desktop\VB Extra's\PrimarySchoolGames Swf Downloader.py"
, line 67, in scrapePage
html = urllib2.urlopen(url).read()
File "C:\Python27\lib\urllib2.py", line 126, in urlopen
return _opener.open(url, data, timeout)
File "C:\Python27\lib\urllib2.py", line 400, in open
response = self._open(req, data)
File "C:\Python27\lib\urllib2.py", line 418, in _open
'_open', req)
File "C:\Python27\lib\urllib2.py", line 378, in _call_chain
result = func(*args)
File "C:\Python27\lib\urllib2.py", line 1207, in http_open
return self.do_open(httplib.HTTPConnection, req)
File "C:\Python27\lib\urllib2.py", line 1177, in do_open
raise URLError(err)
urllib2.URLError: <urlopen error [Errno 11004] getaddrinfo failed>

1 个答案:

答案 0 :(得分:1)

失败的getaddrinfo通常表示您提供的网址存在问题。既然我能够解析地址你确定你不在代理服务器后面吗?这可能导致DNS查找失败,从而导致此消息。

Python如何确定在Windows上使用哪个代理:

  

在Windows环境中,如果未设置代理环境变量,   代理设置是从注册表的Internet设置获得的   部分。

为了获得更多帮助,我同意@MikeHunter。我试图修复你的代码,但由于我必须实现你的Exception-Class以使代码运行,我认为你应该重新缩进代码并提供更多信息。遗憾。