Question

我无法用脚本自动下载 YouTube 视频。下面是代码,问题出在最后一部分,即这一行:download = urllib.request.urlopen(download_url).read()

    # Youtube video download script
    # 10n1z3d[at]w[dot]cn
    #
    # Usage: pass a watch URL (www.youtube.com/watch?v=<id>) as the first
    # command-line argument, or type it at the prompt. The video is written to
    # "<title>.mp4" in the current working directory.

    import urllib.error
    import urllib.request
    import sys

    print("\n--------------------------")
    print(" Youtube Video Downloader")
    print("--------------------------\n")

    # Prompt only when no URL was given on the command line.
    try:
        video_url = sys.argv[1]
    except IndexError:
        video_url = input('[+] Enter video URL: ')

    print("[+] Connecting...")
    try:
        video_id = video_url.split('www.youtube.com/watch?v=')[1]
    except IndexError:
        print("[-] Invalid URL.")
        sys.exit(1)
    # Drop a trailing "&feature=..." marker so only the bare id remains.
    for feature_suffix in ('&feature=related', '&feature=dir',
                           '&feature=fvst', '&feature=channel_page'):
        if video_url.endswith(feature_suffix):
            video_id = video_id.split(feature_suffix)[0]
            break

    print("[+] Parsing token...")
    try:
        info = str(urllib.request.urlopen('http://www.youtube.com/get_video_info?&video_id=' + video_id).read())
        token_value = info.split('video_id=' + video_id + '&token=')[1].split('&thumbnail_url')[0]
    except (urllib.error.URLError, IndexError) as err:
        # Report why the token could not be obtained instead of exiting
        # silently (HTTPError is a subclass of URLError).
        print("[-] Could not obtain the video token: " + str(err))
        sys.exit(1)
    download_url = "http://www.youtube.com/get_video?video_id=" + video_id + "&t=" + token_value + "&fmt=18"

    # The page fetch needs a scheme; add one only when the user left it out.
    if video_url.startswith(('http://', 'https://')):
        page_url = video_url
    else:
        page_url = 'http://' + video_url
    try:
        v_url = str(urllib.request.urlopen(page_url).read())
        video_title = v_url.split('"rv.2.title": "')[1].split('", "rv.4.rating"')[0]
    except (urllib.error.URLError, IndexError):
        # The title is only used for display and the file name, so fall back
        # to the video id rather than aborting the download.
        video_title = video_id
    # Both entities may occur in the same title, so unescape them
    # unconditionally (replace() is a no-op when the entity is absent).
    video_title = video_title.replace('&quot;', '"').replace('&amp;', '&')
    # Strip characters that are not allowed in Windows file names.
    file_name = ''.join(ch for ch in video_title if ch not in '\\/:*?"<>|') + '.mp4'

    print("[+] Downloading " + '"' + video_title + '"...')
    try:
        print(download_url)
        download = urllib.request.urlopen(download_url).read()
        # read() returns a single bytes object: write it in one call and let
        # the context manager close the file exactly once.
        with open(file_name, 'wb') as out_file:
            out_file.write(download)
    except (urllib.error.URLError, OSError) as err:
        print("[-] Error downloading. Quitting.")
        print("[-] " + str(err))
        sys.exit(1)

    print("\n[+] Done. The video is saved to the current working directory(cwd).\n")

运行时出现如下错误消息(感谢 Wooble):

Traceback (most recent call last):
  File "C:/Python31/MyLib/DrawingBoard/youtube_download-.py", line 52, in <module>
    download = urllib.request.urlopen(download_url).read()
  File "C:\Python31\lib\urllib\request.py", line 119, in urlopen
    return _opener.open(url, data, timeout)
  File "C:\Python31\lib\urllib\request.py", line 353, in open
    response = meth(req, response)
  File "C:\Python31\lib\urllib\request.py", line 465, in http_response
    'http', request, response, code, msg, hdrs)
  File "C:\Python31\lib\urllib\request.py", line 385, in error
    result = self._call_chain(*args)
  File "C:\Python31\lib\urllib\request.py", line 325, in _call_chain
    result = func(*args)
  File "C:\Python31\lib\urllib\request.py", line 560, in http_error_302
    return self.parent.open(new, timeout=req.timeout)
  File "C:\Python31\lib\urllib\request.py", line 353, in open
    response = meth(req, response)
  File "C:\Python31\lib\urllib\request.py", line 465, in http_response
    'http', request, response, code, msg, hdrs)
  File "C:\Python31\lib\urllib\request.py", line 391, in error
    return self._call_chain(*args)
  File "C:\Python31\lib\urllib\request.py", line 325, in _call_chain
    result = func(*args)
  File "C:\Python31\lib\urllib\request.py", line 473, in http_error_default
    raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 403: Forbidden

Answer 1

原始问题中的代码依赖于对 YouTube 页面和 URL 内容的若干假设(体现为 url.split('something=')[1] 这类写法),而这些假设未必总是成立。我测试过,它甚至可能取决于页面上显示了哪些相关视频。你很可能正好碰上了这些特殊情况。

下面是一个更干净的版本,它使用 urllib 来解析网址和查询字符串,并能成功下载视频。为清楚起见,我删掉了几个除了退出之外没做什么事情的 try/except。顺便一提,它在保存视频时会从文件名中去掉非 ASCII 字符,以此处理 Unicode 视频标题。它还可以接受任意数量的 YouTube 网址并全部下载。最后,它把自己的 User-Agent 伪装成 Mac 版 Chrome(也就是我目前使用的浏览器)。

#!/usr/bin/env python3

import sys
import urllib.request
from urllib.request import urlopen, FancyURLopener
from urllib.parse import urlparse, parse_qs, unquote

class UndercoverURLopener(FancyURLopener):
    """Legacy-style opener that identifies itself as Chrome on Mac OS X."""

    # FancyURLopener sends this string as its User-Agent header.
    version = "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_8; en-US) AppleWebKit/533.2 (KHTML, like Gecko) Chrome/5.0.342.9 Safari/533.2"


# Kept for code that still goes through the legacy URLopener machinery.
urllib.request._urlopener = UndercoverURLopener()

# urlopen() dispatches through the opener registered with install_opener(),
# not through _urlopener, so the browser User-Agent has to be set there too;
# otherwise every urlopen() call still announces itself as Python-urllib.
_undercover_opener = urllib.request.build_opener()
_undercover_opener.addheaders = [('User-Agent', UndercoverURLopener.version)]
urllib.request.install_opener(_undercover_opener)

def youtube_download(video_url):
    """Download a single YouTube video into the current working directory.

    The video id is read from the ``v`` query parameter of *video_url*, the
    token and title are taken from the ``get_video_info`` response, and the
    video is saved as ``<ascii title>.mp4``.

    Problems with one video (URL without a ``v`` parameter, response without
    a token, failed download) are reported on stdout and the function
    returns, so a caller looping over several URLs can carry on.
    """
    query = parse_qs(urlparse(video_url).query)
    if 'v' not in query:
        print("\t No video id found in '{}'.".format(video_url))
        print("\t Skipping...")
        return
    video_id = query['v'][0]

    url_data = urlopen('http://www.youtube.com/get_video_info?&video_id=' + video_id).read()
    # parse_qs() percent-decodes every value itself. Decoding the whole
    # response first would turn an encoded '&' inside a value (for example in
    # the title) into a field separator and truncate that value.
    url_info = parse_qs(url_data.decode('utf-8'))
    if 'token' not in url_info:
        print("\t No download token returned for '{}'.".format(video_id))
        print("\t Skipping...")
        return
    token_value = url_info['token'][0]

    download_url = "http://www.youtube.com/get_video?video_id={0}&t={1}&fmt=18".format(
        video_id, token_value)

    video_title = url_info['title'][0] if 'title' in url_info else ''
    # Unicode filenames are more trouble than they're worth
    ascii_title = video_title.encode('ascii', 'ignore').decode('ascii').replace("/", "-")
    # Fall back to the id so an empty or non-ASCII title does not give ".mp4".
    filename = (ascii_title or video_id) + '.mp4'

    print("\t Downloading '{}' to '{}'...".format(video_title, filename))

    try:
        download = urlopen(download_url).read()
        # The context manager closes the file even if the write fails.
        with open(filename, 'wb') as f:
            f.write(download)
    except Exception as e:
        # Deliberate best-effort: report the failure and move on.
        print("\t Download failed! {}".format(str(e)))
        print("\t Skipping...")
    else:
        print("\t Done.")

def main():
    """Print the banner and download every URL given on the command line.

    When no URL is passed as an argument, prompt for a space-separated list
    of URLs instead.
    """
    print("\n--------------------------")
    print (" Youtube Video Downloader")
    print ("--------------------------\n")

    # Slicing sys.argv never raises; it is simply empty when no URLs were
    # given, so test for emptiness instead of waiting for an exception.
    video_urls = sys.argv[1:]
    if not video_urls:
        # split() yields a list of URLs; iterating the raw input string would
        # hand single characters to youtube_download().
        video_urls = input('Enter (space-separated) video URLs: ').split()

    for u in video_urls:
        youtube_download(u)
    print("\n Done.")

if __name__ == '__main__':
    main()

Answer 2

我要厚着脸皮推荐一下我的脚本(my script):它会自动检查可用的格式,自动为视频选择质量最好的格式,并且同时适用于 YouTube 页面的 Flash 和 HTML5 两种版本(也支持 Vimeo)。

如果您编写了该脚本，那么请查看我的源代码以获取灵感，并随意窃取一些代码。我挑战你，请写一些更好的东西。开源在竞争中茁壮成长！

但是，如果您复制了该脚本并且只是想让它运行起来，我建议您尝试一下我的脚本，看看它是否适合您。您可以从命令行作为脚本访问它，也可以作为另一个python文件中的模块访问它。

（编辑：制作wiki。不寻找声誉。）

Answer 3

您还可以查看用Python编写的youtube-dl并查看其编写方式：

https://github.com/rg3/youtube-dl

Answer 4

看起来 YouTube 的开发人员已经更改了访问视频文件的算法。他们现在不再使用“token”,而是使用“signature”变量,而“signature”似乎依赖于 cookie 中存储的数据,或者依赖于客户端的 IP 地址(当客户端不支持 cookie 时,例如 Python 2 中的 urllib)。下面是我想出的一个权宜之计(得到的 URL 是与 IP 绑定的):

#!/usr/bin/python

import re
from urlparse import *
from urllib import *

def yt_url(video_url):
    """Build a direct video URL for a YouTube watch page (Python 2).

    Reads the ``v`` query parameter of *video_url*, fetches
    ``get_video_info`` for that id, and appends a fixed set of fields from
    the response (plus a hard-coded ``ipbits=8``) to a base string taken
    from the response's ``id`` field.

    Returns a ``(title, url)`` tuple.  A missing ``v`` parameter or a
    missing response field surfaces as ``KeyError``; an ``id`` field without
    the expected ``,`` / ``|`` separators surfaces as ``IndexError``.
    """
    video_id = parse_qs(urlparse(video_url).query)['v'][0]

    # NOTE(review): the whole response is unquoted before parse_qs() runs, so
    # an encoded '&' inside a value would act as a separator -- confirm intent.
    get_vars = parse_qs(unquote(urlopen("http://www.youtube.com/get_video_info?video_id="+video_id).read()))

    # Second comma-separated entry of "id", then the part after its first "|";
    # presumably the base URL of a stream -- TODO confirm against a response.
    url = get_vars["id"][0].split(",")[1].split("|")[1]

    # Query parameters copied verbatim from the response onto the base URL.
    elements = dict()
    elements["itag"] = get_vars["itag"][0]
    elements["sver"] = get_vars["sver"][0]
    elements["expire"] = get_vars["expire"][0]
    elements["signature"] = get_vars["signature"][0]
    elements["factor"] = get_vars["factor"][0]
    # Only the first comma-separated entry of "id" is used as the id value.
    elements["id"] = get_vars["id"][0].split(",")[0]
    elements["key"] = get_vars["key"][0]
    elements["burst"] = get_vars["burst"][0]
    elements["sparams"] = get_vars["sparams"][0]
    elements["algorithm"] = get_vars["algorithm"][0]
    # Fixed value, not taken from the response.
    elements["ipbits"] = "8"

    # Values are appended as-is (no re-encoding); parameter order follows the
    # dict's iteration order.
    for get_var in elements:
      url += "&" + get_var + "=" + elements[get_var]

    return (get_vars["title"][0], url)

if __name__ == '__main__':
    # Demo: resolve one hard-coded video and print its title and direct URL
    # (uses Python 2 print statements).
    (title, url) = yt_url("http://www.youtube.com/watch?v=4tAr7tuakt0")
    print "Title: %s" % (title,)
    print "Video: %s" % (url,)

Answer 5

#!/usr/bin/env python
import urllib2,urllib
import re
import os
import sys
import time
# Python 2 script: ask for a page URL, save the page to index.html, then scan
# the saved copy for the stream list and the page title.
linkurl=raw_input('Enter url:')
linkurl1=urllib.urlopen(linkurl).read()
# Keep a local copy of the page; it is re-read line by line below.
file1=open("index.html","w")
file1.write(linkurl1)
file1.close()
fname = 'index.html'
# Markers identifying the line of interest (compared case-insensitively).
find=("yt.playerConfig =",'"title":')
# Scan the saved page; y ends up holding the LAST line that contains any
# marker.  NOTE(review): y stays undefined if no line matches.
with open(fname) as infile:
       for line_no, line in enumerate(infile, 1):
           lline = line.lower()
           if any(word.lower() in lline for word in find):
               y=line.rstrip()
fileurl=y
# Hand-rolled percent-decoding of the characters that occur in the URLs.
y1=y.replace("%3A%2F%2F","://")
y2=y1.replace("%2F","/")
y3=y2.replace("%3F","?")
y4=y3.replace("%3D","=")
y5=y4.replace("%26","&")
# "%252" is a doubly-encoded "%2"; undo one level of encoding.
y6=y5.replace("%252","%2")
# Every "sig" substring becomes "&signature" (this also touches any other
# text that happens to contain "sig").
y7=y6.replace("sig","&signature")

# Display video resolution information
print ""
print "Video resolution:"
print "[46=1080(.webm)]--[37=1080(.mp4)]--[35=480(.flv)]--[36=180(.3gpp)]"
print "[45=720(.webm) ]--[22=720(.mp4) ]--[34=360(.flv)]--[17=144(.3gpp)]"
print "[44=480(.webm) ]--[18=360(.mp4) ]--[5=240(.flv) ]"
print "[43=360(.webm) ]"
print ""
# List every itag number found on the matched line (searched in the
# undecoded line y, not in y7) and let the user pick one.
itag = re.findall('itag=(\d+)',y)
print `"itag list= "`+`itag`
resol=raw_input("Type itag number: ")
# Find the page title, which is used as the output file name.  This rescans
# index.html and reuses (overwrites) fname, find and y.
fname = 'index.html'
find = (' <title>','</title>')
with open(fname) as infile:
    for line_no, line in enumerate(infile, 1):
        lline = line.lower()
        if any(word.lower() in lline for word in find):
          y=line.rstrip()
# Take the text before the last ">" on that line, then the piece between its
# last two double quotes.  NOTE(review): this relies on the title line
# containing a quoted string -- confirm against a real page.
fileurl1=y.split(">")[-2]
filename2=fileurl1.split('"')[-2]

# Resolve the chosen itag (resol) to a download URL and a file extension.
#
# Every supported itag is handled the same way; only the quality label that
# ends its entry in the stream list, the regex capturing its URL, and the
# file extension differ, so those live in one table instead of twelve
# copy-pasted branches.  (Python 2 script: in a Python 2 regex the "\u0026"
# below matches the literal text "u0026".)
_URL_UNTIL_TYPE = r'\\u0026url=(.*?)\\u0026type'
_URL_UNTIL_BACKSLASH = r'\\u0026url=(.*?)\\'
ITAG_FORMATS = {
    # itag: (quality label, URL regex, extension)
    '46': ('hd1080', _URL_UNTIL_TYPE, ".webm"),
    '37': ('hd1080', _URL_UNTIL_TYPE, ".mp4"),
    '45': ('hd720', _URL_UNTIL_TYPE, ".webm"),
    '22': ('hd720', _URL_UNTIL_TYPE, ".mp4"),
    '44': ('large', _URL_UNTIL_TYPE, ".webm"),
    '35': ('large', _URL_UNTIL_BACKSLASH, ".flv"),
    '43': ('medium', _URL_UNTIL_BACKSLASH, ".webm"),
    '34': ('medium', _URL_UNTIL_BACKSLASH, ".flv"),
    '18': ('medium', _URL_UNTIL_BACKSLASH, ".mp4"),
    '5': ('small', _URL_UNTIL_BACKSLASH, ".flv"),
    '36': ('small', _URL_UNTIL_BACKSLASH, ".3gpp"),
    '17': ('small', _URL_UNTIL_BACKSLASH, ".3gpp"),
}

try:
    quality, url_pattern, ext = ITAG_FORMATS[resol]
except KeyError:
    # Previously an unknown itag left url/ext undefined and the script died
    # later with a NameError.
    sys.exit("Unsupported itag: " + repr(resol))

# The extraction deliberately works on repr() of the match lists, exactly as
# the original backtick expressions did: repr() doubles the backslashes, and
# the later split("\\") / split("'") calls depend on that form.
stream_repr = repr(re.findall(r'itag=' + resol + r'(.*?)\u0026quality=' + quality, y7))
url_matches = re.findall(url_pattern, stream_repr)
signature = re.findall(r'signature=(.*?)\\', stream_repr)
try:
    url = (repr(url_matches).split("\\")[0].split("'")[1]
           + "&signature=" + repr(signature).split("'")[1]
           + "&ptk=machinima")
except IndexError:
    # repr([]) contains no quoted string, i.e. nothing matched on the page.
    sys.exit("No stream found for itag " + resol + " on this page")
#newindex=open("index1.txt",'w')
#newindex.write(y7)
print url
filename=filename2+ext
print filename
req = urllib2.Request(url, headers={'Range':"bytes=0-838860800"})

data = urllib2.urlopen(req)
print "connected to ""http://"+url.split("/")[2]+"/"
f=open(filename,'wb')
meta_data = data.info()
file_size = int(meta_data.getheaders("Content-Length")[0])
print "filesize= "+`file_size/1048576`+" MB"
bytes_received = 0
chunk_size = 10240
while True:
    start_time = time.time()
    buffer = data.read(chunk_size)
    if not buffer:
            break

    bytes_received += len(buffer)
    f.write(buffer)
    Td=time.time() - start_time
    speed1=round(len(buffer)/1024.0,1)
    speed=round(speed1/Td,1)
    speed_MB=round(speed/1024.0,1)
    speed_GB=round(speed_MB/1024.0,1)    
    bytes_received_MB=round(bytes_received/1048576.0,3)
    percent = bytes_received * 100. / file_size 
    if speed < 1:
       speed_byte=round(len(buffer)/Td,1)
       Tr=(file_size-bytes_received)/(60*speed_byte)  
       status = r"[Downloaded=%.3f MB] [%3.2f%%]  [speed= %.1f B/s] [eta %1d min]             " % (bytes_received_MB, percent,speed_byte,Tr)
    elif speed < 1024:
       Tr=(file_size-bytes_received)/(60*1024*speed)
       status = r"[Downloaded=%.3f MB] [%3.2f%%]  [speed= %.1f KB/s] [eta %1d min]            " % (bytes_received_MB, percent,speed,Tr)
    elif speed < 1048576 :
       Tr=(file_size-bytes_received)/(60*1024*1024*speed_MB)
       status = r"[Downloaded=%.3f MB] [%3.2f%%]  [speed= %.1f MB/s] [eta %1d min]            " % (bytes_received_MB, percent,speed_MB,Tr)
    else :
       Tr=(file_size-bytes_received)/(60*1024*1024*1024*speed_GB)
       status = r"[Downloaded=%.3f MB] [%3.2f%%]  [speed= %.1f GB/s] [eta %1d min]            " % (bytes_received_MB, percent,speed_GB,Tr)


    status = status + chr(8) * (len(status) + 1)
    print status,

无法下载youtube视频

5 个答案: