我是 Python 的新手，需要一些帮助。我正在尝试编写一个脚本：它会访问特定的网站，并从该站点的不同位置下载多个 .gif 图像。希望有人能为我指出正确的方向。这是我第一次尝试做这样的事情。
这是我到目前为止所得到的。
from http:// import http://folkworm.ceri.memphis.edu/heli/heli_bb_ag/ as bs
import urlparse
from urllib2 import urlopen
from urllib import urlretrieve
import os
import sys
def main(url, out_folder="C:\Users\jerry\Desktop\Heli/"):
"""Downloads all the images at 'url' to /test/"""
http://folkworm.ceri.memphis.edu/heli/heli_bb_ag/ = bs(urlopen(url))
parsed = list(urlparse.urlparse(url))
for image in http://folkworm.ceri.memphis.edu/heli/heli_bb_ag/.findAll("gif"):
print "gif: %(src)s" % image
filename = gif["src"].split("/")[-1]
parsed[2] = gif["src"]
outpath = os.path.join(out_folder, filename)
if gif["src"].lower().startswith("http"):
urlretrieve(gif["src"], outpath)
else:
urlretrieve(urlparse.urlunparse(parsed), outpath)
def _usage():
print "usage: python dumpimages.py http:folkworm.ceri.memphis.edu/heli/heli_bb_ag/ [outpath]"
if __name__ == "__main__":
url = sys.argv[-1]
out_folder = "/test/"
if not url.lower().startswith("http"):
out_folder = sys.argv[-1]
url = sys.argv[-2]
if not url.lower().startswith("http"):
_usage()
sys.exit(-1)
main(url, out_folder)
答案 0（得分：1）
这是基本想法。
# NOTE(review): interactive demo session — fetches the page with `requests`,
# parses the HTML with BeautifulSoup (lxml backend), and prints the first
# <a> link whose href contains ".gif". Requires network access plus the
# third-party `requests`, `bs4`, and `lxml` packages.
>>> import requests
>>> from bs4 import BeautifulSoup
>>> item = requests.get('http://folkworm.ceri.memphis.edu/heli/heli_bb_ag/')
>>> page = item.text
>>> soup = BeautifulSoup(page, 'lxml')
>>> links = soup.findAll('a')
>>> for link in links:
... if '.gif' in link.attrs['href']:
... print (link.attrs['href'])
... break
...
CCAR_HHZ_AG_00.2017012700.gif?v=1485534942
break 语句只是为了提前结束循环，避免把所有 gif 的文件名都打印出来。下一步是在该循环中添加代码：把 requests.get 中使用的 URL 与每个 gif 的文件名拼接起来，再对拼接后的地址执行一次 requests.get。这一次要用 image = item.content 来获取图像的字节内容，然后把它写入你选择的文件中。
编辑:充实。请注意,您仍需要安排为每个输出文件提供一个文件名。
# NOTE(review): fleshed-out version of the demo above — for the first matching
# .gif link it joins the link to the page URL, fetches the image bytes, and
# writes them to a local file. Requires network access plus the third-party
# `requests`, `bs4`, and `lxml` packages. The output filename is hard-coded
# to 'pic.gif'; a real script still needs to derive a name per image.
>>> import requests
>>> from bs4 import BeautifulSoup
>>> URL = 'http://folkworm.ceri.memphis.edu/heli/heli_bb_ag/'
>>> item = requests.get(URL)
>>> page = item.text
>>> soup = BeautifulSoup(page, 'lxml')
>>> links = soup.findAll('a')
>>> for link in links:
... if '.gif' in link.attrs['href']:
... print (link.attrs['href'])
... pic = requests.get(URL + link.attrs['href'])
... image = pic.content
... open('pic.gif', 'wb').write(image)
... break
...
CCAR_HHZ_AG_00.2017012700.gif?v=1485535857
100846
# The second echoed value (100846) is presumably the return value of
# write() — the number of bytes written — printed by the REPL. TODO confirm.