我无法在该位置保存/下载图像。尽管代码似乎正确,但我无法弄清楚问题。
我正在使用 requests 库来抓取图像。
import os
import urllib
import urllib.request
from bs4 import BeautifulSoup
import requests
import re
from lxml.html import fromstring
# Download every absolutely-addressed <img> on the page into a folder named
# after the page title.

# Fetch the page and fail fast on a non-2xx response rather than scraping an
# error page.
r = requests.get("https://www.scoopwhoop.com/subreddit-nature/#.lce3tjfci")
r.raise_for_status()
data = r.text
soup = BeautifulSoup(data, "lxml")
title = fromstring(r.content).findtext('.//title')
#print(title)

# The page title becomes a folder name, so replace the characters Windows
# rejects in paths and drop trailing dots/spaces. findtext() returns None when
# the page has no <title>, hence the fallback name.
safeTitle = re.sub(r'[\\/:*?"<>|\r\n\t]', '_', (title or '').strip())
safeTitle = safeTitle.strip(' .') or 'untitled'
# os.path.join avoids the stray space the old `'...\ ' + title` concatenation
# put in front of the folder name.
newPath = os.path.join(r'C:\Users\Vicky\Desktop\ScrappedImages', safeTitle)
# Create the target folder once, up front, instead of re-checking per image.
os.makedirs(newPath, exist_ok=True)

for link in soup.find_all('img'):
    image = link.get('src')
    # <img> tags without a src give None; relative URLs cannot be fetched
    # as-is, so only absolute http(s) URLs are downloaded.
    if not image or not image.startswith(('http://', 'https://')):
        continue
    print(image)
    # Drop any query string / fragment so the file name is just the last path
    # segment of the URL.
    imageName = os.path.split(image.split('?', 1)[0].split('#', 1)[0])[1]
    if not imageName:
        continue
    print(imageName)
    r2 = requests.get(image)
    # Skip images the server refuses instead of saving an error body to disk.
    if not r2.ok:
        continue
    # Write inside newPath: opening the bare imageName saved the file into the
    # current working directory, not the folder created above.
    with open(os.path.join(newPath, imageName), "wb") as f:
        f.write(r2.content)
答案 0 :(得分:0)
尝试将 `r = requests.get("https://www.scoopwhoop.com/subreddit-nature/#.lce3tjfci")` 语句包装在 `try:` 块中,并检查响应的状态码,以确保要抓取的网站返回 200 响应;否则,可能是网站超时或无法满足您的请求。
答案 1 :(得分:0)
import os
from bs4 import BeautifulSoup
import urllib
import requests
import urlparse
from lxml.html import fromstring
# Download every absolutely-addressed <img> on the page into the current
# working directory.
# NOTE: urlparse.urlparse and urllib.urlretrieve are the Python 2 names; on
# Python 3 the equivalents are urllib.parse.urlparse and
# urllib.request.urlretrieve.
r = requests.get("https://www.scoopwhoop.com/subreddit-nature/#.lce3tjfci")
data = r.text
soup = BeautifulSoup(data, "lxml")
for link in soup.find_all('img'):
    image = link.get('src')
    # <img> tags without a src attribute give None, which cannot be parsed.
    if not image:
        continue
    parsed = urlparse.urlparse(image)
    # Require both a scheme and a network location: relative paths have
    # neither, and protocol-relative URLs ("//host/x.png") have a netloc but
    # cannot be fetched by urlretrieve.
    if parsed.scheme in ('http', 'https') and parsed.netloc:
        print(image)
        # The file name is whatever follows the last "/" in the URL.
        imageName = image[image.rfind("/")+1:]
        if not imageName:
            # URL ends with "/": there is no file name to save under.
            continue
        print(imageName)
        urllib.urlretrieve(image, imageName)