我对数据抓取不太熟悉,没能用 BeautifulSoup 把图像下载下来。
我需要从一个网站下载所有图像,目前使用的代码如下:
import re
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

# Download every jpg/gif/png referenced by an <img src="..."> tag on `site`
# into the current working directory.
site = 'http://someurl.org/'
response = requests.get(site)
soup = BeautifulSoup(response.text, 'html.parser')

# Only keep <img> tags that actually carry a src attribute.
img_tags = soup.find_all('img', {"src": True})
print('img_tags: ')
print(img_tags)
urls = [img['src'] for img in img_tags]
print('urls: ')
print(urls)

# Last path segment ending in a supported extension.  "[^/]+" (rather than
# "[\w_-]+") lets names containing extra dots through, e.g.
# "abfc-2345345234.thumb.png".
image_name = re.compile(r'[^/]+[.](?:jpg|gif|png)$')

for src in urls:
    # An image source can be relative ("/assets/a.png", "../a.png", ...);
    # urljoin resolves it against the page URL the same way a browser does
    # and leaves absolute URLs untouched.
    url = urljoin(site, src)

    # Match against the URL *path* only so a query string cannot hide the
    # extension or leak into the local file name.
    match = image_name.search(urlparse(url).path)
    if match is None:
        print('skipping (not a jpg/gif/png): {}'.format(url))
        continue

    # Fetch first, open the file second: a failed download must not leave
    # an empty file behind.
    response = requests.get(url)
    if not response.ok:
        print('skipping (HTTP {}): {}'.format(response.status_code, url))
        continue

    with open(match.group(0), 'wb') as f:
        f.write(response.content)
但是,这段代码会漏掉页面上所有 HTML 形如下面这样的图像:
<img data-bind="attr: { src: thumbURL() }" src="/assets/images/submissions/abfc-2345345234.thumb.png">
我猜是因为 data-bind 属性里也包含字符串 “src”,但我一直没能弄清楚原因。
答案 0 :(得分:1)
您需要使用 Selenium 或其他能够执行 JavaScript 的工具。下面的代码会一直等待,直到目标元素出现后再读取页面中的图像:
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Load the page in a real browser so that JavaScript-generated <img> tags
# exist in the DOM before the HTML is handed to BeautifulSoup.
site = 'http://phylopic.org/'
dr = webdriver.Chrome()
try:
    dr.get(site)
    try:
        # Poll every 0.5 s, for up to 20 s, until an element with class
        # "span1" is visible.
        # NOTE(review): presumably "span1" marks the script-rendered image
        # container on this particular site -- adjust for other pages.
        WebDriverWait(dr, 20, 0.5).until(
            EC.visibility_of_element_located((By.CLASS_NAME, "span1"))
        )
    except TimeoutException:
        # Best effort: the element never showed up in time, so give the
        # page a few more seconds and parse whatever has loaded.
        print("Wait a bit more")
        time.sleep(5)
    text = dr.page_source
finally:
    # Always shut the browser down, even if loading or waiting raised.
    dr.quit()

soup = BeautifulSoup(text, "lxml")
imgs = soup.find_all('img')
print(imgs)
第二个问题是如何将相对路径转换为绝对路径。HTML
中的相对路径(relative path)有以下几种类型。
假设当前页面的网址为 http://someurl.org/somefd/somefd2 ,则各种写法对应的绝对路径如下:
<img src="picture.jpg"> http://someurl.org/somefd/somefd2/picture.jpg
<img src="images/picture.jpg"> http://someurl.org/somefd/somefd2/images/picture.jpg
<img src="/images/picture.jpg"> http://someurl.org/images/picture.jpg
<img src="../picture.jpg"> http://someurl.org/somefd/picture.jpg
下面是我将相对路径(rp)转换为绝对路径(ap)的代码。
import re
from urllib.parse import urljoin

site = 'https://en.wikipedia.org/wiki/IMAGE'


def r2a(path, site=site):
    """Convert a (possibly relative) ``src`` value into an absolute URL.

    ``site`` is treated as the URL of the *directory* the page lives in,
    with or without a trailing slash, so ``"a.jpg"`` resolves to
    ``site + "/a.jpg"`` and each leading ``"../"`` climbs one level above
    ``site``.

    Args:
        path: The value found in the HTML, e.g. ``"a.jpg"``,
            ``"images/a.jpg"``, ``"/images/a.jpg"``, ``"../a.jpg"``,
            ``"//host/a.jpg"`` or a full ``http(s)://`` URL.
        site: Base URL to resolve against.

    Returns:
        The absolute URL as a string.  Full ``http(s)://`` URLs are
        returned unchanged; scheme-less ``//host/...`` URLs get ``http:``
        prepended.
    """
    # Already absolute: hand it back untouched.
    if re.match(r"https?://", path, re.IGNORECASE):
        return path

    # Protocol-relative URL: only the scheme is missing.
    if path.startswith("//"):
        return "http:" + path

    # urljoin treats the last segment of a base without a trailing slash as
    # a file name and drops it, so force the directory interpretation.
    base = site if site.endswith("/") else site + "/"

    # urljoin implements RFC 3986 resolution: it handles "/", "./", "../"
    # anywhere in the path, and clamps excess "../" at the site root
    # instead of failing.
    return urljoin(base, path)


# Self-checks for every kind of path described above.
assert "https://en.wikipedia.org/wiki/IMAGE/a.jpg" == r2a("a.jpg")
assert "https://en.wikipedia.org/wiki/IMAGE/unknow/a.jpg" == r2a("unknow/a.jpg")
assert "https://en.wikipedia.org/unknow/a.jpg" == r2a("/unknow/a.jpg")
assert "https://en.wikipedia.org/wiki/a.jpg" == r2a("../a.jpg")
assert "https://en.wikipedia.org/a.jpg" == r2a("../../a.jpg")
assert "https://en.wikipedia.org/wiki/IMAGE/a.jpg" == r2a("https://en.wikipedia.org/wiki/IMAGE/a.jpg")
assert "http://en.wikipedia.org/" == r2a("//en.wikipedia.org/")
# More "../" than there are directory levels stops at the root.
assert "https://en.wikipedia.org/a.jpg" == r2a("../../../a.jpg")
# "./" and "../" in the middle of a path are normalised as well.
assert "https://en.wikipedia.org/wiki/IMAGE/a.jpg" == r2a("./a.jpg")
assert "https://en.wikipedia.org/wiki/IMAGE/a.jpg" == r2a("images/../a.jpg")
# A base that already ends in "/" must not produce a double slash.
assert "http://someurl.org/a.jpg" == r2a("a.jpg", site="http://someurl.org/")