import os
import re
from urllib.parse import urljoin, urlsplit

import requests
from bs4 import *
# Scrape every <img> referenced by the landing page of `site`, save each image
# under its own file name in the current directory, then store the first
# `max_photos` images as numbered .jpg files inside `output_dir`.
site = "https://woodme.dk/"
output_dir = "Gathered_Photos"
max_photos = 10
request_timeout = 30  # seconds; without a timeout requests.get() can block forever
image_name_pattern = re.compile(r'/([\w_-]+[.](jpg|gif|png))$')


def _download(url):
    """Return the body of *url* as bytes, or None when the download fails."""
    try:
        response = requests.get(url, timeout=request_timeout)
        response.raise_for_status()
    except requests.RequestException as exc:
        # Best effort: one broken image must not abort the whole run.
        print(f"Skipping {url}: {exc}")
        return None
    return response.content


r2 = requests.get(site, timeout=request_timeout)
# Fail loudly when the server rejects the request (e.g. an HTTP 4xx error
# page); otherwise the error page is parsed, no <img> tags are found and the
# script finishes without doing anything and without reporting why.
r2.raise_for_status()
soup2 = BeautifulSoup(r2.text, "html.parser")
img_tags = soup2.find_all("img")

# Skip <img> tags that have no src attribute and resolve relative paths against
# the page URL once, so that both passes below work with absolute URLs.
urls = [urljoin(site, img['src']) for img in img_tags if img.get('src')]

# Pass 1: save every image whose URL path ends in a recognised image file name.
for url in urls:
    # Match against the path only, so a query string does not hide the name.
    filename = image_name_pattern.search(urlsplit(url).path)
    if filename is None:
        continue
    img_data = _download(url)
    if img_data is None:
        continue
    # Open the file only after a successful download to avoid empty files.
    with open(filename.group(1), 'wb') as f:
        f.write(img_data)

# Pass 2: store the first `max_photos` images as 1.jpg, 2.jpg, ...
# A failed download leaves a gap in the numbering.
os.makedirs(output_dir, exist_ok=True)  # tolerate re-runs
for index, img_link in enumerate(urls[:max_photos], start=1):
    img_data = _download(img_link)
    if img_data is None:
        continue
    with open(os.path.join(output_dir, str(index) + '.jpg'), 'wb') as f:
        f.write(img_data)
我正在尝试编写一个简单的网络爬虫,用来收集某个网站上的所有图片。但是运行时既没有报错,也没有任何输出——程序只是运行一下就结束了,什么也没做。这是为什么?
答案 0 :(得分:0)
您的请求被网站阻止:
site = "https://woodme.dk/"
r2 = requests.get(site)
# Print the raw response body to inspect what the server actually returned.
print(r2.text)
显示
<html><head><title>406 Security Incident Detected</title></head><body><h1>406 Security Incident Detected</h1><p>Your request was blocked. Please try again later (or don't).</p><hr>Xm-qYrBRS5ojKl5V1cxeKgAAABE</body></html>
woodme.dk 似乎配置了安全规则来阻止爬虫抓取。