我有下面这段代码，它会下载图片，但我真正想要的是获取图片网址并将其导出到文件中。您能给我指个正确的方向吗？
try:
from urlparse import urljoin
except ImportError:
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup
class Scraper:
    """Fetch a page and walk its ``<img>`` tags that carry a ``src`` attribute.

    Each URL is processed at most once; already-seen URLs are remembered in
    ``self.visited``.
    """

    def __init__(self):
        """Create the visited-URL set and an HTTP session with a browser User-Agent."""
        self.visited = set()
        self.session = requests.Session()
        self.session.headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.109 Safari/537.36"}
        requests.packages.urllib3.disable_warnings()  # turn off SSL warnings

    def visit_url(self, url, level):
        """Download *url* and iterate over the images found in its HTML.

        Args:
            url: Address of the page to fetch.
            level: Accepted for the caller's benefit; not used by this method
                as written.

        Returns:
            None. The page URL is printed on every call, even when the URL
            was already visited and the method returns early.
        """
        print(url)
        if url in self.visited:
            return
        self.visited.add(url)
        # verify=False skips certificate verification, which is why the
        # urllib3 warnings are silenced in __init__.
        content = self.session.get(url, verify=False).content
        soup = BeautifulSoup(content, "lxml")
        for img in soup.select("img[src]"):
            # The image URL is extracted here, but nothing is done with it yet.
            image_url = img["src"]
答案 0 :(得分:0)
您提供的代码其实并不会下载图片。它下载的是一个 HTML 文件，并从中提取图片网址，但没有对提取到的图片网址做任何处理。
要让 `Scraper.visit_url` 打印图片网址，您只需要更改最后一行。代码如下：
try:
from urlparse import urljoin
except ImportError:
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup
class Scraper:
    """Fetch a page and print the URL of every ``<img>`` that has a ``src``.

    Each page URL is processed at most once; already-seen URLs are remembered
    in ``self.visited``.
    """

    def __init__(self):
        """Create the visited-URL set and an HTTP session with a browser User-Agent."""
        self.visited = set()
        self.session = requests.Session()
        self.session.headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.109 Safari/537.36"}
        requests.packages.urllib3.disable_warnings()  # turn off SSL warnings

    def visit_url(self, url, level):
        """Download *url* and print the absolute URL of each image on the page.

        Args:
            url: Address of the page to fetch; also the base against which
                relative ``src`` values are resolved.
            level: Accepted for the caller's benefit; not used by this method
                as written.

        Returns:
            None. The page URL is printed on every call, even when the URL
            was already visited and the method returns early.
        """
        print(url)
        if url in self.visited:
            return
        self.visited.add(url)
        # verify=False skips certificate verification, which is why the
        # urllib3 warnings are silenced in __init__.
        content = self.session.get(url, verify=False).content
        soup = BeautifulSoup(content, "lxml")
        for img in soup.select("img[src]"):
            # src is often relative ("/img/a.png"); resolve it against the
            # page URL so the printed value is usable on its own.
            print(urljoin(url, img["src"]))