我目前正在创建图像抓取工具,以从IMDB搜索结果中提取缩略图。我能够隔离特定.jpg的URL,但无法将响应对象传递给get函数。
# Essential variables and databases
repository = []
search_results = []
# Function to itemize URL search parameters
def source_identity():
valid_input = False
while not valid_input:
try:
source = Path(input("Please enter a file path: "))
valid_input = True
for files in os.listdir(source):
repository.append(str(files))
except FileNotFoundError as err: print(err) + source_identity()
def image_search():
try:
if repository:
search_url = ("http://www.imdb.com/find?ref_=nv_sr_fn&q=" +
str(repository[0].replace(" ","+").replace("(","").replace(")","") +
"&s=all"))
data = requests.get(search_url)
data.raise_for_status()
search_targets = open("search_targets.txt", "wb")
for chunk in data.iter_content(100000):
search_targets.write(chunk)
print(search_url)
soup = bs4.BeautifulSoup(data.text, "html.parser")
thumbnail = soup.select(".primary_photo a:nth-of-type(1)")
print(thumbnail)
if not thumbnail:
return "Could not find image"
else:
for p in thumbnail:
y = [tag["src"] for tag in p.findAll("img")]
print(type(y))
print(y)
img_url = "http:" + str(thumbnail[0].get(str(y)))
print("Downloading image %s..." % img_url)
res = requests.get(img_url)
res.raise_for_status()
else:
return print("in return")
except FileExistsError as err:
print(err) + image_search()
except Exception as err:
print(err)
此输出:
Please enter a file path: D:\Movies
http://www.imdb.com/find?ref_=nv_sr_fn&q=2+Guns+2013&s=all
<class 'list'> 'https://m.mediaamazon.com/images/M/MV5BNTQ5MTgzNDg4OF5BMl5BanBnXkFtZTcwMjAyODEzOQ@@._V1_UX32_CR0,0,32,44_AL_.jpg'
Downloading image http:None...
Failed to parse: http:None
Process finished with exit code 0
我尝试了几种提取src url的方法,并不断提出各种我无法解决的错误