这是我用于从Google页面下载图片的代码。此代码在评估和下载图像方面花费了大量时间。因此,我考虑使用Beautifulsoup
库来加快评估和下载速度。检查以下原始代码:
import time
import sys
import os
import urllib2
search_keyword = ['Australia']
keywords = [' high resolution']
def download_page(url):
import urllib2
try:
headers = {}
headers['User-Agent'] = "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"
req = urllib2.Request(url, headers = headers)
response = urllib2.urlopen(req)
page = response.read()
return page
except:
return"Page Not found"
def _images_get_next_item(s):
start_line = s.find('rg_di')
if start_line == -1:
end_quote = 0
link = "no_links"
return link, end_quote
else:
start_line = s.find('"class="rg_meta"')
start_content = s.find('"ou"',start_line+1)
end_content = s.find(',"ow"',start_content+1)
content_raw = str(s[start_content+6:end_content-1])
return content_raw, end_content
def _images_get_all_items(page):
items = []
while True:
item, end_content = _images_get_next_item(page)
if item == "no_links":
break
else:
items.append(item)
time.sleep(0.1)
page = page[end_content:]
return items
t0 = time.time()
i= 0
while i<len(search_keyword):
items = []
iteration = "Item no.: " + str(i+1) + " -->" + " Item name = " + str(search_keyword[i])
print (iteration)
print ("Evaluating...")
search_keywords = search_keyword[i]
search = search_keywords.replace(' ','%20')
try:
os.makedirs(search_keywords)
except OSError, e:
if e.errno != 17:
raise
pass
j = 0
while j<len(keywords):
pure_keyword = keywords[j].replace(' ','%20')
url = 'https://www.google.com/search?q=' + search + pure_keyword + '&espv=2&biw=1366&bih=667&site=webhp&source=lnms&tbm=isch&sa=X&ei=XosDVaCXD8TasATItgE&ved=0CAcQ_AUoAg'
raw_html = (download_page(url))
time.sleep(0.1)
items = items + (_images_get_all_items(raw_html))
j = j + 1
print ("Total Image Links = "+str(len(items)))
print ("\n")
info = open('output.txt', 'a')
info.write(str(i) + ': ' + str(search_keyword[i-1]) + ": " + str(items) + "\n\n\n")
info.close()
t1 = time.time()
total_time = t1-t0
print("Total time taken: "+str(total_time)+" Seconds")
print ("Starting Download...")
k=0
errorCount=0
while(k<len(items)):
from urllib2 import Request,urlopen
from urllib2 import URLError, HTTPError
try:
req = Request(items[k], headers={"User-Agent": "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"})
response = urlopen(req,None,15)
output_file = open(search_keywords+"/"+str(k+1)+".jpg",'wb')
data = response.read()
output_file.write(data)
response.close();
print("completed ====> "+str(k+1))
k=k+1;
except IOError:
errorCount+=1
print("IOError on image "+str(k+1))
k=k+1;
except HTTPError as e:
errorCount+=1
print("HTTPError"+str(k))
k=k+1;
except URLError as e:
errorCount+=1
print("URLError "+str(k))
k=k+1;
i = i+1
print("\n")
print("Everything downloaded!")
print("\n"+str(errorCount)+" ----> total Errors")
我认为编辑以下代码有助于使代码与BeautifulSoup
库一起使用,我的工作将更快完成:
def download_page(url):
import urllib2
try:
headers = {}
headers['User-Agent'] = "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"
req = urllib2.Request(url, headers = headers)
#response = urllib2.urlopen(req)
#page = response.read()
return BeautifulSoup(urlopen(Request(req)), 'html.parser')
except:
return"Page Not found"
但上面的代码返回空白。请允许,让我知道我可以做些什么来使BeautifulSoup
毫无困难地使代码工作得非常好。
答案 0 :(得分:0)
您无法像这样传递Google标题。搜索引擎比简单地将一些关键字替换为GET URL更复杂。
HTML是一种标记语言,仅对单向渲染人类可读信息有用。对于您的应用程序,您需要机器可读标记,而不是尝试破译人类可读文本。谷歌已经拥有一个非常全面的API https://developers.google.com/custom-search/,它易于使用,并且比使用BeautifulSoup更好的实现方法