Question

这是我用于从Google页面下载图片的代码。此代码在评估和下载图像方面花费了大量时间。因此，我考虑使用BeautifulSoup库来加快评估和下载速度。原始代码如下：

import time       
import sys    
import os
import urllib2



# Search terms to process. Each entry is used both as the query text and as
# the name of the directory its downloaded images are saved into.
search_keyword = ['Australia']


# Extra text appended to every search term when the query URL is built;
# one page request is made per entry.
keywords = [' high resolution']


def download_page(url):
    """Fetch `url` and return the raw response body.

    A browser-like User-Agent header is sent with the request.

    Args:
        url: Address of the page to download.

    Returns:
        Whatever ``response.read()`` yields for the page, or the string
        "Page Not found" if the request fails.
    """
    import urllib2  # Python 2 standard library

    headers = {
        'User-Agent': "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17",
    }
    try:
        response = urllib2.urlopen(urllib2.Request(url, headers=headers))
        try:
            return response.read()
        finally:
            # Always release the connection, even if reading fails.
            response.close()
    except Exception:
        # Best-effort download: any request failure maps to the sentinel
        # string. Unlike a bare `except:`, this lets KeyboardInterrupt and
        # SystemExit propagate so the script can still be interrupted.
        return "Page Not found"



def _images_get_next_item(s):
    """Extract the first image link found in `s`.

    The link is the text that starts 6 characters after the first `"ou"`
    marker and ends one character before the following `,"ow"` marker
    (assumes metadata shaped like `"ou":"<link>","ow"` -- confirm against
    the current page markup).

    Args:
        s: Chunk of result-page HTML to scan.

    Returns:
        A `(link, end_offset)` tuple, where `end_offset` is the index of the
        `,"ow"` marker in `s`. Returns `("no_links", 0)` when `s` contains no
        `rg_di` marker or when either delimiter is missing.
    """
    if s.find('rg_di') == -1:
        return "no_links", 0

    start_line = s.find('"class="rg_meta"')
    start_content = s.find('"ou"', start_line + 1)
    end_content = s.find(',"ow"', start_content + 1)
    if start_content == -1 or end_content == -1:
        # A missing delimiter would otherwise make the slice below return
        # an arbitrary piece of the page together with a negative offset.
        return "no_links", 0

    content_raw = str(s[start_content + 6:end_content - 1])
    return content_raw, end_content



def _images_get_all_items(page):
    """Collect every image link `_images_get_next_item` can extract from `page`.

    The page is consumed front to back: after each link is found, scanning
    resumes from the offset reported for that link.

    Args:
        page: Result-page HTML as a string.

    Returns:
        List of extracted links, in page order (empty if none are found).
    """
    items = []
    while True:
        item, end_content = _images_get_next_item(page)
        if item == "no_links":
            break
        items.append(item)
        if end_content <= 0:
            # The offset did not advance; slicing again would rescan the
            # same text forever (0) or only the final character (-1).
            break
        # No sleep here: this is pure in-memory parsing, so the former
        # 0.1 s pause per link only added delay.
        page = page[end_content:]
    return items



# Main script: for every search term, collect image links from the result
# pages, log them to output.txt, then download each image into a directory
# named after the search term.
import errno
from urllib2 import HTTPError, Request, URLError, urlopen

# Browser-like header sent with every image request.
_DOWNLOAD_HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17",
}

t0 = time.time()

# Counted across ALL search terms so the final report really is a total
# (and is defined even when `search_keyword` is empty).
errorCount = 0

for i, search_keywords in enumerate(search_keyword):
    items = []
    print("Item no.: " + str(i + 1) + " -->" + " Item name = " + str(search_keywords))
    print("Evaluating...")
    search = search_keywords.replace(' ', '%20')

    # Create the download directory; an already existing one is fine.
    try:
        os.makedirs(search_keywords)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise

    for keyword in keywords:
        pure_keyword = keyword.replace(' ', '%20')
        url = 'https://www.google.com/search?q=' + search + pure_keyword + '&espv=2&biw=1366&bih=667&site=webhp&source=lnms&tbm=isch&sa=X&ei=XosDVaCXD8TasATItgE&ved=0CAcQ_AUoAg'
        raw_html = download_page(url)
        time.sleep(0.1)  # short pause between page requests
        items.extend(_images_get_all_items(raw_html))

    print("Total Image Links = " + str(len(items)))
    print("\n")

    # Append the collected links to the log, labelled with the keyword that
    # produced them (the original indexed search_keyword[i-1] by mistake).
    with open('output.txt', 'a') as info:
        info.write(str(i) + ': ' + str(search_keywords) + ": " + str(items) + "\n\n\n")

    t1 = time.time()
    total_time = t1 - t0
    print("Total time taken: " + str(total_time) + " Seconds")
    print("Starting Download...")

    for k, link in enumerate(items):
        image_no = str(k + 1)
        try:
            req = Request(link, headers=_DOWNLOAD_HEADERS)
            response = urlopen(req, None, 15)
            try:
                data = response.read()
            finally:
                response.close()

            # Open the file only once the data has arrived, so a failed
            # download does not leave an empty .jpg behind.
            with open(os.path.join(search_keywords, image_no + ".jpg"), 'wb') as output_file:
                output_file.write(data)

            print("completed ====> " + image_no)

        # HTTPError and URLError are subclasses of IOError, so they must be
        # handled before it or their handlers can never run.
        except HTTPError:
            errorCount += 1
            print("HTTPError" + image_no)
        except URLError:
            errorCount += 1
            print("URLError " + image_no)
        except IOError:
            errorCount += 1
            print("IOError on image " + image_no)

print("\n")
print("Everything downloaded!")
print("\n" + str(errorCount) + " ----> total Errors")

我认为编辑以下代码有助于使代码与BeautifulSoup库一起使用，我的工作将更快完成：

def download_page(url):
    """Fetch `url` and return it parsed as a BeautifulSoup document.

    A browser-like User-Agent header is sent with the request.

    NOTE: `BeautifulSoup` must already be in scope, e.g. via
    `from bs4 import BeautifulSoup` at module level.

    Args:
        url: Address of the page to download.

    Returns:
        A BeautifulSoup object built with the 'html.parser' backend, or the
        string "Page Not found" if the page could not be fetched.
    """
    import urllib2  # Python 2 standard library

    headers = {
        'User-Agent': "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17",
    }
    try:
        # Build the Request once and open it through urllib2. The earlier
        # `urlopen(Request(req))` wrapped a Request inside another Request
        # and used names that were never imported, so it always failed.
        response = urllib2.urlopen(urllib2.Request(url, headers=headers))
        try:
            html = response.read()
        finally:
            response.close()
    except (IOError, ValueError):
        # Only fetch failures (urllib2.URLError/HTTPError and socket errors
        # are IOError subclasses; a malformed URL raises ValueError) map to
        # the sentinel. Programming errors are no longer silently hidden.
        return "Page Not found"

    return BeautifulSoup(html, 'html.parser')

但上面的代码返回空白。请告诉我应该怎么做，才能让这段代码配合BeautifulSoup正常工作。

Answer 1

您不能像这样直接把请求头传给Google。搜索引擎远比简单地把几个关键字代入GET URL要复杂。

HTML是一种标记语言，只适合用来把信息单向渲染成人类可读的形式。对于您的应用程序，您需要的是机器可读的数据，而不是去解析给人看的文本。谷歌已经提供了一个非常全面的API（https://developers.google.com/custom-search/），它易于使用，是比使用BeautifulSoup更好的实现方式。

Python2 BeautifulSoup返回Blank输出

1 个答案: