Question

我在 GAE 的请求处理程序中调用一个网页爬虫函数，它会抓取一些图片然后把它们显示出来。第一次调用时工作正常，但下一次调用时会再次显示之前所有的图片，并且爬虫会从上次停下的地方继续。我认为问题在于我的全局变量没有被正确重置。

每次我重新部署应用程序后，第一次调用都能正确执行，但之后问题就会出现。

下面是我的代码；如果有需要我澄清的地方，请告诉我，不过我认为代码本身应该比较容易理解。

这是爬虫（scraper）函数：

# Crawl state shared by scrape_pages and scrape_bing_src. These are
# module-level names, so their values persist between calls for as long as
# the module stays loaded in the same process.

# Counters: pages visited so far, and the picture tally checked against the
# picture limit inside scrape_pages.
count = 0
pic_count = 0

# URLs already crawled, and the FIFO queue of links still waiting to be crawled.
visited_pages = []
visit_queue = deque()

# Results gathered during a crawl (three independent lists in total).
collected_pages, collected_pics = [], []

def scrape_pages(url, root_url, keywords=None, recurse=True):
    """Crawl ``url``, then keep crawling links taken from the shared queue.

    Nothing is returned. Results accumulate in the module-level
    ``collected_pages`` and ``collected_pics`` lists, and progress is tracked
    in the module-level ``visited_pages``, ``visit_queue`` and ``count``.

    Args:
        url: Address of the page to fetch and scan.
        root_url: Passed to ``categorize_links`` together with ``url``.
        keywords: Optional list of keywords. When non-empty, ``find_pages``
            is called and a truthy, not-yet-seen result is appended to
            ``collected_pages``.
        recurse: When true, links found on the page are appended to
            ``visit_queue``. The flag is forwarded unchanged to every page
            crawled from the queue, so ``recurse=False`` visits what is
            already queued without discovering new links.
    """
    # Only ``count`` is rebound here; the other module-level containers are
    # mutated in place, so they need no ``global`` declaration.
    global count

    # Crawl limits: maximum pages visited and maximum pictures collected.
    max_count = 16
    pic_num = 100

    # Avoid a shared mutable default while still printing ``[]`` as before.
    if keywords is None:
        keywords = []

    # Single-argument parenthesised prints behave the same on Python 2 and 3.
    print('the keywords and url are')
    print(keywords)
    print(url)

    soup = soupify_url(url)

    # Only add new pages onto the queue if the recursion argument is true.
    if recurse:
        try:
            the_links = [tag.get('href') for tag in soup.findAll('a')]
        except AttributeError:
            # ``soup`` has no usable ``findAll`` (presumably the fetch or
            # parse failed -- verify against soupify_url); skip this page.
            return

        try:
            external_links, internal_links, root_links, _primary_links = \
                categorize_links(the_links, url, root_url)
        except TypeError:
            # The categorisation gave nothing that could be used; skip page.
            return

        # TODO: make the set of link categories depend on the input.
        for link in external_links + internal_links + root_links:
            if link not in visited_pages and link not in visit_queue:
                visit_queue.append(link)

    visited_pages.append(url)
    count = count + 1

    # Record the page itself when keywords were given and it qualifies.
    if keywords:
        page_to_add = find_pages(url, soup, keywords)
        if page_to_add and page_to_add not in collected_pages:
            collected_pages.append(page_to_add)

    pics_to_add = add_pics(url, soup)
    if pics_to_add:
        collected_pics.extend(pics_to_add)

    # Drain the queue. Each call re-enters this loop, so the limits are
    # re-checked before every page.
    while visit_queue:
        if count >= max_count:
            return

        # NOTE(review): pic_count is never updated in this function;
        # presumably add_pics maintains it -- confirm.
        if pic_count > pic_num:
            return

        link = visit_queue.popleft()
        # Forward ``recurse`` so the caller's choice applies to every page,
        # not just the first one.
        scrape_pages(link, root_url, keywords, recurse)

#here I just get a list of links from Bing, add them to the queue and go through them then reset all the global variables
def scrape_bing_src(keywords):
    """Crawl the Bing results for ``keywords`` and return the collected pictures.

    The link queue and root URL come from ``scrape_bing.get_links``; the
    first queued link is handed to ``scrape_pages``, which works through the
    rest of the queue.

    All module-level crawl state is reset both before the crawl starts and
    after it ends (even when it raises), so one call cannot leak visited
    pages, counters or pictures into the next call served by the same
    process.

    Args:
        keywords: List of keywords, forwarded to ``scrape_bing.get_links``
            and ``scrape_pages``.

    Returns:
        The list of pictures gathered during this call (empty when the
        search produced no links).
    """
    # ``global`` must precede every assignment to these names: assigning
    # first is only a SyntaxWarning on Python 2 but a SyntaxError on Python 3.
    global collected_pics, collected_pages
    global pic_count, count
    global visited_pages, visit_queue

    # Start from a clean slate in case an earlier call failed part-way.
    pic_count = 0
    count = 0
    visited_pages = []
    collected_pages = []
    collected_pics = []

    try:
        visit_queue, the_url = scrape_bing.get_links(keywords, a_list=False)
        if not visit_queue:
            # Nothing to crawl; avoid popping from an empty queue.
            return []

        scrape_pages(visit_queue.popleft(), the_url, keywords, recurse=True)
        pics_to_return = collected_pics
    finally:
        # Always reset, whether the crawl finished or raised.
        pic_count = 0
        count = 0
        visited_pages = []
        visit_queue = deque([])
        # Rebinding leaves the list captured in pics_to_return untouched.
        collected_pics = []

    return pics_to_return

这是调用scraper函数的处理程序

#this just simply displays the images
class Try(BlogHandler):
    """Request handler that scrapes images for a keyword phrase and shows them."""

    def get(self, keyword):
        """Write one ``<img>`` tag per scraped picture, then a closing message.

        Args:
            keyword: Search phrase; it is converted with ``str`` and split on
                whitespace into the keyword list given to the scraper.
        """
        # Imported locally so this handler does not depend on the module's
        # import section. ``cgi.escape`` is the Python 2.7 API used by this
        # file; on Python 3 the equivalent is ``html.escape``.
        import cgi

        keyword_list = str(keyword).split()
        img_list = scraper.scrape_bing_src(keyword_list)

        for img in img_list:
            # Image URLs come from scraped third-party pages, so escape them
            # before embedding. The attribute is double-quoted because
            # ``cgi.escape(..., quote=True)`` escapes '"' but not "'".
            self.response.write(
                '<br><img src="' + cgi.escape(img, quote=True) + '">')

        self.response.write('we are done here')

Answer 1

您的代码并不只是在一台“服务器”、一个实例上运行——您可能已经注意到管理控制台中的“实例”选项卡。因此，即使是在两次调用之间，您的请求也可能被切换到另一台服务器，或者进程可能被“重新启动”（您可以在此处（here）阅读更多内容）。在预热过程中，您的应用程序会从磁盘读入内存，然后才开始处理请求。因此，每次您都可能得到一个新的、预先缓存的 Python 实例，而它拥有自己的一套全局变量值。

在您的情况下，最好使用memcache。

全局变量重置无法在Google App Engine中使用

1 个答案: