我从GAE中的处理程序调用Web爬网函数,它会检索一些图像然后显示它们。它在第一次调用时工作正常,但下次显示所有相同的图像时,爬虫从最后一次停止的地方开始。我认为我的全局变量没有被正确重置是个问题。
每次我重新部署应用程序时,它都是第一次正确执行,但问题就开始了。
这是我的代码,如果您需要我澄清它,请告诉我,但我认为这应该是有道理的。
这是刮刀功能
visited_pages = []
visit_queue = deque([])
collected_pages = []
collected_pics = []
count = 0
pic_count = 0
def scrape_pages(url, root_url, keywords=[], recurse=True):
#variables
max_count = 16
pic_num = 100
global count
global pic_count
global collected_pics
global collected_pages
print 'the keywords and url are'
print keywords
print url
#this is all of the links that have been scraped
the_links = []
soup = soupify_url(url)
#only add new pages onto the queue if the recursion argument is true
if recurse:
#find all the links on the page
try:
for tag in soup.findAll('a'):
the_links.append(tag.get('href'))
except AttributeError:
return
try:
external_links, internal_links, root_links, primary_links = categorize_links(the_links, url, root_url)
except TypeError:
return
#change it so this depends on the input
links_to_visit = external_links + internal_links + root_links
#build the queue
for link in links_to_visit:
if link not in visited_pages and link not in visit_queue:
visit_queue.append(link)
visited_pages.append(url)
count = count + 1
# print 'number of pages visited'
# print count
#add pages to collected_pages depending on the criteria given if any keywords are given
if keywords:
page_to_add = find_pages(url, soup, keywords)
# print 'page to add'
# print page_to_add
if page_to_add and page_to_add not in collected_pages:
collected_pages.append(page_to_add)
pics_to_add = add_pics(url, soup)
# print 'pics to add'
# print pics_to_add
if pics_to_add:
collected_pics.extend(pics_to_add)
#here is where the actual recursion happens by finishing the queue
while visit_queue:
if count >= max_count:
return
if pic_count > pic_num:
return
link = visit_queue.popleft()
# print link
scrape_pages(link, root_url, keywords)
# print '***done***'
###done with the recursive scraping function here
#here I just get a list of links from Bing, add them to the queue and go through them then reset all the global variables
def scrape_bing_src(keywords):
visit_queue, the_url = scrape_bing.get_links(keywords, a_list = False)
scrape_pages(visit_queue.popleft(), the_url, keywords, recurse=True)
global collected_pics
global pic_count
global count
global visited_pages
global visit_queue
pic_count = 0
count = 0
visited_pages = []
visit_queue = deque([])
pics_to_return = collected_pics
collected_pics = []
return pics_to_return
这是调用scraper函数的处理程序
#this just simply displays the images
class Try(BlogHandler):
def get(self, keyword):
keyword = str(keyword)
keyword_list = keyword.split()
img_list = scraper.scrape_bing_src(keyword_list)
for img in img_list:
self.response.write("""<br><img src='""" + img + """'>""")
self.response.write('we are done here')