python,谷歌应用引擎和推特:我的程序无法正常工作

时间:2011-03-17 14:50:13

标签: python

我制作了这个愚蠢的小应用程序,使用twitter的api扫描最近的'x'条推文,找到其中的短网址,解析出短网址实际指向的URL,累积推文中被链接到的顶级域名出现的频率,并根据频率输出一个html tagcloud页面。

但它不能正常工作。我会解决一些解析问题。我不认为那是我最关心的。我真正想要解决的是两件事:

有时应用程序崩溃(特别是如果我选择大量的推文进行扫描),通常是“Downloaderror Applicationerror 2”和“Downloaderror Applicationerror 5”。我无法解决的另一个问题是运行所需的时间......它很慢。我尝试设置短暂的超时。但是,如果我扫描很多推文,仍然需要FOREVER运行。

有什么想法吗?谢谢!

import logging
import wsgiref.handlers
from google.appengine.ext import webapp
import urllib2
from urllib import urlencode
from urllib2 import urlopen
from BeautifulSoup import BeautifulStoneSoup
import socket
import re
from urlparse import urlparse
from google.appengine.api import urlfetch
#from google.appengine.api.urlfetch import DownloadError

#timeout = 3
#socket.setdefaulttimeout(timeout)

class Link():
    """A tweeted link paired with an integer tag.

    Attributes:
        link:   the URL string (later overwritten with its resolved domain).
        number: an integer counter/tag (callers pass 0).
    """

    # Bug fix: in the pasted original both methods were dedented to column 0,
    # which is a syntax error (and would define them as free functions taking
    # `self`). They belong inside the class body.
    def __init__(self, a, b):
        self.link = a
        self.number = b

    def __str__(self):
        # "url ; count" — used only for debugging/printing.
        return "%s ; %s" % (self.link, self.number)

def getFeed(i):
    """Fetch page *i* of Twitter's Atom search feed (100 tweets per page).

    Returns the open file-like HTTP response; the caller is expected to
    read and parse it (see processFeed).
    """
    feed_url = 'http://search.twitter.com/search.atom?q=twitter&since=2010-02-28&rpp=100&page=%i' % i
    return urlopen(feed_url)

def processFeed(f):
    """Parse a Twitter Atom feed, resolve each tweeted URL and count domains.

    Args:
        f: file-like object (e.g. the response from getFeed) containing the
           Atom XML of a Twitter search.

    Returns:
        dict mapping a shortened domain (second-level + first 3 chars of the
        TLD, e.g. 'google.com') to how many tweets linked to it.
    """
    soup = BeautifulStoneSoup(f.read(), selfClosingTags=["link"])
    tweets = []
    final = {}

    # Collect the first URL found in each entry's title.
    for entry in soup.findAll("entry"):
        title = entry.find('title').contents[0]
        if 'http' in title:
            match = re.search(r"(?P<url>https?://[^\s]+)", title)
            # Bug fix: 'http' may appear in a title without a full
            # 'http(s)://' URL, in which case re.search returns None and the
            # original crashed with AttributeError on .group().
            if match:
                tweets.append(Link(match.group("url"), 0))

    # Resolve each (possibly shortened) URL to its final destination.
    # The asker's "DownloadError ApplicationError 2/5" crashes are App
    # Engine urlfetch errors (2 = connection error, 5 = deadline exceeded)
    # escaping this loop; they are now caught explicitly so one bad host
    # cannot kill the whole request.
    for address in tweets:
        try:
            response = urllib2.urlopen(address.link)
            # urllib2 follows redirects; response.url is the final URL.
            address.link = response.url
        # Bug fix: HTTPError is a subclass of URLError, so it must be
        # listed first (the original's HTTPError branch was unreachable).
        except urllib2.HTTPError:
            pass  # 404s and other page errors: keep the unresolved link
        except urllib2.URLError:
            pass  # nonsense hosts / connection failures: keep as-is
        except UnicodeDecodeError:
            pass  # malformed URL text: keep as-is
        except urlfetch.DownloadError:
            pass  # GAE ApplicationError 2/5: connection error or timeout

    # Reduce every link to a shortened domain and tally frequencies.
    # Bug fixes vs. the original paste:
    #  - the netloc-extraction loop indexed the first `j` tweets, where `j`
    #    merely counted successes — the wrong tweets were trimmed; we now
    #    process every tweet uniformly,
    #  - `return final` sat inside the loop, returning after one tweet,
    #  - single-part hostnames (e.g. 'localhost') crashed the 2-part slice.
    for link in tweets:
        link.link = urlparse(link.link).netloc
        parts = link.link.split('.')
        if len(parts) < 2:
            continue
        parts[-1] = parts[-1][:3]          # truncate TLD to 3 chars
        link.link = '.'.join(parts[-2:])   # keep domain + truncated TLD
        final[link.link] = final.get(link.link, 0) + 1

    return final

def TagCloudDivHeader(txt):
    """Return the opening markup of a tag cloud titled *txt*."""
    opening = "<div class = 'tagcloud'>\n"
    opening += "<div class = 'tagcloudtitle'>%s</div>\n" % txt
    return opening

def TagCloudDivFooter():
    """Return the closing tag of a tag-cloud div."""
    return "</div>" + "\n"


def size(freq):
    """Map a domain frequency to a font size in em units (identity for now)."""
    scaled = freq
    return scaled

def writeTerm(term, freq):
    """Render one tag-cloud entry: a linked <span> sized by *freq*."""
    markup = " <span class='term' style='font-size:" + str(size(freq)) + "em'>"
    markup += "<a href = 'http://%s'>" % term
    markup += term.encode('ISO-8859-1', 'replace')
    markup += "</a></span> " + "\n"
    return markup

def genForm(prompt = ""):
    """Build the HTML form that lets the user pick how many tweets to scan.

    *prompt*, when non-empty, is shown as a title above the form.
    """
    html = ""
    if prompt:
        html += "<div class= 'formtitle'>%s</div>" % (prompt)
    html += """<form action="index.py" method="post">"""
    html += """<label for="Tweets">Number of Tweets to scan:</label>
        <select id="Tweets" name="Tweets">"""
    # Options run 100, 200, ..., 1500.
    for n in range(1, 16):
        count = n * 100
        html += "<option value = \"%i\">%i</option>" % (count, count)
    html += "</select>"
    html += '<input type="submit" value="Go" name="gobtn"/> </form>'
    html += "</br>WARNING!!!! The fewer Tweets you scan, the more stable this program is!!!!"
    return html


def makeTagCloud(cloudtitle, items):
    """Render *items* (a term -> frequency dict) as a tag-cloud HTML page body."""
    pieces = [TagCloudDivHeader(cloudtitle)]
    for term in items:
        pieces.append(writeTerm(term, items[term]))
    pieces.append(TagCloudDivFooter())
    pieces.append(HTMLFooter())
    return ''.join(pieces)

def HTMLHeader(pageheader = ""):
    """Return the XHTML doctype, <head> (title + stylesheet) and opening <body>."""
    pieces = [
        "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\" \"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">",
        "\n<html><head>\n <title>%s</title>\n" % pageheader,
        "<link rel='stylesheet' href='/assets/mystyles.css' type='text/css' />\n",
        "</head>\n",
        "<body>\n",
    ]
    return "".join(pieces)

def HTMLFooter():
    """Return the closing </body></html> markup."""
    return "\n".join(["</body>", "</html>"])

def generateLinks():
    """Build a minimal landing page containing a single 'Proceed?' link."""
    page = HTMLHeader("Who's getting the most traffic from Twitter?")
    page += "<p>" + "<a href = 'results/'>Proceed?</a>" + "</p>\n"
    page += HTMLFooter()
    return page

class MainHandler(webapp.RequestHandler):
    """Serves the tweet-count form (GET) and the resulting domain cloud (POST)."""

    def get(self):
        """Render the landing page containing the tweet-count selection form."""
        self.response.headers['Content-Type'] = 'text/html'
        path = self.request.path
        logging.info("path is " + path)
        form = genForm()
        # Bug fix: the original also called generateLinks() into an unused
        # local (`contents`); that dead call has been removed.
        self.response.out.write(HTMLHeader("Who's getting the most traffic from Twitter?"))
        self.response.out.write(form)
        self.response.out.write(HTMLFooter())

    def post(self):
        """Fetch the requested number of tweets and render the domain tag cloud."""
        # Bug fix: corrected the garbled user-facing sentence
        # ("Where are links are Twitter taking you?").
        self.response.out.write("Where are links on Twitter taking you?")
        self.response.out.write(HTMLHeader("Domain cloud for Twitter Tweets"))
        # The form posts a multiple of 100; convert back to a page count for
        # getFeed (each Atom page carries up to 100 tweets).
        pages = int(self.request.get('Tweets'))
        pages = int(pages / 100)
        self.response.out.write(makeTagCloud("Domains most linked to by Tweets",
                                             processFeed(getFeed(pages))))

def main():
    """Build the WSGI application and hand it to the CGI runner."""
    app = webapp.WSGIApplication([('/.*', MainHandler)], debug=True)
    wsgiref.handlers.CGIHandler().run(app)

if __name__ == '__main__':
  main()

1 个答案:

答案 0 :(得分:1)

我不确定崩溃,但是尝试将每个请求放在自己的线程中 - 假设webapp.RequestHandler是线程安全的并且在IO上没有阻塞,这可以节省你很多时间,因为你可以处理很多请求同时进行。像这样的IO绑定情况是线程最有用的情况之一,因为它不需要任何魔法来支持GIL。


有关线程的信息,请查看the manual。它需要一些工作,但对于这种应用程序来说应该是值得的;它还会为你引入一个相当有趣(按某些定义)的高级编程练习。我建议使用线程而不是多处理,因为线程模块会为你处理锁定等问题,而且这看起来不像是一个CPU密集型的问题(python中的全局解释器锁使线程对纯Python编写的CPU密集型问题没有帮助)。

作为提示,您可能希望查看事件循环,队列(使用deques实现的esp)和父线程的计时器。尽量避免使用以下那种浪费的循环:

finished = False
while not finished:
  x = [thread.poll() for thread in threads]
  finished = None in x

而是尝试类似:

timer.start(1)
finished = False
while not finished:
  x = [thread.poll() for thread in threads]
  finished = None in x
  timer.wait()

定时器等待直到下一个这么多秒等等。

相关问题