Is there a way to make my Python program run faster?

Time: 2011-12-30 16:43:31

Tags: python performance

I have written a program in Python, but it runs much more slowly than I would like.

Here is the code:

from gzip import GzipFile
from cStringIO import StringIO
import re
import webbrowser
import time
from difflib import SequenceMatcher
import os
import sys
from BeautifulSoup import BeautifulSoup
import eventlet
from eventlet.green import urllib2  # green (non-blocking) urllib2; importing the plain urllib2 after this would shadow it and serialize every download
import urllib
import cookielib

TITLE_MATCH = re.compile(r'(.*) \(\d{1,10}.{1,100}\)$')
ADDRESS_MATCH = re.compile(r'.{1,100}\((.*), .{4,14}, United States\)$')
LOCATION_LISTING = re.compile(r'http://www\.locationary\.com/place/en/US/.{1,50}/.{1,50}/.{1,100}\.jsp')

def download(url):
    print "Downloading:", url
    s = urllib2.urlopen(url).read()
    if s[:2] == '\x1f\x8b': # assume it's gzipped data
        ifh = GzipFile(mode='rb', fileobj=StringIO(s))
        s = ifh.read()
    print "Downloaded: ", url
    return s

def replace_chars(text, replacements):
    return ''.join(replacements.get(x,x) for x in text)

def handle_listing(listing_url):
    listing_document = BeautifulSoup(download(listing_url))

    # ignore pages that link to yellowpages
    if not listing_document.find("a", href=re.compile(re.escape("http://www.yellowpages.com/") + ".*")):
        listing_title = listing_document.title.text
        reps = {' ':'-', ',':'', '\'':'', '[':'', ']':''}
        title_match = TITLE_MATCH.match(listing_title)
        address_match = ADDRESS_MATCH.match(listing_title)
        if title_match is not None and address_match is not None:  # guard both; the address pattern can fail even when the title matches
            title, = title_match.groups()
            address, = address_match.groups()

            yellow_page_url = "http://www.yellowpages.com/%s/%s?order=distance" % (
                replace_chars(address, reps),
                replace_chars(title, reps),
            )

            yellow_page = BeautifulSoup(download(yellow_page_url))

            page_url = yellow_page.find("h3", {"class" : "business-name fn org"})
            if page_url:
                page_url = page_url.a["href"]

                business_name = title[:title.index(",")]

                page = BeautifulSoup(download(page_url))
                yellow_page_address =  page.find("span", {"class" : "street-address"})
                if yellow_page_address:

                    if SequenceMatcher(None, address, yellow_page_address.text).ratio() >= 0.5:
                        pid, = re.search(r'p(\d{5,20})\.jsp', listing_url).groups(0)
                        page_escaped = replace_chars(page_url, {':':'%3A', '/':'%2F', '?':'%3F', '=':'%3D'})

                        final_url = "http://www.locationary.com/access/proxy.jsp?ACTION_TOKEN=proxy_jsp$JspView$SaveAction&inPlaceID=%s&xxx_c_1_f_987=%s" % (
                                pid, page_escaped)
                        return final_url

def log_in(final_url):
    data = urllib.urlencode({"inUserName":"jacob.grannis@gmail.com", "inUserPass":"secretword"})
    jar = cookielib.FileCookieJar("cookies")
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(jar))
    opener.addheaders.append(('User-agent', 'Mozilla/4.0'))
    opener.addheaders.append(('Referer', 'http://www.locationary.com/'))
    opener.addheaders.append(('Cookie','site_version=REGULAR; __utma=47547066.912030359.1322003402.1324959960.1325009956.58; __utmz=47547066.1324655802.52.13.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=cache:dr23PN5fUj4J:www.locationary.com/%20locationary; nickname=jacob501; jforumUserId=1; PMS=1; locaCountry=1033; locaState=1786; locaCity=Vancouver; JSESSIONID=5CDDA2D527C20A6CDD04936115DE3FA2; PSESSIONID=c677beb4e6b8d58f1443d9b9585b225f579ef29a; Locacookie=enable; __utmb=47547066.1.10.1325009956; __utmc=47547066'))
    opener.addheaders.append(('Cookie','Cookie: site_version=REGULAR; __utma=47547066.912030359.1322003402.1324959960.1325009956.58; __utmz=47547066.1324655802.52.13.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=cache:dr23PN5fUj4J:www.locationary.com/%20locationary; nickname=jacob501; jforumUserId=1; PMS=1; locaCountry=1033; locaState=1786; locaCity=Vancouver; JSESSIONID=5CDDA2D527C20A6CDD04936115DE3FA2; PSESSIONID=c677beb4e6b8d58f1443d9b9585b225f579ef29a; Locacookie=enable; __utmb=47547066.4.10.1325009956; __utmc=47547066'))
    request = urllib2.Request("https://www.locationary.com/index.jsp?ACTION_TOKEN=tile_loginBar_jsp$JspView$LoginAction", data)
    response = opener.open(request) 
    url = str(final_url)
    anything = opener.open(url)
    page = anything.read()

States = [#'Alabama',
          #'Alaska',
          'Arizona',
          'Arkansas',
          'California',
          'Colorado',
          'Connecticut',
          'Delaware',
          'Florida',
          'Georgia',
          'Hawaii',
          'Idaho',
          'Illinois',
          'Indiana',
          'Iowa',
          'Kansas',
          'Kentucky',
          'Louisiana',
          'Maine',
          'Maryland',
          'Massachusetts',
          'Michigan',
          'Minnesota',
          'Mississippi',
          'Missouri',
          'Montana',
          'Nebraska',
          'Nevada',
          'New_Hampshire',
          'New_Jersey',
          'New_Mexico',
          'New_York',
          'North_Carolina',
          'North_Dakota',
          'Ohio',
          'Oklahoma',
          'Oregon',
          'Pennsylvania',
          'Rhode_Island',
          'South_Carolina',
          'South_Dakota',
          'Tennessee',
          'Texas',
          'Utah',
          'Vermont',
          'Virginia',
          'Washington',
          'West_Virginia',
          'Wisconsin',
          'Wyoming']

Cities = []

def find_cities(state):
    del Cities[:]  # reset per state; otherwise every earlier state's cities get re-crawled on each pass through main()
    state_url = 'http://www.locationary.com/place/en/US/' + str(state)
    state_document = download(state_url)
    findCities = re.compile('<b>(.*)</b>')
    getCities = re.findall(findCities,state_document)

    for City in getCities:
        reps = {' ':'_'}
        City = replace_chars(City, reps)
        Cities.append(str(City))

bestworst = ['0','1']

def main():
    for state in States:
        find_cities(state)
        for city in Cities:
            for num in range(0,1):
                for pagenum in range(15,16):
                    print '------------------------------------------------------------------------------------------------------------------------------------------------------------'
                    print '------------------------------------------------------------------------------------------------------------------------------------------------------------'
                    if str(num) == '0':
                        print str(state) + ', ' + str(city) + ', ' + 'Best Profiles' + ', ' + 'Page ' + str(pagenum)
                    else:
                        print str(state) + ', ' + str(city) + ', ' + 'Worst Profiles' + ', ' + 'Page ' + str(pagenum)
                    START_URL = 'http://www.locationary.com/place/en/US/' + str(state) + '/' + city + '-page' + str(pagenum) + '/?ACTION_TOKEN=NumericAction&order=' + str(num)
                    pool = eventlet.GreenPool()
                    listings_document = BeautifulSoup(download(START_URL))
                    listings = listings_document.findAll("a", href = LOCATION_LISTING)
                    listings = [listing['href'] for listing in listings]

                    count_listings = 0

                    for final_url in pool.imap(handle_listing, listings):
                        print final_url
                        if final_url is not None:
                            log_in(final_url)

if __name__ == '__main__':
    main()

Is there any way to make it faster, or is that impossible? It has to download a lot of URLs from the internet, but I'm fairly sure I can't make my internet connection 10 to 50 times faster than it is now... and my computer isn't very slow... so, is there a way to make my program, say, 10 to 50 times faster? I know that may sound ridiculous, but how do professional programmers make their programs faster?

2 answers:

Answer 0 (score: 6)

The first step in speeding up any program is understanding why it is slow, that is, where the time is going. The tool programmers use to do this is called a profiler. Standard Python includes several; you can learn about them here.
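
For instance, here is a minimal sketch of profiling this script with the standard-library cProfile module; main stands in for the script's own main() function, and profile.out is just an illustrative filename:

import cProfile
import pstats

cProfile.run('main()', 'profile.out')             # run main() under the profiler, saving stats to a file
stats = pstats.Stats('profile.out')
stats.sort_stats('cumulative').print_stats(10)    # show the ten most expensive call paths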

Once you have learned to use a profiler, run it on your program to identify the hotspots, the places where the program spends the most time. Then try to speed the program up in one of two ways:

  1. Try to make the hotspots take less time; or
  2. Try to make the hotspots execute fewer times.

Usually #2 is more fruitful: picking a better or more appropriate algorithm can reduce the amount of code that gets executed. A small sketch of #2 follows below.
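
As a hypothetical illustration of #2 (an assumption about this workload, not something the answer prescribes): if the same URL can come up more than once, a small cache makes the expensive download() hotspot execute fewer times:

_download_cache = {}

def cached_download(url):
    # hypothetical memoization of the question's download() function:
    # each distinct URL is fetched only once; assumes the pages fit in
    # memory and do not change during the run
    if url not in _download_cache:
        _download_cache[url] = download(url)
    return _download_cache[url]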

Don't waste time guessing why the program is slow; measure it, then put your effort into fixing the actual problem. Programmers are notoriously bad at guessing where performance problems lie.

Answer 1 (score: 2)

The way programmers optimize code is with a profiler, and Python makes several available. Here is a great article to get you started.

You can invoke timeit from the command line to time a small snippet, for example:

python -m timeit "'-'.join(str(n) for n in range(100))"

Note that timeit measures short statements rather than whole programs; to profile an entire script, the usual invocation is python -m cProfile myprogram.py.

The link above has plenty of examples of using timeit. Once you have figured out where the bottlenecks are, you can think about ways to fix them. If your program is spending an inordinate amount of time in the download() function, you could consider introducing some kind of concurrency, downloading things in the background while the program carries on parsing the already-downloaded pages with BeautifulSoup to extract information.
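
Here is a minimal sketch of that idea using eventlet, which the question's code already imports; the pool size of 20 and the URL list are illustrative values, not tuned recommendations:

import eventlet
from eventlet.green import urllib2   # cooperative, non-blocking sockets

def fetch(url):
    # runs in a green thread, so many fetches can be in flight at once
    return url, urllib2.urlopen(url).read()

pool = eventlet.GreenPool(20)        # at most 20 concurrent downloads
urls = ['http://www.locationary.com/place/en/US/Arizona']  # illustrative list
for url, body in pool.imap(fetch, urls):
    # each result is handled here while the remaining downloads continue
    print url, len(body)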

The key here is to look at:

  1. Where your program spends the most time.
  2. Which of the places from step 1 you can optimize most easily.

For a hypothetical example, if your regular expressions were particularly badly written they could take a very long time, and then you could optimize them; a contrived timing of exactly that is sketched below. I say "hypothetical" because in practice your regexes are unlikely to be a significant bottleneck unless you are executing them millions of times, or doing something similarly unusual.
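
As a made-up demonstration with the timeit module (the patterns are contrived, not taken from the question's code), a catastrophically backtracking pattern can be dramatically slower than an equivalent simple one:

import timeit

# a backtracking-prone pattern vs. an equivalent simple one, timed on an
# input that defeats the bad pattern; number is kept small on purpose
slow = timeit.timeit("re.match(r'(a+)+$', 'a' * 20 + 'b')",
                     setup='import re', number=10)
fast = timeit.timeit("re.match(r'a+$', 'a' * 20 + 'b')",
                     setup='import re', number=10)
print 'pathological:', slow, 'simple:', fast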