Switching from multiprocessing to threading

Date: 2012-05-21 15:10:15

Tags: python multithreading multiprocessing

In my project I use the multiprocessing module to run tasks in parallel. I want to use threading instead, since it has better performance here (my tasks are TCP/IP bound, not CPU or I/O bound).

multiprocessing has wonderful features, such as Pool.imap_unordered and Pool.map_async, that do not exist in the threading module.
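
For reference, a thread-backed pool with the same API is multiprocessing.pool.ThreadPool (the class that multiprocessing.dummy.Pool returns). Below is a minimal sketch of the intended swap, with a hypothetical fetch function standing in for a real TCP/IP-bound task:

from multiprocessing.pool import ThreadPool  # thread-backed twin of multiprocessing.Pool

def fetch(host):
    # hypothetical stand-in for a network-bound task
    return len(host)

pool = ThreadPool(4)
# imap_unordered and map_async are available here, unlike in plain threading
for result in pool.imap_unordered(fetch, ['a.example', 'b.example']):
    print result
pool.close()
pool.join()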

What is the right way to convert my code to use threading? The documentation mentions multiprocessing.dummy, a wrapper around the threading module. However, it raises lots of errors (at least on Python 2.7.3):

    pool = multiprocessing.Pool(processes)
  File "C:\python27\lib\multiprocessing\dummy\__init__.py", line 150, in Pool
    return ThreadPool(processes, initializer, initargs)
  File "C:\python27\lib\multiprocessing\pool.py", line 685, in __init__
    Pool.__init__(self, processes, initializer, initargs)
  File "C:\python27\lib\multiprocessing\pool.py", line 136, in __init__
    self._repopulate_pool()
  File "C:\python27\lib\multiprocessing\pool.py", line 199, in _repopulate_pool
    w.start()
  File "C:\python27\lib\multiprocessing\dummy\__init__.py", line 73, in start
    self._parent._children[self] = None
AttributeError: '_DummyThread' object has no attribute '_children'

Edit: What actually happens is that my GUI runs a separate thread (to keep the GUI from getting stuck), and that thread runs the specific search function that creates the failing ThreadPool.
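
A minimal sketch of that scenario, assuming the pool is created from a worker thread (the attribute named in the error may read 'Thread' rather than '_DummyThread', depending on how the thread was created):

import threading
import multiprocessing.dummy as multiprocessing

def search_in_thread():
    # Pool creation registers its workers in the current thread's
    # _children mapping, which this thread does not have.
    pool = multiprocessing.Pool(5)
    print str(pool)

t = threading.Thread(target=search_in_thread)
t.start()
t.join()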

Edit 2: The bug was fixed and will be included in future releases. Great to see a crasher getting fixed!

import urllib2, htmllib, formatter
import multiprocessing.dummy as multiprocessing
import xml.dom.minidom
import os
import string, random
from urlparse import parse_qs, urlparse

from useful_util import retry
import config
from logger import log

class LinksExtractor(htmllib.HTMLParser):
    def __init__(self, formatter):
        htmllib.HTMLParser.__init__(self, formatter)
        self.links = []
        self.ignoredSites = config.WebParser_ignoredSites

    def start_a(self, attrs):
        for attr in attrs:
            if attr[0] == "href" and attr[1].endswith(".mp3"):
                if not filter(lambda x: (x in attr[1]), self.ignoredSites):
                    self.links.append(attr[1])

    def get_links(self):
        return self.links


def GetLinks(url, returnMetaUrlObj=False):
    '''
    Function gathers links from a url.
    @param url: Url Address.
    @param returnMetaUrlObj: If true, returns a MetaUrl Object list.
                             Else, returns a string list. Default is False.

    @return links: The links found on the page.
    '''
    htmlparser = LinksExtractor(formatter.NullFormatter())

    try:
        data = urllib2.urlopen(url)
    except (urllib2.HTTPError, urllib2.URLError) as e:
        log.error(e)
        return []
    htmlparser.feed(data.read())
    htmlparser.close()

    links = list(set(htmlparser.get_links()))

    if returnMetaUrlObj:
        links = map(MetaUrl, links)

    return links

def isAscii(s):
    "Function checks is the string is ascii."
    try:
        s.decode('ascii')
    except (UnicodeEncodeError, UnicodeDecodeError):
        return False
    return True

@retry(Exception, logger=log)
def parse(song, source):
    '''
    Function parses the source search page and returns the .mp3 links in it.
    @param song: Search string.
    @param source: Search website source. Value can be dilandau, mp3skull, youtube, seekasong.

    @return links: .mp3 url links.
    '''
    source = source.lower()
    if source == "dilandau":
        return parse_dilandau(song)
    elif source == "mp3skull":
        return parse_Mp3skull(song)
    elif source == "SeekASong":
        return parse_SeekASong(song)
    elif source == "youtube":
        return parse_Youtube(song)

    log.error('no source "%s". (from parse function in WebParser)' % source)
    return []

def parse_dilandau(song, pages=1):
    "Function connects to Dilandau.eu and returns the .mp3 links in it"
    if not isAscii(song): # Dilandau doesn't like unicode.
        log.warning("Song is not ASCII. Skipping on dilandau")
        return []

    links = []
    song = urllib2.quote(song.encode("utf8"))

    for i in range(pages):
        url = 'http://en.dilandau.eu/download_music/%s-%d.html' % (song.replace('-','').replace(' ','-').replace('--','-').lower(),i+1)
        log.debug("[Dilandau] Parsing %s... " % url)
        links.extend(GetLinks(url, returnMetaUrlObj=True))
    log.debug("[Dilandau] found %d links" % len(links))

    for metaUrl in links:
        metaUrl.source = "Dilandau"

    return links

def parse_Mp3skull(song, pages=1):
    "Function connects to mp3skull.com and returns the .mp3 links in it"
    links = []
    song = urllib2.quote(song.encode("utf8"))

    for i in range(pages):
        # http://mp3skull.com/mp3/how_i_met_your_mother.html
        url = 'http://mp3skull.com/mp3/%s.html' % (song.replace('-','').replace(' ','_').replace('__','_').lower())
        log.debug("[Mp3skull] Parsing %s... " % url)
        links.extend(GetLinks(url, returnMetaUrlObj=True))
    log.debug("[Mp3skull] found %d links" % len(links))

    for metaUrl in links:
        metaUrl.source = "Mp3skull"

    return links

def parse_SeekASong(song):
    "Function connects to seekasong.com and returns the .mp3 links in it"
    song = urllib2.quote(song.encode("utf8"))

    url = 'http://www.seekasong.com/mp3/%s.html' % (song.replace('-','').replace(' ','_').replace('__','_').lower())
    log.debug("[SeekASong] Parsing %s... " % url)
    links = GetLinks(url, returnMetaUrlObj=True)
    for metaUrl in links:
        metaUrl.source = "SeekASong"
    log.debug("[SeekASong] found %d links" % len(links))

    return links

def parse_Youtube(song, amount=10):
    '''
    Function searches a song in youtube.com and returns the clips in it using Youtube API.
    @param song: The search string.
    @param amount: Amount of clips to obtain.

    @return links: List of links.
    '''
    "Function connects to youtube.com and returns the .mp3 links in it"
    song = urllib2.quote(song.encode("utf8"))
    url = r"http://gdata.youtube.com/feeds/api/videos?q=%s&max-results=%d&v=2" % (song.replace(' ', '+'), amount)
    urlObj = urllib2.urlopen(url, timeout=4)
    data = urlObj.read()
    videos = xml.dom.minidom.parseString(data).getElementsByTagName('feed')[0].getElementsByTagName('entry')

    links = []
    for video in videos:
        youtube_watchurl = video.getElementsByTagName('link')[0].attributes.item(0).value
        links.append(get_youtube_hightest_quality_link(youtube_watchurl))

    return links

def get_youtube_hightest_quality_link(youtube_watchurl, priority=config.youtube_quality_priority):
    '''
    Function returns the highest quality link for a specific youtube clip.
    @param youtube_watchurl: The Youtube Watch Url.
    @param priority: A list represents the qualities priority.

    @return MetaUrlObj: MetaUrl Object.
    '''
    video_id = parse_qs(urlparse(youtube_watchurl).query)['v'][0]
    youtube_embedded_watchurl = "http://www.youtube.com/embed/%s?autoplay=1" % video_id

    d = get_youtube_dl_links(video_id)
    for x in priority:
        if x in d.keys():
            return MetaUrl(d[x][0], 'youtube', d['VideoName'], x, youtube_embedded_watchurl)
    log.error("No Youtube link has been found in get_youtube_hightest_quality_link.")
    return ""

@retry(Exception, logger=log)
def get_youtube_dl_links(video_id):
    '''
    Function gets the download links for a youtube clip.
    This function parses the get_video_info format of youtube.

    @param video_id: Youtube Video ID.
    @return d: A dictionary of qualities as keys and urls as values.
    '''
    d = {}

    url = r"http://www.youtube.com/get_video_info?video_id=%s&el=vevo" % video_id

    urlObj = urllib2.urlopen(url, timeout=12)
    data = urlObj.read()
    data = urllib2.unquote(urllib2.unquote(urllib2.unquote(data)))
    data = data.replace(',url', '\nurl')
    data = data.split('\n')

    for line in data:
        if 'timedtext' in line or 'status=fail' in line or '<AdBreaks>' in line:
            continue

        try:
            url = line.split('&quality=')[0].split('url=')[1]
            quality = line.split('&quality=')[1].split('&')[0]
        except IndexError: # line did not contain a url/quality pair
            continue
        if quality in d:
            d[quality].append(url)
        else:
            d[quality] = [url]

    try:
        videoName = "|".join(data).split('&title=')[1].split('&')[0]
    except Exception, e:
        log.error("Could not parse VideoName out of get_video_info (%s)" % str(e))
        videoName = ""

    videoName = unicode(videoName, 'utf-8')
    d['VideoName'] = videoName.replace('+',' ').replace('--','-')
    return d


class NextList(object):
    "A list with a 'next' method."
    def __init__(self, l):
        self.l = l
        self.next_index = 0

    def next(self):
        if self.next_index < len(self.l):
            value = self.l[self.next_index]
            self.next_index += 1
            return value
        else:
            return None

    def isEOF(self):
        " Checks if the list has reached the end "
        return (self.next_index >= len(self.l))

class MetaUrl(object):
    "a url strecture data with many metadata"
    def __init__(self, url, source="", videoName="", quality="", youtube_watchurl=""):
        self.url = str(url)
        self.source = source
        self.videoName = videoName # Youtube links only
        self.quality = quality # Youtube links only
        self.youtube_watchurl = youtube_watchurl # Youtube links only

    def __repr__(self):
        return "<MetaUrl '%s' | %s>" % (self.url, self.source)


def search(song, n, processes=config.search_processes):
    '''
    Function searches song and returns n valid .mp3 links.
    @param song: Search string.
    @param n: Number of songs.
    @param processes: Number of workers to launch in the pool.
    '''
    linksFromSources = []
    pool = multiprocessing.Pool(processes)

    args = [(song, source) for source in config.search_sources]
    imapObj = pool.imap_unordered(_parse_star, args)
    for i in range(len(args)):
        linksFromSources.append(NextList(imapObj.next(15)))
    pool.terminate()

    links = []
    next_source = 0
    while len(links) < n and not all(map(lambda x: x.isEOF(), linksFromSources)):
        nextItem = linksFromSources[next_source].next()
        if nextItem:
            log.debug("added song %.80s from source ID %d (%s)" % (nextItem.url.split('/')[-1], next_source, nextItem.source))
            links.append(nextItem)

        if len(linksFromSources) == next_source+1:
            next_source = 0
        else:
            next_source += 1

    return links

def _parse_star(args):
    return parse(*args)

1 Answer:

Answer (score: 6):

I can't reproduce your problem on my machine. What is in your processes variable? Is it an int?

Python 2.7.3 (default, Apr 10 2012, 23:31:26) [MSC v.1500 32 bit (Intel)] on win32
Type "help", "copyright", "credits" or "license" for more information.
>>> import multiprocessing.dummy as multiprocessing
>>> pool = multiprocessing.Pool(5)
>>> pool
<multiprocessing.pool.ThreadPool object at 0x00C7DF90>
>>>

---- Edit ----

You might also want to double-check that your standard library hasn't been messed with; try a clean install of Python 2.7.3 in another folder.

---- Edit 2 ----

You can quickly patch it like this:

import multiprocessing.dummy
import weakref
import threading

class Worker(threading.Thread):
    def __init__(self):
        threading.Thread.__init__(self)

    def run(self):
        # With the launching thread patched below, the Pool can be
        # created inside this thread without crashing.
        pool = multiprocessing.dummy.Pool(5)
        print str(pool)

w = Worker()
# Pre-seed the attribute that DummyProcess.start() expects to find
# on its parent thread, so the lookup no longer raises AttributeError.
w._children = weakref.WeakKeyDictionary()
w.start()
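
The pre-seeded WeakKeyDictionary matters because DummyProcess.start() registers each pool worker in its parent thread's _children mapping (the self._parent._children[self] = None line in the traceback above); a plain threading.Thread, or a foreign GUI thread, never gets that attribute, hence the AttributeError.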