Why aren't my consumers consuming?

Date: 2015-01-27 13:53:47

Tags: python multiprocessing python-multithreading

EDIT: I have tracked the problem down to the part of my program that downloads the zip file and parses it. If I comment that section out and substitute stock lines, 10,000 lines are parsed with no problem.

I'm not sure how much of this question I should edit to reflect that finding.

I have written a Python program that downloads a zip file containing a single log file of roughly 10,000 lines. It then parses this file line by line and inserts the data into a database.

Eventually my script will run against 200 servers/zip files and process roughly 100,000 lines. (Not every server has the required file.)

At the moment, however, when I run the script with 1 consumer I only get about 13 lines into the database. With 2 consumers I get 24, with 10 consumers 100, and with 20 consumers 240.

Sometimes a run ends with "Consumer Finished!" and some number of entries in the database (far below the 10K-30K I expect), but at other times I get this error message instead:

    Traceback (most recent call last):
      File "C:\Python27\lib\multiprocessing\queues.py", line 262, in _feed
        send(obj)
    IOError: [Errno 232] The pipe is being closed
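
If I understand the error correctly, it means the queue's feeder thread was still pushing items through the pipe when the process on the other end went away. A minimal sketch (my assumption about the mechanism, not my actual code) that should raise the same IOError on Windows:

    import multiprocessing

    def worker(q):
        q.get()                    # take one item, then die early
        raise SystemExit(2)

    if __name__ == "__main__":
        q = multiprocessing.Queue()
        p = multiprocessing.Process(target=worker, args=(q,))
        p.start()
        for i in range(100000):
            q.put("line %d" % i)   # the feeder thread keeps sending...
        p.join()                   # ...after the worker exits -> IOError: [Errno 232]
        q.cancel_join_thread()     # don't block on the dead pipe at shutdown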

What could be causing this? Below is a version of my code, modified to remove sensitive data:

import sys
import os
import os.path
import re
import io
import random
import pprint
import traceback
import urllib
import urllib2
import urlparse
import httplib
import cStringIO
import zipfile
import requests
import psycopg2
import psycopg2.extras
from datetime import date, datetime, time, timedelta
from calculon import Calculon

def daterange(start_date, end_date):
  for n in range(int((end_date - start_date).days)):
    yield start_date + timedelta(n)



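# Producer: for each day in the range, fetch the server's zipped log into
# memory and push every raw line onto Calculon's shared queue (args["_queue"]).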
def producer(args):
  print "Producing!"
  logdt_start = args["logdt_start"]
  logdt_end = args["logdt_end"]
  for single_date in daterange(logdt_start, logdt_end):
    logdt = single_date + timedelta(days=1)
    print "Reading log file..."
    for x in range(1,2):
      servername = "server-{0}".format("%02d" % (x,))
      filename = "zipped_log.log{0}".format(logdt.date().isoformat())
      url = "http://url.to.zip.file/{0}/{1}".format(servername, filename)
      zip_path = 'path/to/file/within/zip/{0}/{1}'.format(servername, filename)
      if httpExists(url):
        try:
          request = urllib2.urlopen(url)
          zipinmemory = cStringIO.StringIO(request.read())
          with zipfile.ZipFile(zipinmemory) as archive:
            with archive.open(zip_path) as log:
              print "File Found! Reading %s..." % filename
              for line in log:
                args["_queue"].put(line)
          print "Queue has approximatly {0} items".format(args["_queue"].qsize())

        except:
          print "exception could not load %s" % url
          traceback.print_exc()
    return True


def httpExists(url):
    host, path = urlparse.urlsplit(url)[1:3]
    found = 0
    try:
        connection = httplib.HTTPConnection(host)  ## Make HTTPConnection Object
        connection.request("HEAD", path)
        responseOb = connection.getresponse()      ## Grab HTTPResponse Object

        if responseOb.status == 200:
            found = 1
        #else:
            #print "Status %d %s : %s" % (responseOb.status, responseOb.reason, url)
    except Exception, e:
        print e.__class__,  e, url
    return found


def parse_log(line):
    if len(line) < 10 or line[0] != '[':
        return {}
    mod_line = line
    mod_line = mod_line.replace('  ', ' ')  # collapse double spaces -- what's this for?
    query_dict = {}
    match = re.search('([\d:\/\s]+)\sUTC', mod_line)
    s = match.start()
    e = match.end() - 5
    query_dict['date_ts'] = datetime.strptime(mod_line[s:e], '%d/%m/%Y %H:%M:%S:%f')
    e = e+2
    mod_line = mod_line[e:]
    match = re.search('(\w+)\sLogger:\s', mod_line)
    e = match.end()
    query_dict['status'] = match.group(1)
    mod_line = mod_line[e:]
    for key_value in re.split(',', mod_line):
        keypair = re.search('(\w+)=(\w+)', key_value)
        key = keypair.group(1)
        value = keypair.group(2)
        query_dict[key] = value
    return query_dict
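
# Example of the line shape parse_log expects -- a hypothetical
# reconstruction from the regexes above, not a real log sample:
#   parse_log("[19/01/2015 10:15:30:123456 UTC] INFO Logger: cmd=foo,user_id=42")
#   -> {'date_ts': datetime(2015, 1, 19, 10, 15, 30, 123450),
#       'status': 'INFO', 'cmd': 'foo', 'user_id': '42'}
# (the e-5 / e+2 index arithmetic trims " UTC" plus the last
# fractional-second digit, hence 123450 rather than 123456)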

def consumer(args):
  global consumed
  consumed += 1
  print "Consumed : {0}".format(consumed)
  try:
    db = args["db"]
    cname = args["cname"]
    arg_value = args["_value"]

    cur = db.cursor()
    error_count = 0


    if arg_value is None:
        print "Consumer Finished!"
        return False
    line = arg_value
    qdict = parse_log(line)

    if len(qdict) == 0:
        print "No data to consumer %s" % cname
        return None

    query = """
    INSERT INTO my_db(date_ts,
        status, cmd, creativeString, environment_id, client_type_id, platform_id, sn_type_id, user_id,
        device_id, invoker_sn_id, invoker_type, poster_sn_id, origin, event_type, creative_id, ch,
         src, afp, cmp, p1, p2,p3)
    VALUES (%(date_ts)s,%(status)s,%(cmd)s,%(creativeString)s,%(environment_id)s,%(client_type_id)s,%(platform_id)s,
    %(sn_type_id)s,%(user_id)s,%(device_id)s,%(invoker_sn_id)s,%(invoker_type)s,%(poster_sn_id)s,%(origin)s,
    %(event_type)s,%(creative_id)s,%(ch)s, %(src)s, %(afp)s, %(cmp)s,
    %(p1)s, %(p2)s, %(p3)s);
    """

    try:
      cur.execute(cur.mogrify(query, qdict))
      db.commit()
      global processed
      processed += 1
      print "processed : {0}".format(processed)
    except:
      error_count = error_count + 1
      print "ERROR in insert {0}".format(error_count)
      traceback.print_exc()
      print qdict
      sys.exit(2)   # exiting here kills the consumer process (likely the source of the pipe error above)
  except:
      print "Error in parsing:  " + line
      traceback.print_exc()
      sys.exit(12)

def main():
  log_start = datetime(2015, 1, 19)
  log_end = datetime(2015, 1, 20)
  consumer_args_list = []

  noOfConsumers = 1
  for x in range(0, noOfConsumers):
    print "Creating Consumer {0}".format(x)
    print "Connecting to logs db..."
    db_name = 'mydb'
    connString = "dbname={0} host={1} port={2} user={3} password={4}".format(
        db_name, 'localhost', 5433, 'postgres', 'pword')
    db = psycopg2.connect(connString)
    consumer_args = {"cname": "CONSUMER_{0}".format(x), "db": db}
    consumer_args_list.append(consumer_args)

  calculon = Calculon(producer,
                      [{"logdt_start": log_start,
                        "logdt_end": log_end}],
                      True,
                      consumer,
                      consumer_args_list,
                      True)
  result = calculon.start()

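# NB: module-level counters. If Calculon runs each consumer in its own
# process (the multiprocessing traceback suggests it does), every process
# increments its own private copy of these.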
consumed = 0
processed = 0
if __name__ == "__main__":
  main()

The output looks like this:

Creating Consumer 0
Connecting to logs db...
Producing!
Reading log file...
File Found! Reading log2015-01-20...
Queue has approximatly 9549 items
Consumed : 1
processed : 1
Consumed : 2
processed : 2
Consumed : 3
processed : 3
Consumed : 4
processed : 4
Consumed : 5
processed : 5
Consumed : 6
processed : 6
Consumed : 7
processed : 7
Consumed : 8
processed : 8
Consumed : 9
processed : 9
Consumed : 10
processed : 10
Consumed : 11
processed : 11
Consumed : 12
processed : 12
Consumed : 13
Traceback (most recent call last):
  File "C:\Python27\lib\multiprocessing\queues.py", line 262, in _feed
    send(obj)
IOError: [Errno 232] The pipe is being closed

1 Answer:

Answer 0 (score: 0):

The errors turned out to be caused by bad lines in the input file, which broke the regular expressions.

For example, one of the values in the comma-separated list was: foobar=2, foo=Something here, is ,a really, poor value, bar=2
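
Splitting that value on commas produces fragments like " is " and "a really" that contain no key=value pair; re.search then returns None, and the following keypair.group(1) call raises AttributeError, which the consumer's blanket except turns into sys.exit. A quick demonstration:

    import re

    bad = "foo=Something here, is ,a really, poor value, bar=2"
    for fragment in re.split(',', bad):
        # ' is ', 'a really' and ' poor value' all print None here
        print repr(fragment), re.search(r'(\w+)=(\w+)', fragment)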

I was able to work around the problem by adding the following code to the consumer method:

    try:
      qdict = parse_adx_client_log(line)
    except:
      qdict = {}
      print "BAD LINE {0}".format(line)

    if len(qdict) == 0:
       print "No data to consumer %s" % cname
       return None
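
A further hardening (my own sketch, not part of the fix above) would be to make the key=value loop inside the parser tolerant, so a single malformed fragment is skipped instead of aborting the whole line:

    def parse_pairs(segment):
        # Tolerant variant of parse_log's key=value loop: fragments that
        # don't look like key=value (e.g. " is ") are skipped, not fatal.
        pairs = {}
        for key_value in re.split(',', segment):
            keypair = re.search(r'(\w+)=(\w+)', key_value)
            if keypair is not None:
                pairs[keypair.group(1)] = keypair.group(2)
        return pairs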