Optimizing Python code that iterates over millions of documents

Date: 2018-06-11 21:57:58

Tags: python mongodb python-2.7 performance performance-testing

Context

Hi, I have a script that converts Mongo documents to JSON and writes them to a file. Running the script on 1,000 documents takes about 45 seconds.

Dilemma

However, the Mongo collection holds nearly 18 million documents, so the entire collection would take roughly 200 hours.

I am now looking to optimize the code to cut that time down significantly, and I would like to know what my options are.

Possible solutions

I know there are hardware options, such as running on a cluster with faster hard drives, but I would also like to know whether there are optimizations I can make to the algorithm or other areas of the code.

Looking at the profiler, most of the time is spent in {method 'recv' of '_socket.socket' objects} (which appears to be the time spent receiving query results from MongoDB over the network) and in encoder.py:212(iterencode) (the conversion to JSON). Those are built-in functions, but the rest of the time-consuming functions are user-defined.
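One thing the profile suggests (my reading, not something I have benchmarked yet): encoder.py:101(__init__) is called 1,194,719 times, which means json.dumps builds a fresh JSONEncoder for every single entry. Reusing one encoder instance looks like a small, safe change; a minimal sketch, assuming the same separators and bson default used in File 2 below:

import json
from bson import json_util

# Build one encoder up front instead of letting json.dumps create a
# new JSONEncoder object for every single entry
encoder = json.JSONEncoder(separators=(',', ':'), default=json_util.default)

# then inside document_parser's loop, instead of json.dumps(...):
# text_file.write(encoder.encode(sql_entry) + '\n')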

Some possible solutions I am considering are:

1. Batch the output by building a list of strings and writing it with writelines (see the sketch below)
2. Compile to C code using a package like Cython
3. Use faster data structures and algorithms for the processing

As for 3, I am not sure how to implement a faster algorithm, because I have to walk through the nested layers of each document to create the entries.

Specifically, each document contains subdocuments under the key 'd'. Each subdocument in turn contains keys prefixed with 'AT', e.g. 'AT432', and I have to create a separate entry for each 'AT' key. So a single document can produce 50 separate entries.
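A rough, untested sketch of what I mean, combining 1 and 3: flatten_document is a hypothetical helper that replaces subdocument_parser and actiontype_parser with a single pass over the nested 'd' layer, and the output is buffered and written once per file with writelines (this would live inside mapping_functions, where json and json_util are already imported):

def flatten_document(doc):
    # Yield one (subdocument, action type) pair per 'AT*' key,
    # walking the nested 'd' layer a single time
    for subdoc in doc['d']:
        for key in subdoc:
            if key.startswith('AT'):
                yield subdoc, key

def document_parser(collection_name, documents, text_file):
    lines = []
    for doc in documents:
        for subdoc, actiontype in flatten_document(doc):
            sql_entry = nosql_to_sql(collection_name, doc, subdoc, actiontype)
            lines.append(json.dumps(sql_entry,
                                    separators=(',', ':'),
                                    default=json_util.default) + '\n')
    # One buffered write per hour file instead of one write per entry
    text_file.writelines(lines)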

Any help is welcome!

I am open to any ideas, because I really want to reduce the time it takes to convert each document. Thanks for your input!

File 1

from pymongo import MongoClient
import cProfile
import sys
import time
from mapping_functions import create_files
from datetime import datetime
from date_functions import get_array_of_dates

pr = cProfile.Profile()
pr.enable()

start_time = time.time()

# A. Connect to a collection in MongoDB
client = MongoClient('mongo-host-ip', 27017)
db = client.db_name
collection = db.collection_name

# Set time range for query in year, month, day, hour
start = datetime(2017, 12, 29, 10)
end = datetime(2017, 12, 29, 11)
delta_in_minutes = 60

num_documents = 1000
dates = get_array_of_dates(start, end, delta_in_minutes)

create_files(collection, 'collection_name', dates, start_time, num_documents)
pr.disable()
pr.print_stats(sort='time')
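To attack the 'recv' time specifically, one idea would be a single range query per window plus a projection, so Mongo makes fewer round trips and only ships the fields the mapping actually uses. A sketch, assuming the same 'T' field; the projected field list below is illustrative and would need to match the real mapping:

# One range query for the whole hour instead of a separate find() per
# timestamp, and a projection so only the mapped fields cross the network
cursor = collection.find(
    {'T': {'$gte': start, '$lt': end}},
    {'T': 1, 'MP': 1, 'P': 1, 'PB': 1, 'd': 1},
).batch_size(1000)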

File 2

import dateutil.parser
import calendar
import datetime
import time
import json
from bson import json_util
import os


# creates new folder
def createFolder(directory):
    try:
        if not os.path.exists(directory):
            os.makedirs(directory)
    except OSError:
        print ('Error: Creating directory. ' + directory)


# creates a file with the entries for each hour
def create_files(collection_cursor, collection_name, dates, start_time, 
                 num_documents):
    createFolder('./data_hour/' + collection_name + '/')
    for string_time in dates:
        python_datetime = dateutil.parser.parse(string_time)
        documents = collection_cursor.find({'T': python_datetime}).limit(num_documents)
        text_file = open('./data_hour/' + collection_name + '/' + string_time +
                         ".json", "a")
        document_parser(collection_name, documents, text_file)
        text_file.close()


# exports sql entries to a file
def document_parser(collection_name, documents, text_file):
    for doc in documents:
        list_of_subdocuments = subdocument_parser(doc['d'])
        for subdoc in list_of_subdocuments:
            actiontypes = actiontype_parser(subdoc)
            for actiontype in actiontypes:
                sql_entry = nosql_to_sql(collection_name, doc, subdoc,
                                         actiontype)
                text_file.write(json.dumps(sql_entry,
                                           separators=(',', ':'),
                                           default=json_util.default) + '\n')
    return


# returns a list of subdocuments
def subdocument_parser(subdocuments):
    # 'd' is already a list; a shallow copy is all this function does
    return list(subdocuments)


# returns a list of action types
def actiontype_parser(subdocument):
    # Action types are the keys prefixed with 'AT', e.g. 'AT432'
    return [key for key in subdocument if key.startswith('AT')]


def handle_arrays(json, document, key, full_key):
    sorted_array = sorted(document[key])
    comma_separated_string = ','.join(map(str, sorted_array))
    json[full_key] = comma_separated_string
    return


def handle_time(json, document, key, full_key):
    epoch = document[key]
    seconds = calendar.timegm(epoch.utctimetuple())
    json[full_key] = seconds
    return


# Copy all keys in the document except 'd'
# ('mapping', 'prohibited' and 'inventory_or_auction' are module-level
# globals defined elsewhere in mapping_functions; they are not shown here)
def copy_document_keys(json, mapping, collection_name, document,
                       document_keys):
    for key in document_keys:
        if key in prohibited:
            continue
        if key != 'd':
            full_key = mapping[key]
            # sort array and convert to csv
            if key == 'MP' and collection_name in inventory_or_auction:
                full_key += 's'
                handle_arrays(json, document, key, full_key)
                continue
            if key == 'T':
                handle_time(json, document, key, full_key)
                continue
            json[full_key] = document[key]
    return


# Copy all the keys in a subdocument except for action types
def copy_subdocument_keys(json, mapping, subdocument, subdocument_keys):
    for key in subdocument_keys:
        if key in prohibited:
            continue
        if key[:2] != 'AT':
            full_key = mapping[key]
            # sort array and convert to csv
            if key == 'UI' or key == 'DA' or key == 'F':
                handle_arrays(json, subdocument, key, full_key)
                continue
            json[full_key] = subdocument[key]
    return


# copy misc. keys
def copy_misc_keys(json, subdocument, actiontype):
    # If there is no recordType key, then put default value
    if 'recordType' not in json:
        json['recordType'] = 1

    # Copy the action type
    json['actionType'] = int(actiontype[2:])
    json['count'] = int(subdocument[actiontype])
    return


# return a sql entry
def nosql_to_sql(collection_name, document, subdocument, actiontype):
    json = {}

    # Copy all keys in the document except 'd'
    document_keys = document.keys()
    copy_document_keys(json, mapping, collection_name, document, document_keys)

    # Copy all the keys in a subdocument except for action types
    subdocument_keys = subdocument.keys()
    copy_subdocument_keys(json, mapping, subdocument, subdocument_keys)

    # copy action types
    copy_misc_keys(json, subdocument, actiontype)

    return json
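Since each hour's file is independent, the dates could also be split across worker processes. A rough sketch using multiprocessing (process_dates is a hypothetical worker; each worker opens its own MongoClient because client instances are not fork-safe in pymongo):

from datetime import datetime
from multiprocessing import Pool
import time

from pymongo import MongoClient

from date_functions import get_array_of_dates
from mapping_functions import create_files


def process_dates(date_chunk):
    # Each worker opens its own connection: MongoClient is not fork-safe,
    # so it must be created after the worker process starts
    client = MongoClient('mongo-host-ip', 27017)
    collection = client.db_name.collection_name
    create_files(collection, 'collection_name', date_chunk, time.time(), 1000)


if __name__ == '__main__':
    dates = get_array_of_dates(datetime(2017, 12, 29, 10),
                               datetime(2017, 12, 29, 11), 60)
    num_workers = 4
    # Deal the hourly date strings out to the workers round-robin
    chunks = [dates[i::num_workers] for i in range(num_workers)]
    pool = Pool(processes=num_workers)
    pool.map(process_dates, chunks)
    pool.close()
    pool.join()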

Mongo document

{
    "_id" : ObjectId("7dfgdftew564324546ff3"),
    "T" : ISODate("2011-10-13T07:00:00Z"),
    "MP" : [
        40,
        16,
        13,
        11,
        1
    ],
    "P" : 3881,
    "PB" : 12285,
    "d" : [
        {
            "D" : 32,
            "DL" : 0,
            "ST" : 1007,
            "AT315" : NumberLong(5),
            "AT328" : NumberLong(14),
            "AT331" : NumberLong(19),
            "AT306" : NumberLong(19),
            "AT100331" : NumberLong(431),
            "AT500" : 0
        },
        {
            "D" : 16,
            "DL" : 0,
            "ST" : 1007,
            "AT328" : NumberLong(28),
            "AT315" : NumberLong(8),
            "AT331" : NumberLong(36),
            "AT306" : NumberLong(36),
            "AT100331" : NumberLong(953),
            "AT500" : 0
        },
        {
            "D" : 1,
            "DL" : 0,
            "ST" : 1007,
            "AT315" : NumberLong(29),
            "AT331" : NumberLong(34),
            "AT328" : NumberLong(5),
            "AT306" : NumberLong(34),
            "AT100331" : NumberLong(803),
            "AT500" : 0
        },
        {
            "D" : 2,
            "DL" : 0,
            "ST" : 1007,
            "AT328" : NumberLong(1),
            "AT100331" : NumberLong(82),
            "AT306" : NumberLong(1),
            "AT331" : NumberLong(1),
            "AT500" : 0
        }
    ],
    "bn" : NumberLong(21137)
}

Resulting entries written to the file

   {"count":254,"publisher":730,"marketPlaces":"1,13","publication":6452,"deal":0,"hour":1514541600,"deviceType":1,"provider":1011,"actionType":100331,"recordType":1}
{"count":7,"publisher":730,"marketPlaces":"1,13","publication":6452,"deal":0,"hour":1514541600,"deviceType":1,"provider":1011,"actionType":306,"recordType":1}
{"count":7,"publisher":730,"marketPlaces":"1,13","publication":6452,"deal":0,"hour":1514541600,"deviceType":1,"provider":1011,"actionType":331,"recordType":1}
{"count":6,"publisher":730,"marketPlaces":"1,13","publication":6452,"deal":0,"hour":1514541600,"deviceType":1,"provider":1011,"actionType":315,"recordType":1}
{"count":1,"publisher":730,"marketPlaces":"1,13","publication":6452,"deal":0,"hour":1514541600,"deviceType":1,"provider":1011,"actionType":328,"recordType":1}
{"count":0,"publisher":730,"marketPlaces":"1,13","publication":6452,"deal":0,"hour":1514541600,"deviceType":1,"provider":1011,"actionType":500,"recordType":1} ...

Profiling results

         30567605 function calls (30567594 primitive calls) in 48.483 seconds

   Ordered by: internal time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
     5476    6.992    0.001    6.992    0.001 {method 'recv' of '_socket.socket' objects}
  1194719    5.489    0.000    5.489    0.000 encoder.py:212(iterencode)
  1194719    4.869    0.000    4.869    0.000 mapping_functions.py:180(copy_subdocument_keys)
  1194719    4.254    0.000   11.878    0.000 mapping_functions.py:160(copy_document_keys)
  1194719    3.223    0.000   13.123    0.000 __init__.py:193(dumps)
        2    3.175    1.587   48.480   24.240 mapping_functions.py:99(document_parser)
  1194719    2.032    0.000    2.032    0.000 mapping_functions.py:195(copy_misc_keys)
  1194719    1.810    0.000   21.245    0.000 mapping_functions.py:207(nosql_to_sql)
  1194719    1.726    0.000    8.666    0.000 encoder.py:186(encode)
        6    1.606    0.268    1.611    0.268 {bson._cbson.decode_all}
  1194719    1.438    0.000    1.438    0.000 {method 'utctimetuple' of 'datetime.datetime' objects}
  1194719    1.320    0.000    1.438    0.000 calendar.py:611(timegm)
  1194719    1.261    0.000    3.619    0.000 mapping_functions.py:145(handle_arrays)
  1194719    1.234    0.000    1.234    0.000 encoder.py:101(__init__)
  1194720    1.227    0.000    1.227    0.000 {map}
  1194719    1.130    0.000    4.006    0.000 mapping_functions.py:152(handle_time)
  2389438    1.025    0.000    1.025    0.000 {method 'join' of 'str' objects}
  1194719    0.929    0.000    0.929    0.000 {sorted}
  1194719    0.818    0.000    0.818    0.000 {method 'write' of 'file' objects}
  2555978    0.718    0.000    0.718    0.000 {method 'keys' of 'dict' objects}
  2391505    0.628    0.000    0.628    0.000 {isinstance}
   166540    0.573    0.000    0.788    0.000 mapping_functions.py:134(actiontype_parser)
  2558008    0.339    0.000    0.339    0.000 {time.time}
  1361313    0.165    0.000    0.165    0.000 {method 'append' of 'list' objects}
        7    0.123    0.018    0.123    0.018 {time.sleep}
       12    0.121    0.010    7.118    0.593 network.py:166(_receive_data_on_socket)
  1194719    0.118    0.000    0.118    0.000 {method 'toordinal' of 'datetime.date' objects}
     2000    0.037    0.000    0.050    0.000 mapping_functions.py:125(subdocument_parser)
        1    0.035    0.035    0.035    0.035 {method 'connect' of '_socket.socket' objects}
        6    0.030    0.005    0.031    0.005 message.py:953(unpack)
     2002    0.010    0.000    8.942    0.004 cursor.py:1165(next)
     9567    0.006    0.000    0.006    0.000 {len}
     2000    0.004    0.000    0.004    0.000 database.py:402(_fix_outgoing)
     2001    0.004    0.000    0.005    0.000 objectid.py:68(__init__)
        6    0.002    0.000    7.151    1.192 network.py:143(receive_message)
        7    0.001    0.000    8.926    1.275 cursor.py:1057(_refresh)
        1    0.001    0.001    0.001    0.001 {_socket.getaddrinfo}
     2022    0.001    0.000    0.001    0.000 collection.py:306(database)
        2    0.001    0.000    0.001    0.000 {method 'close' of 'file' objects}
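Regarding option 2: the profile shows roughly a third of the total time in the user-defined mapping functions, which is the part Cython could compile (Cython accepts plain Python source). A minimal, untested setup.py sketch, assuming mapping_functions.py compiles as-is:

# setup.py -- build with: python setup.py build_ext --inplace
from distutils.core import setup
from Cython.Build import cythonize

setup(ext_modules=cythonize('mapping_functions.py'))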

0 Answers