Hello, I have a script that converts Mongo documents into JSON that is written to a file. Running the script on 1000 documents takes about 45 seconds.
I'm now looking to optimize the code to cut that time down significantly, and I'd like to know what my options are.
I know there are hardware options, such as running on a cluster with faster hard drives, but I'd also like to know about any optimizations I can make to the algorithm and to other areas of the code.
Looking at the profiler output, most of the time is spent in
{method 'recv' of '_socket.socket' objects}
(I'm not sure what this does) and in encoder.py:212(iterencode)
(the conversion to JSON). Those are built-in functions, but the rest of the time-consuming functions are user-defined.
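For what it's worth, the recv time is PyMongo reading the query results off the socket, so one lever is to make the server send less data: fetch only the fields the entries actually use, in larger batches. A minimal sketch of what the find() call in create_files could look like (the projection below assumes only the fields visible in the sample document are needed, which may not match the real schema):

# Sketch: project only the fields used by the entries and fetch in
# larger batches; the projected field names here are an assumption.
documents = (collection_cursor
             .find({'T': python_datetime},
                   {'T': 1, 'MP': 1, 'P': 1, 'PB': 1, 'd': 1, '_id': 0})
             .batch_size(1000)
             .limit(num_documents))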
Some possible solutions I'm considering are:

1. Copying a list of strings to the file with writelines (see the sketch after the next paragraph)
2. Compiling to C code with a package like Cython (see the build-script sketch below)
3. Using faster data structures and algorithms in the process

As for 3., I'm not sure how to implement a faster algorithm, because I have to walk through the nested levels of each document to create the entries. Specifically, each document contains subdocuments under the key 'd', and each subdocument in turn contains keys prefixed with 'AT', e.g. 'AT432'. I have to create a separate entry for each 'AT' key, so a single document can produce 50 separate entries.
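For 1. (combined with a flatter version of the loop from 3.), the idea would be to buffer every serialized entry for an hour and hand them to the file in one writelines call instead of ~1.2 million individual write calls. A minimal, untested sketch of how document_parser could be restructured (it inlines subdocument_parser and actiontype_parser, which only build intermediate lists):

def document_parser(collection_name, documents, text_file):
    # Buffer all serialized entries, then write them out in one call.
    entry_lines = []
    for doc in documents:
        for subdoc in doc['d']:
            for actiontype in (k for k in subdoc if k[:2] == 'AT'):
                sql_entry = nosql_to_sql(collection_name, doc, subdoc,
                                         actiontype)
                entry_lines.append(json.dumps(sql_entry,
                                              separators=(',', ':'),
                                              default=json_util.default) + '\n')
    text_file.writelines(entry_lines)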
I'm open to any and all ideas, because I'd really like to reduce the time it takes to convert each document. Thanks for your input!
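For completeness, here is what a minimal build script for option 2. might look like, assuming Cython is installed (it compiles the module as-is, without adding type annotations; whether this pays off here is untested):

# setup.py - compile mapping_functions.py to a C extension with Cython
from distutils.core import setup
from Cython.Build import cythonize

setup(ext_modules=cythonize('mapping_functions.py'))

It would be built with python setup.py build_ext --inplace, after which the import in File 1 picks up the compiled module.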
File 1
from pymongo import MongoClient
import cProfile
import sys
import time
from mapping_functions import create_files
from datetime import datetime
from date_functions import get_array_of_dates
pr = cProfile.Profile()
pr.enable()
start_time = time.time()
# A. Connect to a collection in MongoDB
client = MongoClient('mongo-host-ip', 27017)
db = client.db_name
collection = db.collection_name
# Set time range for query in year, month, day, hour
start = datetime(2017, 12, 29, 10)
end = datetime(2017, 12, 29, 11)
delta_in_minutes = 60
num_documents = 1000
dates = get_array_of_dates(start, end, delta_in_minutes)
create_files(collection, 'collection_name', dates, start_time, num_documents)
pr.disable()
pr.print_stats(sort='time')
File 2
import dateutil.parser
import calendar
import datetime
import time
import json
from bson import json_util
import os
# creates a new folder
def createFolder(directory):
    try:
        if not os.path.exists(directory):
            os.makedirs(directory)
    except OSError:
        print ('Error: Creating directory. ' + directory)
# creates a file with the entries for each hour
def create_files(collection_cursor, collection_name, dates, start_time,
                 num_documents):
    createFolder('./data_hour/' + collection_name + '/')
    for string_time in dates:
        python_datetime = dateutil.parser.parse(string_time)
        documents = collection_cursor.find({'T': python_datetime}).limit(num_documents)
        text_file = open('./data_hour/' + collection_name + '/' + string_time +
                         ".json", "a")
        document_parser(collection_name, documents, text_file)
        text_file.close()
# exports sql entries to a file
def document_parser(collection_name, documents, text_file):
    for doc in documents:
        list_of_subdocuments = subdocument_parser(doc['d'])
        for subdoc in list_of_subdocuments:
            actiontypes = actiontype_parser(subdoc)
            for actiontype in actiontypes:
                sql_entry = nosql_to_sql(collection_name, doc, subdoc,
                                         actiontype)
                text_file.write(json.dumps(sql_entry,
                                           separators=(',', ':'),
                                           default=json_util.default) + '\n')
    return
# returns a list of subdocuments
def subdocument_parser(subdocuments):
    subdocs = []
    for subdoc in subdocuments:
        subdocs.append(subdoc)
    return subdocs
# returns a list of action types
def actiontype_parser(subdocument):
    actiontypes = []
    # Find all of the keys that represent action types
    keys = subdocument.keys()
    for key in keys:
        if key[:2] == 'AT':
            actiontypes.append(key)
    return actiontypes
def handle_arrays(entry, document, key, full_key):
    # Sort the array and store it as a comma-separated string
    sorted_array = sorted(document[key])
    comma_separated_string = ','.join(map(str, sorted_array))
    entry[full_key] = comma_separated_string
    return
def handle_time(entry, document, key, full_key):
    # Convert the datetime to seconds since the epoch
    epoch = document[key]
    seconds = calendar.timegm(epoch.utctimetuple())
    entry[full_key] = seconds
    return
# Copy all keys in the document except 'd'
# ('mapping', 'prohibited' and 'inventory_or_auction' are module-level
# constants; their definitions are omitted from this post)
def copy_document_keys(entry, mapping, collection_name, document,
                       document_keys):
    for key in document_keys:
        if key in prohibited:
            continue
        if key != 'd':
            full_key = mapping[key]
            # sort array and convert to csv
            if key == 'MP' and collection_name in inventory_or_auction:
                full_key += 's'
                handle_arrays(entry, document, key, full_key)
                continue
            if key == 'T':
                handle_time(entry, document, key, full_key)
                continue
            entry[full_key] = document[key]
    return
# Copy all the keys in a subdocument except for action types
def copy_subdocument_keys(entry, mapping, subdocument, subdocument_keys):
    for key in subdocument_keys:
        if key in prohibited:
            continue
        if key[:2] != 'AT':
            full_key = mapping[key]
            # sort array and convert to csv
            if key == 'UI' or key == 'DA' or key == 'F':
                handle_arrays(entry, subdocument, key, full_key)
                continue
            entry[full_key] = subdocument[key]
    return
# copy misc. keys
def copy_misc_keys(entry, subdocument, actiontype):
    # If there is no recordType key, then put default value
    if 'recordType' not in entry:
        entry['recordType'] = 1
    # Copy the action type
    entry['actionType'] = int(actiontype[2:])
    entry['count'] = int(subdocument[actiontype])
    return
# return a sql entry
def nosql_to_sql(collection_name, document, subdocument, actiontype):
    entry = {}
    # Copy all keys in the document except 'd'
    document_keys = document.keys()
    copy_document_keys(entry, mapping, collection_name, document, document_keys)
    # Copy all the keys in a subdocument except for action types
    subdocument_keys = subdocument.keys()
    copy_subdocument_keys(entry, mapping, subdocument, subdocument_keys)
    # copy action types
    copy_misc_keys(entry, subdocument, actiontype)
    return entry
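One more idea, based on the profile below: encoder.py:101(__init__) runs 1,194,719 times because every json.dumps call constructs a fresh JSONEncoder. A minimal sketch of building the encoder once and reusing it for every entry (serialize_entry is a hypothetical helper; the output should be identical, but the speedup is untested):

# Create one encoder up front instead of once per json.dumps call.
encoder = json.JSONEncoder(separators=(',', ':'), default=json_util.default)

def serialize_entry(sql_entry):
    return encoder.encode(sql_entry) + '\n'

document_parser would then call serialize_entry(sql_entry) instead of json.dumps(...).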
Mongo document
{
    "_id" : ObjectId("7dfgdftew564324546ff3"),
    "T" : ISODate("2011-10-13T07:00:00Z"),
    "MP" : [
        40,
        16,
        13,
        11,
        1
    ],
    "P" : 3881,
    "PB" : 12285,
    "d" : [
        {
            "D" : 32,
            "DL" : 0,
            "ST" : 1007,
            "AT315" : NumberLong(5),
            "AT328" : NumberLong(14),
            "AT331" : NumberLong(19),
            "AT306" : NumberLong(19),
            "AT100331" : NumberLong(431),
            "AT500" : 0
        },
        {
            "D" : 16,
            "DL" : 0,
            "ST" : 1007,
            "AT328" : NumberLong(28),
            "AT315" : NumberLong(8),
            "AT331" : NumberLong(36),
            "AT306" : NumberLong(36),
            "AT100331" : NumberLong(953),
            "AT500" : 0
        },
        {
            "D" : 1,
            "DL" : 0,
            "ST" : 1007,
            "AT315" : NumberLong(29),
            "AT331" : NumberLong(34),
            "AT328" : NumberLong(5),
            "AT306" : NumberLong(34),
            "AT100331" : NumberLong(803),
            "AT500" : 0
        },
        {
            "D" : 2,
            "DL" : 0,
            "ST" : 1007,
            "AT328" : NumberLong(1),
            "AT100331" : NumberLong(82),
            "AT306" : NumberLong(1),
            "AT331" : NumberLong(1),
            "AT500" : 0
        }
    ],
    "bn" : NumberLong(21137)
}
Resulting entries written to the file
{"count":254,"publisher":730,"marketPlaces":"1,13","publication":6452,"deal":0,"hour":1514541600,"deviceType":1,"provider":1011,"actionType":100331,"recordType":1}
{"count":7,"publisher":730,"marketPlaces":"1,13","publication":6452,"deal":0,"hour":1514541600,"deviceType":1,"provider":1011,"actionType":306,"recordType":1}
{"count":7,"publisher":730,"marketPlaces":"1,13","publication":6452,"deal":0,"hour":1514541600,"deviceType":1,"provider":1011,"actionType":331,"recordType":1}
{"count":6,"publisher":730,"marketPlaces":"1,13","publication":6452,"deal":0,"hour":1514541600,"deviceType":1,"provider":1011,"actionType":315,"recordType":1}
{"count":1,"publisher":730,"marketPlaces":"1,13","publication":6452,"deal":0,"hour":1514541600,"deviceType":1,"provider":1011,"actionType":328,"recordType":1}
{"count":0,"publisher":730,"marketPlaces":"1,13","publication":6452,"deal":0,"hour":1514541600,"deviceType":1,"provider":1011,"actionType":500,"recordType":1} ...
Profiling results
30567605 function calls (30567594 primitive calls) in 48.483 seconds
Ordered by: internal time
ncalls tottime percall cumtime percall filename:lineno(function)
5476 6.992 0.001 6.992 0.001 {method 'recv' of '_socket.socket' objects}
1194719 5.489 0.000 5.489 0.000 encoder.py:212(iterencode)
1194719 4.869 0.000 4.869 0.000 mapping_functions.py:180(copy_subdocument_keys)
1194719 4.254 0.000 11.878 0.000 mapping_functions.py:160(copy_document_keys)
1194719 3.223 0.000 13.123 0.000 __init__.py:193(dumps)
2 3.175 1.587 48.480 24.240 mapping_functions.py:99(document_parser)
1194719 2.032 0.000 2.032 0.000 mapping_functions.py:195(copy_misc_keys)
1194719 1.810 0.000 21.245 0.000 mapping_functions.py:207(nosql_to_sql)
1194719 1.726 0.000 8.666 0.000 encoder.py:186(encode)
6 1.606 0.268 1.611 0.268 {bson._cbson.decode_all}
1194719 1.438 0.000 1.438 0.000 {method 'utctimetuple' of 'datetime.datetime' objects}
1194719 1.320 0.000 1.438 0.000 calendar.py:611(timegm)
1194719 1.261 0.000 3.619 0.000 mapping_functions.py:145(handle_arrays)
1194719 1.234 0.000 1.234 0.000 encoder.py:101(__init__)
1194720 1.227 0.000 1.227 0.000 {map}
1194719 1.130 0.000 4.006 0.000 mapping_functions.py:152(handle_time)
2389438 1.025 0.000 1.025 0.000 {method 'join' of 'str' objects}
1194719 0.929 0.000 0.929 0.000 {sorted}
1194719 0.818 0.000 0.818 0.000 {method 'write' of 'file' objects}
2555978 0.718 0.000 0.718 0.000 {method 'keys' of 'dict' objects}
2391505 0.628 0.000 0.628 0.000 {isinstance}
166540 0.573 0.000 0.788 0.000 mapping_functions.py:134(actiontype_parser)
2558008 0.339 0.000 0.339 0.000 {time.time}
1361313 0.165 0.000 0.165 0.000 {method 'append' of 'list' objects}
7 0.123 0.018 0.123 0.018 {time.sleep}
12 0.121 0.010 7.118 0.593 network.py:166(_receive_data_on_socket)
1194719 0.118 0.000 0.118 0.000 {method 'toordinal' of 'datetime.date' objects}
2000 0.037 0.000 0.050 0.000 mapping_functions.py:125(subdocument_parser)
1 0.035 0.035 0.035 0.035 {method 'connect' of '_socket.socket' objects}
6 0.030 0.005 0.031 0.005 message.py:953(unpack)
2002 0.010 0.000 8.942 0.004 cursor.py:1165(next)
9567 0.006 0.000 0.006 0.000 {len}
2000 0.004 0.000 0.004 0.000 database.py:402(_fix_outgoing)
2001 0.004 0.000 0.005 0.000 objectid.py:68(__init__)
6 0.002 0.000 7.151 1.192 network.py:143(receive_message)
7 0.001 0.000 8.926 1.275 cursor.py:1057(_refresh)
1 0.001 0.001 0.001 0.001 {_socket.getaddrinfo}
2022 0.001 0.000 0.001 0.000 collection.py:306(database)
2 0.001 0.000 0.001 0.000 {method 'close' of 'file' objects}