假设这是一个大文件。
我想将文件分成多个块。 例如,如果我有一个500 MB的JSON文件,并且想将其拆分为不同的块。 可接受的最大文件大小为30 MB(30000000字节)。 请问我该怎么做?
{
"start":"HelloI",
"users": [
{
"id": 1,
"name": "Leanne Graham",
"username": "Bret",
"email": "Sincere@april.biz",
"address": {
"street": "Kulas Light",
"suite": "Apt. 556",
"city": "Gwenborough",
"zipcode": "92998-3874",
"geo": {
"lat": "-37.3159",
"lng": "81.1496"
}
},
"phone": "1-770-736-8031 x56442",
"website": "hildegard.org",
"company": {
"name": "Romaguera-Crona",
"catchPhrase": "Multi-layered client-server neural-net",
"bs": "harness real-time e-markets"
}
},
{
"id": 2,
"name": "Ervin Howell",
"username": "Antonette",
"email": "Shanna@melissa.tv",
"address": {
"street": "Victor Plains",
"suite": "Suite 879",
"city": "Wisokyburgh",
"zipcode": "90566-7771",
"geo": {
"lat": "-43.9509",
"lng": "-34.4618"
}
},
"phone": "010-692-6593 x09125",
"website": "anastasia.net",
"company": {
"name": "Deckow-Crist",
"catchPhrase": "Proactive didactic contingency",
"bs": "synergize scalable supply-chains"
}
},
{
"id": 3,
"name": "Clementine Bauch",
"username": "Samantha",
"email": "Nathan@yesenia.net",
"address": {
"street": "Douglas Extension",
"suite": "Suite 847",
"city": "McKenziehaven",
"zipcode": "59590-4157",
"geo": {
"lat": "-68.6102",
"lng": "-47.0653"
}
},
"phone": "1-463-123-4447",
"website": "ramiro.info",
"company": {
"name": "Romaguera-Jacobson",
"catchPhrase": "Face to face bifurcated interface",
"bs": "e-enable strategic applications"
}
},
{
"id": 4,
"name": "Patricia Lebsack",
"username": "Karianne",
"email": "Julianne.OConner@kory.org",
"address": {
"street": "Hoeger Mall",
"suite": "Apt. 692",
"city": "South Elvis",
"zipcode": "53919-4257",
"geo": {
"lat": "29.4572",
"lng": "-164.2990"
}
},
"phone": "493-170-9623 x156",
"website": "kale.biz",
"company": {
"name": "Robel-Corkery",
"catchPhrase": "Multi-tiered zero tolerance productivity",
"bs": "transition cutting-edge web services"
}
},
{
"id": 5,
"name": "Chelsey Dietrich",
"username": "Kamren",
"email": "Lucio_Hettinger@annie.ca",
"address": {
"street": "Skiles Walks",
"suite": "Suite 351",
"city": "Roscoeview",
"zipcode": "33263",
"geo": {
"lat": "-31.8129",
"lng": "62.5342"
}
},
"phone": "(254)954-1289",
"website": "demarco.info",
"company": {
"name": "Keebler LLC",
"catchPhrase": "User-centric fault-tolerant solution",
"bs": "revolutionize end-to-end systems"
}
},
{
"id": 6,
"name": "Mrs. Dennis Schulist",
"username": "Leopoldo_Corkery",
"email": "Karley_Dach@jasper.info",
"address": {
"street": "Norberto Crossing",
"suite": "Apt. 950",
"city": "South Christy",
"zipcode": "23505-1337",
"geo": {
"lat": "-71.4197",
"lng": "71.7478"
}
},
"phone": "1-477-935-8478 x6430",
"website": "ola.org",
"company": {
"name": "Considine-Lockman",
"catchPhrase": "Synchronised bottom-line interface",
"bs": "e-enable innovative applications"
}
},
{
"id": 7,
"name": "Kurtis Weissnat",
"username": "Elwyn.Skiles",
"email": "Telly.Hoeger@billy.biz",
"address": {
"street": "Rex Trail",
"suite": "Suite 280",
"city": "Howemouth",
"zipcode": "58804-1099",
"geo": {
"lat": "24.8918",
"lng": "21.8984"
}
},
"phone": "210.067.6132",
"website": "elvis.io",
"company": {
"name": "Johns Group",
"catchPhrase": "Configurable multimedia task-force",
"bs": "generate enterprise e-tailers"
}
},
{
"id": 8,
"name": "Nicholas Runolfsdottir V",
"username": "Maxime_Nienow",
"email": "Sherwood@rosamond.me",
"address": {
"street": "Ellsworth Summit",
"suite": "Suite 729",
"city": "Aliyaview",
"zipcode": "45169",
"geo": {
"lat": "-14.3990",
"lng": "-120.7677"
}
},
"phone": "586.493.6943 x140",
"website": "jacynthe.com",
"company": {
"name": "Abernathy Group",
"catchPhrase": "Implemented secondary concept",
"bs": "e-enable extensible e-tailers"
}
},
{
"id": 9,
"name": "Glenna Reichert",
"username": "Delphine",
"email": "Chaim_McDermott@dana.io",
"address": {
"street": "Dayna Park",
"suite": "Suite 449",
"city": "Bartholomebury",
"zipcode": "76495-3109",
"geo": {
"lat": "24.6463",
"lng": "-168.8889"
}
},
"phone": "(775)976-6794 x41206",
"website": "conrad.com",
"company": {
"name": "Yost and Sons",
"catchPhrase": "Switchable contextually-based project",
"bs": "aggregate real-time technologies"
}
},
{
"id": 10,
"name": "Clementina DuBuque",
"username": "Moriah.Stanton",
"email": "Rey.Padberg@karina.biz",
"address": {
"street": "Kattie Turnpike",
"suite": "Suite 198",
"city": "Lebsackbury",
"zipcode": "31428-2261",
"geo": {
"lat": "-38.2386",
"lng": "57.2232"
}
},
"phone": "024-648-3804",
"website": "ambrose.net",
"company": {
"name": "Hoeger LLC",
"catchPhrase": "Centralized empowering task-force",
"bs": "target end-to-end models"
}
}
]
}
这是我的代码。我觉得其中有些地方写错了。如有任何帮助,将不胜感激,谢谢。
json_size = 50580490;
MIN_SIZE = 30000000;
data_len = len(file)
get_array_length = len(file["users"])
print("Print data len : ",data_len)
print("Print Get Array length : ", get_array_length)
items = []
if isinstance(file, dict):
print('Valid JSON file found')
# determine number of files necessary
split_into_files = math.ceil(json_size/MIN_SIZE)
print(f'File will be split into {split_into_files} equal parts')
split_data = [[] for i in range(0,split_into_files)]
print('split_data : ', split_data)
starts = [math.floor(i * get_array_length/split_into_files) for i in range(0,split_into_files)]
starts.append(data_len)
print('starts : ', starts)
for i in range(0,split_into_files):
for n in range(starts[i], starts[i+1]):
print('The value for N is: ' , n)
print("split_data[i] :" , split_data[i])
#print(file["users"][n])
split_data[i].append(file["users"][n])
print(split_data[i])
答案 0(得分:1)
我的解决方案可能不是最好的,但它确实有效。
import json
import boto3
import os
import time
import math
# Variable definition.
SESSION_STORAGE = os.environ['JSON_BUCKET']
SESSION = boto3.session.Session()
CURRENT_REGION = SESSION.region_name
S3_CLIENT = boto3.client("s3")
MIN_SIZE = 16000000 # 16 mb
def handler(event, context):
# Instantiate start time.
start_time = time.time()
# Bucket Name where file was uploaded
#bucket = event['Records'][0]['s3']['bucket']['name']
bucket = 'json-upload-bucket' # => testing bucket.
#file_key_name = event['Records'][0]['s3']['object']['key'] # Use this to make it dynamic.
file_key_name = 'XXXXXXXXX-users.json' # This is for testing only.
print("File Key Name", file_key_name)
response = S3_CLIENT.get_object(Bucket=bucket, Key=file_key_name)
#print("Response : ", response)
# json size
json_size = response['ContentLength']
print("json_size : ", json_size)
# Reading content
content = response['Body']
jsonObject = json.loads(content.read())
data = jsonObject['users']
data_len = len(jsonObject)
#print('Length of JSON : ', data_len)
#print('Order array length : ', len(data))
if isinstance(data, list):
data_len = len(data)
print('Valid JSON file found')
if(json_size <= MIN_SIZE):
print('File meets the minimum size.')
else:
# determine number of files necessary
split_into_files = math.ceil(json_size/MIN_SIZE)
print(f'File will be split into {split_into_files} equal parts')
# initialize 2D array
split_data = [[] for i in range(0,split_into_files)]
# determine indices of cutoffs in array
starts = [math.floor(i * data_len/split_into_files) for i in range(0,split_into_files)]
starts.append(data_len)
# loop through 2D array
for i in range(0,split_into_files):
# loop through each range in array
for n in range(starts[i],starts[i+1]):
split_data[i].append(data[n])
print(file_key_name.split('.')[0] + '_' + str(i+1) + '.json')
name = os.path.basename(file_key_name).split('.')[0] + '_' + str(i+1) + '.json'
print('Name : ', name)
folder = '/tmp/'+name
with open(folder, 'w') as outfile:
# restructure the json back to its original state.
generated_json = {
list(jsonObject.keys())[0] : list(jsonObject.values())[0],
list(jsonObject.keys())[1] : split_data[i]}
json.dump(generated_json, outfile, indent=4)
S3_CLIENT.upload_file(folder, bucket, name)
print('Part',str(i+1),'... completed')
else:
print("JSON is not an Array of Objects")
return {
'statusCode': 200,
'body': json.dumps('JSON split completed checks s3.')
}
答案 1(得分:0)
似乎您正在以原始形式拆分数据,这意味着json是分层结构,当您直接拆分数据时,它将无法识别记录,而可能会破坏结构。
您可以先将用户元素读取到其他任何结构中,例如列表/数据框架。
with open('users.json','r') as f:
user_list = json.load(f)
users_data = user_list['users']
(您需要从json文件中单独读取用户列表,因为文件中还有另一个顶层键——例如"start")
然后,您将在users_data中拥有所有记录,然后根据json记录数可以进行相应拆分。 如果要在此过程中增加一些性能以备将来使用-可以对users_data中的记录进行排序,然后将记录拆分为单独的json文件。