How to split a large JSON file into smaller JSON files

Date: 2020-11-05 17:48:54

Tags: python json

Assume the following is a large file.

I want to split the file into multiple chunks. For example, if I have a 500 MB JSON file, I would like to split it into smaller parts, where the maximum acceptable file size is 30 MB (30,000,000 bytes). How can I do this?

{
   "start":"HelloI",
   "users": [
  {
    "id": 1,
    "name": "Leanne Graham",
    "username": "Bret",
    "email": "Sincere@april.biz",
    "address": {
      "street": "Kulas Light",
      "suite": "Apt. 556",
      "city": "Gwenborough",
      "zipcode": "92998-3874",
      "geo": {
        "lat": "-37.3159",
        "lng": "81.1496"
      }
    },
    "phone": "1-770-736-8031 x56442",
    "website": "hildegard.org",
    "company": {
      "name": "Romaguera-Crona",
      "catchPhrase": "Multi-layered client-server neural-net",
      "bs": "harness real-time e-markets"
    }
  },
  {
    "id": 2,
    "name": "Ervin Howell",
    "username": "Antonette",
    "email": "Shanna@melissa.tv",
    "address": {
      "street": "Victor Plains",
      "suite": "Suite 879",
      "city": "Wisokyburgh",
      "zipcode": "90566-7771",
      "geo": {
        "lat": "-43.9509",
        "lng": "-34.4618"
      }
    },
    "phone": "010-692-6593 x09125",
    "website": "anastasia.net",
    "company": {
      "name": "Deckow-Crist",
      "catchPhrase": "Proactive didactic contingency",
      "bs": "synergize scalable supply-chains"
    }
  },
  {
    "id": 3,
    "name": "Clementine Bauch",
    "username": "Samantha",
    "email": "Nathan@yesenia.net",
    "address": {
      "street": "Douglas Extension",
      "suite": "Suite 847",
      "city": "McKenziehaven",
      "zipcode": "59590-4157",
      "geo": {
        "lat": "-68.6102",
        "lng": "-47.0653"
      }
    },
    "phone": "1-463-123-4447",
    "website": "ramiro.info",
    "company": {
      "name": "Romaguera-Jacobson",
      "catchPhrase": "Face to face bifurcated interface",
      "bs": "e-enable strategic applications"
    }
  },
  {
    "id": 4,
    "name": "Patricia Lebsack",
    "username": "Karianne",
    "email": "Julianne.OConner@kory.org",
    "address": {
      "street": "Hoeger Mall",
      "suite": "Apt. 692",
      "city": "South Elvis",
      "zipcode": "53919-4257",
      "geo": {
        "lat": "29.4572",
        "lng": "-164.2990"
      }
    },
    "phone": "493-170-9623 x156",
    "website": "kale.biz",
    "company": {
      "name": "Robel-Corkery",
      "catchPhrase": "Multi-tiered zero tolerance productivity",
      "bs": "transition cutting-edge web services"
    }
  },
  {
    "id": 5,
    "name": "Chelsey Dietrich",
    "username": "Kamren",
    "email": "Lucio_Hettinger@annie.ca",
    "address": {
      "street": "Skiles Walks",
      "suite": "Suite 351",
      "city": "Roscoeview",
      "zipcode": "33263",
      "geo": {
        "lat": "-31.8129",
        "lng": "62.5342"
      }
    },
    "phone": "(254)954-1289",
    "website": "demarco.info",
    "company": {
      "name": "Keebler LLC",
      "catchPhrase": "User-centric fault-tolerant solution",
      "bs": "revolutionize end-to-end systems"
    }
  },
  {
    "id": 6,
    "name": "Mrs. Dennis Schulist",
    "username": "Leopoldo_Corkery",
    "email": "Karley_Dach@jasper.info",
    "address": {
      "street": "Norberto Crossing",
      "suite": "Apt. 950",
      "city": "South Christy",
      "zipcode": "23505-1337",
      "geo": {
        "lat": "-71.4197",
        "lng": "71.7478"
      }
    },
    "phone": "1-477-935-8478 x6430",
    "website": "ola.org",
    "company": {
      "name": "Considine-Lockman",
      "catchPhrase": "Synchronised bottom-line interface",
      "bs": "e-enable innovative applications"
    }
  },
  {
    "id": 7,
    "name": "Kurtis Weissnat",
    "username": "Elwyn.Skiles",
    "email": "Telly.Hoeger@billy.biz",
    "address": {
      "street": "Rex Trail",
      "suite": "Suite 280",
      "city": "Howemouth",
      "zipcode": "58804-1099",
      "geo": {
        "lat": "24.8918",
        "lng": "21.8984"
      }
    },
    "phone": "210.067.6132",
    "website": "elvis.io",
    "company": {
      "name": "Johns Group",
      "catchPhrase": "Configurable multimedia task-force",
      "bs": "generate enterprise e-tailers"
    }
  },
  {
    "id": 8,
    "name": "Nicholas Runolfsdottir V",
    "username": "Maxime_Nienow",
    "email": "Sherwood@rosamond.me",
    "address": {
      "street": "Ellsworth Summit",
      "suite": "Suite 729",
      "city": "Aliyaview",
      "zipcode": "45169",
      "geo": {
        "lat": "-14.3990",
        "lng": "-120.7677"
      }
    },
    "phone": "586.493.6943 x140",
    "website": "jacynthe.com",
    "company": {
      "name": "Abernathy Group",
      "catchPhrase": "Implemented secondary concept",
      "bs": "e-enable extensible e-tailers"
    }
  },
  {
    "id": 9,
    "name": "Glenna Reichert",
    "username": "Delphine",
    "email": "Chaim_McDermott@dana.io",
    "address": {
      "street": "Dayna Park",
      "suite": "Suite 449",
      "city": "Bartholomebury",
      "zipcode": "76495-3109",
      "geo": {
        "lat": "24.6463",
        "lng": "-168.8889"
      }
    },
    "phone": "(775)976-6794 x41206",
    "website": "conrad.com",
    "company": {
      "name": "Yost and Sons",
      "catchPhrase": "Switchable contextually-based project",
      "bs": "aggregate real-time technologies"
    }
  },
  {
    "id": 10,
    "name": "Clementina DuBuque",
    "username": "Moriah.Stanton",
    "email": "Rey.Padberg@karina.biz",
    "address": {
      "street": "Kattie Turnpike",
      "suite": "Suite 198",
      "city": "Lebsackbury",
      "zipcode": "31428-2261",
      "geo": {
        "lat": "-38.2386",
        "lng": "57.2232"
      }
    },
    "phone": "024-648-3804",
    "website": "ambrose.net",
    "company": {
      "name": "Hoeger LLC",
      "catchPhrase": "Centralized empowering task-force",
      "bs": "target end-to-end models"
    }
  }
]
}

Here is my code. I believe I am doing something wrong. Any help would be appreciated. Thank you.

import math

# "file" is the parsed JSON dict (the result of json.load), and json_size
# is the size of the JSON file on disk in bytes.
json_size = 50580490
MIN_SIZE = 30000000
data_len = len(file)

get_array_length = len(file["users"])

print("Print data len : ",data_len)
print("Print  Get Array length  : ", get_array_length)

items = []
if isinstance(file, dict):
  print('Valid JSON file found')

  # determine number of files necessary
  split_into_files = math.ceil(json_size/MIN_SIZE)
  print(f'File will be split into {split_into_files} equal parts')

  split_data = [[] for i in range(0,split_into_files)]
  print('split_data : ', split_data)

  starts = [math.floor(i * get_array_length/split_into_files) for i in range(0,split_into_files)]
  starts.append(data_len)
  print('starts : ', starts)

  for i in range(0,split_into_files):
    for n in range(starts[i], starts[i+1]):
      print('The value for N is: ' , n)     
      print("split_data[i] :" , split_data[i])
      #print(file["users"][n])
      split_data[i].append(file["users"][n])
      print(split_data[i])

2 Answers:

Answer 0 (score: 1)

My solution. It may not be the best, but it works.

import json
import boto3
import os
import time
import math


# Variable definition.
SESSION_STORAGE = os.environ['JSON_BUCKET']
SESSION = boto3.session.Session()
CURRENT_REGION = SESSION.region_name
S3_CLIENT = boto3.client("s3")
MIN_SIZE = 16000000 # 16 mb

def handler(event, context):
    
    # Instantiate start time.
    start_time = time.time()
    
    # Bucket Name where file was uploaded
    #bucket = event['Records'][0]['s3']['bucket']['name']
    bucket = 'json-upload-bucket' # => testing bucket.
    
    #file_key_name = event['Records'][0]['s3']['object']['key'] # Use this to make it dynamic.
    file_key_name = 'XXXXXXXXX-users.json' # This is for testing only. 
    print("File Key Name", file_key_name)
    
    response = S3_CLIENT.get_object(Bucket=bucket, Key=file_key_name)
    #print("Response : ", response)
    
    # json size 
    json_size = response['ContentLength']
    print("json_size : ", json_size)
    
    # Reading content
    content = response['Body']
    jsonObject = json.loads(content.read())
    data = jsonObject['users']
    data_len = len(jsonObject)
    #print('Length of JSON : ', data_len)
    #print('Order array length : ', len(data))
    
    if isinstance(data, list):
        data_len = len(data)
        print('Valid JSON file found')
        
        if json_size <= MIN_SIZE:
            print('File is already within the size limit; no split needed.')
        else:
            # determine number of files necessary
            split_into_files = math.ceil(json_size/MIN_SIZE)
            print(f'File will be split into {split_into_files} equal parts')
            
            # initialize 2D array
            split_data = [[] for i in range(0,split_into_files)]
            
            # determine indices of cutoffs in array
            starts = [math.floor(i * data_len/split_into_files) for i in range(0,split_into_files)]
            starts.append(data_len)
            
            # loop through 2D array
            for i in range(0,split_into_files):
                # loop through each range in array
                for n in range(starts[i],starts[i+1]):
                    split_data[i].append(data[n])
                
                
                print(file_key_name.split('.')[0] + '_' + str(i+1) + '.json')
                name = os.path.basename(file_key_name).split('.')[0] + '_' + str(i+1) + '.json'
                print('Name : ', name)
                folder = '/tmp/'+name
                with open(folder, 'w') as outfile:
                    
                    # restructure the json back to its original state.
                    generated_json = {
                        list(jsonObject.keys())[0] : list(jsonObject.values())[0],
                        list(jsonObject.keys())[1] : split_data[i]}
                    json.dump(generated_json, outfile, indent=4)
                    
                S3_CLIENT.upload_file(folder, bucket, name)
                    
                print('Part',str(i+1),'... completed')
            
    else:
        print("JSON is not an Array of Objects")

    return {
        'statusCode': 200,
        'body': json.dumps('JSON split completed checks s3.')
    }

Answer 1 (score: 0)

It seems you are splitting the data in its raw form. JSON is a hierarchical structure, so splitting it directly will not respect record boundaries and may break the structure.

You can first read the users element into another structure, such as a list or a DataFrame.

with open('users.json','r') as f:
    user_list = json.load(f)
    users_data = user_list['users']

(You need to read from the users list inside the JSON file, because the file also contains another key, e.g. "start".)

Then you will have all the records in users_data, and you can split them according to the number of records. If you want to add some performance headroom for future use, you can sort the records in users_data before splitting them into separate JSON files; a minimal sketch of that splitting step is shown below.
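
The following sketch illustrates that last step, assuming a fixed chunk size of three user records per output file and hypothetical output file names such as users_part_1.json (both the chunk size and the names are illustrative choices, not part of the original answer):

import json
import math

CHUNK_SIZE = 3  # illustrative number of user records per output file

with open('users.json', 'r') as f:
    user_list = json.load(f)
    users_data = user_list['users']

# Number of output files needed for the chosen chunk size.
num_files = math.ceil(len(users_data) / CHUNK_SIZE)

for i in range(num_files):
    chunk = users_data[i * CHUNK_SIZE:(i + 1) * CHUNK_SIZE]
    # Rebuild the original top-level structure ("start" plus a "users" array)
    # so that each part is a valid standalone JSON document.
    part = {"start": user_list["start"], "users": chunk}
    with open(f'users_part_{i + 1}.json', 'w') as outfile:  # hypothetical name
        json.dump(part, outfile, indent=4)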