我必须保存三个数组:40000x2x128x128x128、40000x128x128x128和40000x128x128x128到单个hdf5文件。我使用以下代码将值随机分配给每个数组,并将其写入hdf5文件。但是,我这里有两个问题:
该数组太大,则将出现内存错误。由于内存错误,我无法运行所有迭代。我们该如何解决?
输出hdf5太大,大约40GB。我们如何将其加载到RAM?因为每次我只是随机选择数组中的某个位置进行处理
这是我的代码
INDEX_COLUMN = "{urn:schemas-microsoft-com:office:spreadsheet}Index"
CELL_ELEMENT = "Cell"
DATA_ELEMENT = "Data"
def parse_to_csv_string(xml):
print('parse_to_csv_string')
csv = []
parsed_data = serialize_xml(xml)
rows = list(parsed_data[1][0])
header = get_cells_text(rows[0])
rows.pop(0)
csv.append(join(",", header))
for row in rows:
values = get_cells_text(row)
csv.append(join(",", values))
return join("\n", csv)
def serialize_xml(xml):
return ET.fromstring(xml)
def get_cells_text(row):
keys = []
cells = normalize_row_cells(row)
for elm in cells:
keys.append(elm[0].text or "")
while len(keys) < 92:
keys.append("")
return keys
def normalize_row_cells(row):
cells = list(row)
updated_cells = copy.deepcopy(cells)
pos = 1
for elm in cells:
strIndexAttr = elm.get(INDEX_COLUMN)
index = int(strIndexAttr) if strIndexAttr else pos
while index > pos:
empty_elm = ET.Element(CELL_ELEMENT)
child = ET.SubElement(empty_elm, DATA_ELEMENT)
child.text = ""
updated_cells.insert(pos - 1, empty_elm)
pos += 1
pos += 1
return updated_cells