我正在一个项目中,我想在arangodb中导入大量数据。所有数据均以.xlsx形式包含多个工作表。因此,我编写了一个脚本,将.xlsx文件转换为json文件(每个工作表一个json文件),然后与arango db建立连接并在arangodb中批量导入数据。因此,我用安装在最新版Anaconda版本的Windows pc上的jupyter笔记本中编写了脚本,该脚本可以与本地或远程数据库连接一起使用。因此,在看到代码可以正常工作之后,我将脚本复制到了CentOS7虚拟服务器上并运行它,然后崩溃了。我在装有ubuntu 19.10的物理机上运行它,它也崩溃了。两台linux机器都进行了更新,并且可以与最新版本的Anaconda一起使用。此外,脚本还在所有计算机(Windows和Linux)中从jupyter Notebook作为.py文件和.ipynb文件在命令行中运行。在Windows中,它在Linux中完美运行,当它开始转换第一个.xlsx文件时崩溃。该脚本的代码是这样的:
from zipfile import ZipFile
from bs4 import BeautifulSoup
import pandas as pd
from xlsx2csv import Xlsx2csv as x2csv
import os
import hashlib
import json
import numpy as np
from arango import ArangoClient
import glob
filelist = []
hash_dict = {}
current_folder = os.getcwd()
for file in os.listdir(current_folder):
if file.endswith(".xlsx"):
filelist.append(file)
#create a list of all worksheets contained in the worksheet
def create_sheet_list(file):
with ZipFile(file) as zipped_file:
summary = zipped_file.open(r'xl/workbook.xml').read()
soup = BeautifulSoup(summary, "xml")
sheets = [sheet.get("name") for sheet in soup.find_all("sheet")]
return sheets
#create an array of dataframes from all the worksheets
def create_dataframes(file):
xl = pd.ExcelFile(file)
xl.sheet_names
dfs = {sheet: xl.parse(sheet) for sheet in xl.sheet_names}
return dfs
def create_json(file,sheets,dfs):
print(("The file contains {} sheets").format(len(sheets)))
count = 0
for i in sheets:
json_filelist = []
count = count + 1
#produce the dataframe and check if there are any encoding errors
try:
df = dfs[i]
new_header = df.iloc[0]
df = df[1:]
df.columns = new_header
df = df.fillna(0)
hash_str_name = file.strip('.xlsx')+("_{}.json").format(i.replace(' ','_'))
hash_str=int(hashlib.sha1(hash_str_name.encode('utf-8')).hexdigest(), 16) % (10 ** 10)
values = str(hash_str)
df['Hash']=np.nan
df['Hash']= df['Hash'].fillna(value=values)
#hash_dict.update({hash_str_name : values})
hash_dict[hash_str_name] = values
json_file = df.reset_index().to_json(new_path+"/"+file.strip('.xlsx')+("_{}.json").format(i.replace(' ','_')), orient = "records")
#For the dataframes that will get an error because of encoding a different way of conversion will be used
except UnicodeEncodeError:
x2csv(file, outputencoding="utf-8").convert(file.strip('.xlsx')+('{}.csv').format(i.replace(' ','_')),count)
df = pd.read_csv(file.strip('.xlsx')+('{}.csv').format(i.replace(' ','_')), header = 1)
hash_str_name = file.strip('.xlsx')+("_{}.json").format(i.replace(' ','_'))
hash_str=int(hashlib.sha1(hash_str_name.encode('utf-8')).hexdigest(), 16) % (10 ** 10)
values = str(hash_str)
df['Hash']=np.nan
df['Hash']= df['Hash'].fillna(value=values)
#hash_dict.update({hash_str_name : values})
hash_dict[hash_str_name] = values
json_file = df.reset_index().to_json(new_path+"/"+file.strip('.xlsx')+("_{}.json").format(i.replace(' ','_')), orient = "records")
os.remove(file.strip('.xlsx')+('{}.csv').format(i.replace(' ','_')))
#Create connection with the Database
def create_db_connection():
client = ArangoClient(hosts='http://127.0.0.1:8529')
db = client.db('CEM', username='root', password='123456')
return db
#Get the list of the .json files from all the folders
def list_of_json():
path = os.getcwd()
folders = os.listdir(path)
json_names = []
for folder in folders:
files = glob.glob(path+"/"+folder+"/"+"*.json")
if len(files)>0:
json_names.append(files)
return json_names
#Get the list of the collections in the database
def list_of_collections(sheets,db):
for col in sheets:
col = col.replace(' ','_')
if db.has_collection(col):
collect = db.collection(col)
else:
collect = db.create_collection(col)
collection = db.collections()
collection = [i['name'] for i in collection if i['name'][0].isupper()]
return collection
#Import the data from the .json files to the appropriate collections
def import_data(json_names,collection, db):
for x in json_names:
for y in x:
for z in collection:
with open(y, "r") as json_file:
if y.endswith("{}.json".format(z)):
data = json.load(json_file)
z = db.collection(z)
z.import_bulk(data)
for file in filelist:
try:
#create the folder where the .json files from that UFED will be stored
new_folder = os.mkdir(os.getcwd()+"/"+file.strip('.xlsx'))
#get the path for the new folder
new_path = "{0}/{1}".format(os.getcwd(), file.strip('.xlsx'))
except FileExistsError:
#if the folder already exists just get its path
new_path = "{0}/{1}".format(os.getcwd(), file.strip('.xlsx'))
print(new_path)
#print the name of the file that's being analyzed so that we have a measure of progress
print(("Now I am working with {} file").format(file))
#call the functions and run the program
create_sheet_list(file)
create_dataframes(file)
sheets = create_sheet_list(file)
dfs = create_dataframes(file)
create_json(file,sheets,dfs)
df_dict = pd.DataFrame(list(hash_dict.items()), index = None, columns = ["File_name", "Hash_num"])
df_dict.to_json(current_folder+"/hash_list.json", orient = "records")
create_db_connection()
db = create_db_connection()
#create_collections(sheets,db)
list_of_json()
json_names = list_of_json()
list_of_collections(sheets,db)
collection = list_of_collections(sheets,db)
import_data(json_names,collection,db)
有人可以帮忙吗?