所以我正在从一个目录中读取一堆.xlsx文件,我想将它们转换成一个表。
听起来很简单,但我遇到的问题是这些Excel文件的标题(列名)并不相同。如何编写代码来检查每个Excel文件的标题,并将其数据附加到具有相同列的表中;如果还不存在具有这种列格式的表,则创建一个新表?
我的代码:
import sqlite3 as sql
import pandas as pd
import os
def obtain_data(filename, db):
    """Read ``Sheet1`` of one workbook and add its rows to ``db``.

    Args:
        filename: Name of an .xlsx file located in the 'filepath' directory.
        db: DataFrame holding the rows accumulated so far.

    Returns:
        A new DataFrame containing the rows of ``db`` followed by the rows
        of the workbook.  ``db`` itself is left unmodified.  A column that
        exists in only one of the two frames is kept; the rows that lack it
        hold NaN.
    """
    renames = {'INDEX': 'INDX'}
    # os.path.join inserts the separator that plain string concatenation
    # would omit; the context manager closes the workbook handle.
    with pd.ExcelFile(os.path.join('filepath', filename)) as workbook:
        df = workbook.parse('Sheet1')
    # Rename *before* combining: otherwise a file using 'INDEX' and an
    # accumulator using 'INDX' would yield two separate columns that end up
    # sharing the name 'INDX' after the rename.
    # DataFrame.append was removed in pandas 2.0; pd.concat replaces it.
    new_db = pd.concat([db.rename(columns=renames),
                        df.rename(columns=renames)])
    print(new_db)
    return new_db
# Collect the Excel workbooks found in the input directory.
usable_files = []
for filename in os.listdir('filepath'):
    if filename.endswith(".xlsx"):
        print(filename)
        usable_files.append(filename)
    else:
        # Report entries that are skipped because they are not .xlsx files.
        print('no')
        print(filename)

# Fold every workbook into one table.
new_db = pd.DataFrame()
for file in usable_files:
    # obtain_data() returns the accumulated table with the new rows already
    # added, so its result replaces new_db.  Appending that result to new_db
    # again would duplicate every previously loaded row.
    new_db = obtain_data(file, new_db)
注意,我不知道excel文件是否会有匹配的列对。提前谢谢。
更新:
def obtain_data(filename, connect, data):
    """Load one workbook and fingerprint its column layout.

    The fingerprint is the MD5 hex digest of the string form of the column
    name list, so two workbooks get the same fingerprint only when they
    have the same column names in the same order.

    Args:
        filename: Name of an .xlsx file located in the 'filepath' directory.
        connect: Open database connection.  It is not used or closed here;
            the caller remains responsible for committing and closing it.
        data: List that receives the DataFrame when its column layout has
            not been seen before.

    Returns:
        dict with keys 'df' (the loaded DataFrame, 'INDEX' renamed to
        'INDX') and 'hash_t' (the layout fingerprint).

    Side effects:
        Appends a newly seen fingerprint to the module-level ``hash_list``.
    """
    # Imported locally because the file's import block does not provide it.
    import hashlib

    # Read from the same directory that the file listing scans.
    with pd.ExcelFile(os.path.join('filepath', filename)) as workbook:
        df = workbook.parse('Sheet1')
    df = df.rename(columns={'INDEX': 'INDX'})

    # Fingerprint the columns of the file just read (not of the accumulated
    # table), so each distinct layout maps to its own hash.
    header_list = str(df.columns.tolist())
    hash_t = hashlib.md5(header_list.encode('utf-8')).hexdigest()

    if hash_t not in hash_list:
        x = pd.DataFrame(df)
        # Tag the frame through DataFrame.attrs: assigning ``x.name`` would
        # overwrite a data column if the sheet has one called "name".
        x.attrs['name'] = hash_t
        print(x.attrs['name'])
        hash_list.append(hash_t)
        # list.append returns None, so its result must not be rebound.
        data.append(x)
    else:
        print('hash is repeating. Find a way to make this code get table name.')
    print(filename + ' has been completed succesfully.')
    final_results = {'df': df, 'hash_t': hash_t}
    return final_results
# Collect the Excel workbooks found in the input directory.
usable_files = []
for filename in os.listdir('filepath'):
    if filename.endswith(".xlsx"):
        usable_files.append(filename)
    else:
        print('cool')

hash_list = []    # column-layout fingerprints seen so far (read by obtain_data)
data_frames = []  # one DataFrame per distinct column layout
new_db = pd.DataFrame()
for file in usable_files:
    connect = sql.connect('filepath_test.sqlite3')
    try:
        # obtain_data() returns a plain dict, so it has to be unpacked
        # rather than appended to a DataFrame.
        result = obtain_data(file, connect, data_frames)
    finally:
        # Each iteration opens its own connection; close it even when the
        # load fails.  Nothing is written through it in this loop, so there
        # is nothing to commit.
        connect.close()
    # Whether or not the layout was seen before, the rows are added to the
    # combined table; columns missing from either side are filled with NaN.
    new_db = pd.concat([new_db, result['df']], ignore_index=True)
print(new_db)
答案 0 :(得分:1)
不确定这是否正是您所需要的,但请看一下。如果两个数据框具有共同的列名,它们将被合并,生成一个包含两个数据框中所有列的新数据框,并且在共同列上取值相同的条目会合并为一行(我不确定这是否是您想要的)。编辑:有关示例,请参阅输出中两个Tom的合并方式。
如果两个数据帧没有任何共同的列,则它们只是连接在一起,从而产生一个包含两个数据帧列的数据帧,但不会合并重叠的条目名称。
我附上了一份(相当长的)打印输出,以便更清楚地说明发生了什么。
import pandas as pd
def merge_dataframes(merge_this_df, with_this_df):
    """Combine two DataFrames, merging on shared columns when there are any.

    Args:
        merge_this_df: Left-hand DataFrame (may be empty).
        with_this_df: Right-hand DataFrame.

    Returns:
        If the frames share at least one column name, the outer merge of
        the two on their shared columns: rows whose shared-column values
        match are combined into one row.  Otherwise the row-wise
        concatenation of both frames, where each row holds NaN in the
        columns that came from the other frame.

    The inputs and the result are printed as a trace of each step.
    """
    print("-----------------------------------------------------")
    print("Merging this:")
    print(merge_this_df)
    print("\nWith this:")
    print(with_this_df)
    print("\nResult:")
    # Check if they have common columns
    any_common_columns = any(
        column_name in merge_this_df.columns
        for column_name in with_this_df.columns
    )
    if any_common_columns:
        result = merge_this_df.merge(with_this_df, how="outer")
    else:
        result = pd.concat([merge_this_df, with_this_df])
    print(result)
    return result
# Three small sample tables: the first two share the "name" column, the
# third has no column in common with the others.
df = pd.DataFrame(dict(
    name=["Tom", "David", "Helen"],
    age=["30", "40", "50"],
))
df2 = pd.DataFrame(dict(
    name=["Tom", "Juan", "Maria", "Julia"],
    occupation=["Plumber", "Chef", "Astronaut", "Teacher"],
))
df3 = pd.DataFrame(dict(
    animal=["Cat", "Platypus"],
    food=["Catfoot", "Platypus-food"],
))

# Gather the sample tables in processing order.
all_dfs = [df, df2, df3]

# Start from an empty table and fold each sample table into it in turn.
final_df = pd.DataFrame()
for dataframe in all_dfs:
    final_df = merge_dataframes(final_df, dataframe)
打印输出:
-----------------------------------------------------
Merging this:
Empty DataFrame
Columns: []
Index: []
With this:
age name
0 30 Tom
1 40 David
2 50 Helen
Result:
age name
0 30 Tom
1 40 David
2 50 Helen
-----------------------------------------------------
Merging this:
age name
0 30 Tom
1 40 David
2 50 Helen
With this:
name occupation
0 Tom Plumber
1 Juan Chef
2 Maria Astronaut
3 Julia Teacher
Result:
age name occupation
0 30 Tom Plumber
1 40 David NaN
2 50 Helen NaN
3 NaN Juan Chef
4 NaN Maria Astronaut
5 NaN Julia Teacher
-----------------------------------------------------
Merging this:
age name occupation
0 30 Tom Plumber
1 40 David NaN
2 50 Helen NaN
3 NaN Juan Chef
4 NaN Maria Astronaut
5 NaN Julia Teacher
With this:
animal food
0 Cat Catfoot
1 Platypus Platypus-food
Result:
age animal food name occupation
0 30 NaN NaN Tom Plumber
1 40 NaN NaN David NaN
2 50 NaN NaN Helen NaN
3 NaN NaN NaN Juan Chef
4 NaN NaN NaN Maria Astronaut
5 NaN NaN NaN Julia Teacher
0 NaN Cat Catfoot NaN NaN
1 NaN Platypus Platypus-food NaN NaN
EDIT2:另一种方法:将sqlite数据库读入pandas数据框 -> 处理与列相关的差异 -> 将pandas数据框写回sqlite数据库(覆盖之前的表):
import sqlite3 as sql
import pandas as pd
import os
def obtain_data(df_to_add):
    """Fold ``df_to_add`` into the ``my_database`` table of my_database.sqlite.

    If the table does not exist yet it is created from ``df_to_add``.
    Otherwise the table is read into a DataFrame and combined with
    ``df_to_add`` (outer merge when they share a column name, row-wise
    concatenation when they do not), and the table is overwritten with the
    combined result.

    Args:
        df_to_add: DataFrame whose rows should be added to the table.

    Returns:
        None.  The database content is printed before and after the update.
    """
    # Connect to database
    connect = sql.connect("my_database.sqlite")
    try:
        print("--------------------------------------")
        # Read current database into a dataframe.  Only the read sits in
        # the inner try, so a failure while writing is never mistaken for
        # "the table does not exist yet".
        try:
            current_df = pd.read_sql_query("SELECT * FROM my_database", connect)
        except pd.io.sql.DatabaseError:
            # There's no table called my_database, so simply insert our dataframe
            print("Creating initial database named my_database")
            df_to_add.to_sql("my_database", connect, index=False)
            print("Current database:")
            print(df_to_add)
            return

        print("Database currently looks like:")
        print(current_df)
        # Now, we check if we have overlapping column names in our database
        # and our dataframe
        if any(c in current_df.columns for c in df_to_add.columns):
            # If we do, we can merge them
            new_df = current_df.merge(df_to_add, how="outer")
        else:
            # If there are no common columns, we just concatenate them
            new_df = pd.concat([current_df, df_to_add])
        # Now, we simply overwrite the table with our current dataframe
        print("Adding to database")
        new_df.to_sql("my_database", connect, if_exists="replace", index=False)
        # For good measure, read database again and print it out
        database_df = pd.read_sql_query("SELECT * FROM my_database", connect)
        print("Database now looks like:")
        print(database_df)
    finally:
        # We're done here: release the connection on every path, including
        # unexpected errors.
        connect.close()
# Three small sample tables: the first two share the "name" column, the
# third has no column in common with the others.
df1 = pd.DataFrame(dict(
    name=["Tom", "David", "Helen"],
    age=["30", "40", "50"],
))
df2 = pd.DataFrame(dict(
    name=["Tom", "Juan", "Maria", "Julia"],
    occupation=["Plumber", "Chef", "Astronaut", "Teacher"],
))
df3 = pd.DataFrame(dict(
    animal=["Cat", "Platypus"],
    food=["Catfoot", "Platypus-food"],
))

# Push each sample table into the database, one after another.
for df in (df1, df2, df3):
    obtain_data(df)
输出:
--------------------------------------
Creating initial database named my_database
Current database:
age name
0 30 Tom
1 40 David
2 50 Helen
--------------------------------------
Database currently looks like:
age name
0 30 Tom
1 40 David
2 50 Helen
Adding to database
Database now looks like:
age name occupation
0 30 Tom Plumber
1 40 David None
2 50 Helen None
3 None Juan Chef
4 None Maria Astronaut
5 None Julia Teacher
--------------------------------------
Database currently looks like:
age name occupation
0 30 Tom Plumber
1 40 David None
2 50 Helen None
3 None Juan Chef
4 None Maria Astronaut
5 None Julia Teacher
Adding to database
Database now looks like:
age animal food name occupation
0 30 None None Tom Plumber
1 40 None None David None
2 50 None None Helen None
3 None None None Juan Chef
4 None None None Maria Astronaut
5 None None None Julia Teacher
6 None Cat Catfoot None None
7 None Platypus Platypus-food None None
如果这不是你想要的,请告诉我。