Question

所以我正在从一个目录中读取一堆.xlsx文件，我想将它们转换成一个表。

很简单，但我遇到的问题是这些excel文件的标题（列名）并不相同。如何编写代码来检查每个excel文件的表头：如果已经存在列相同的表，就把数据追加到该表；如果不存在这种列格式的表，则创建一个新表？

我的代码：

import sqlite3 as sql
import pandas as pd
import os


def obtain_data(filename, db):
    """Read ``Sheet1`` of one workbook and stack its rows under ``db``.

    Args:
        filename: Name of the ``.xlsx`` file. It is concatenated directly
            onto the ``'filepath'`` prefix, with no separator inserted.
        db: DataFrame holding the rows accumulated so far.

    Returns:
        A new DataFrame with the rows of ``db`` followed by the rows of the
        workbook, with any ``INDEX`` column renamed to ``INDX``. ``db``
        itself is not modified.
    """
    # Nothing is written through this connection in this function; the
    # try/finally only guarantees it is released if reading the workbook fails.
    connect = sql.connect('filepath.sqlite3')
    try:
        # The context manager closes the workbook handle once it is parsed
        with pd.ExcelFile('filepath' + filename) as workbook:
            df = workbook.parse('Sheet1')
        # pd.concat replaces DataFrame.append, which pandas 2.0 removed
        new_db = pd.concat([db, df])
        print(new_db)
        new_db = new_db.rename(columns={'INDEX': 'INDX'})
    finally:
        connect.close()
    return new_db


# Collect the Excel workbooks found in the input directory, echoing every
# directory entry (prefixed by 'no' when it is not an .xlsx file)
usable_files = []
for filename in os.listdir('filepath'):
    if filename.endswith(".xlsx"):
        print(filename)
        usable_files.append(filename)
    else:
        print('no')
        print(filename)

# obtain_data() already returns the accumulated frame plus the rows of the
# new workbook, so its result replaces new_db. Appending that result to
# new_db again would store every previously read row twice.
new_db = pd.DataFrame()
for xlsx_name in usable_files:
    new_db = obtain_data(xlsx_name, new_db)

注意，我不知道excel文件是否会有匹配的列对。提前谢谢。

更新：

def obtain_data(filename, connect, data):
    """Load one workbook and fingerprint its column layout.

    Args:
        filename: Name of the ``.xlsx`` file. It is concatenated directly
            onto the ``'filepath'`` prefix, with no separator inserted.
        connect: Database connection owned by the caller. It is not used
            or closed here, so the caller can still commit and close it.
        data: List that receives the frame when its column layout has not
            been seen before.

    Returns:
        A dict with the parsed frame under ``'df'`` and the MD5 hex digest
        of its column-name list under ``'hash_t'``.

    Side effects:
        Appends new digests to the module-level ``hash_list``.
    """
    import hashlib  # imported locally: not part of the file's import block

    # Read from the same directory prefix that the file listing uses
    with pd.ExcelFile('filepath' + filename) as workbook:
        df = workbook.parse('Sheet1')
    df = df.rename(columns={'INDEX': 'INDX'})

    # Fingerprint the columns of the workbook that was just read (not of
    # the accumulated frame), so files with the same header list share a
    # digest. The digest depends on column order.
    header_list = str(df.columns.tolist())
    hash_t = hashlib.md5(header_list.encode('utf-8')).hexdigest()

    if hash_t not in hash_list:
        hash_list.append(hash_t)
        # list.append returns None, so the list is mutated in place
        data.append(df)
    else:
        print('hash is repeating. Find a way to make this code get table name.')
    print(filename + ' has been completed succesfully.')
    return {'df': df, 'hash_t': hash_t}


# Collect the Excel workbooks found in the input directory
usable_files = []
for filename in os.listdir('filepath'):
    if filename.endswith(".xlsx"):
        usable_files.append(filename)
    else:
        print('cool')


hash_list = []
data_frames = []
new_db = pd.DataFrame()
for xlsx_name in usable_files:
    connect = sql.connect('filepath_test.sqlite3')
    try:
        # obtain_data() returns a plain dict, so read the frame out of it
        # directly instead of appending the dict to a DataFrame
        result = obtain_data(xlsx_name, connect, data_frames)
        # Rows are accumulated whether or not the header hash is new;
        # pd.concat replaces DataFrame.append, which pandas 2.0 removed
        new_db = pd.concat([new_db, result['df']])
        print(new_db)
        connect.commit()
    finally:
        # Release the connection even when reading a workbook fails
        connect.close()

Answer 1

不确定这是否正是您所需要的，但请看一下。如果两个数据框有共同的列名，它们将被合并，生成一个包含两个数据框所有列的新数据框，并且在共同列上取值相同的条目会合并为一行（我不确定这是否是您想要的）。编辑：相关示例请参见输出中两个Tom是如何合并的。

如果两个数据帧没有任何共同的列，则它们只是连接在一起，从而产生一个包含两个数据帧列的数据帧，但不会合并重叠的条目名称。

我附上了一份（相当长的）打印输出，以便更清楚地说明发生了什么。

import pandas as pd

def merge_dataframes(merge_this_df, with_this_df):
    """Combine two frames, merging on shared columns when there are any.

    Args:
        merge_this_df: Left-hand DataFrame (the accumulated result so far).
        with_this_df: Right-hand DataFrame to fold into it.

    Returns:
        An outer merge of the two frames when they share at least one
        column name, otherwise their row-wise concatenation. Both inputs
        and the result are printed along the way.
    """
    # print() calls with a single argument behave the same on Python 2 and
    # Python 3; the bare print statement is a syntax error on Python 3.
    print("-----------------------------------------------------")
    print("Merging this:")
    print(merge_this_df)
    print("\nWith this:")
    print(with_this_df)
    print("\nResult:")

    # Check if they have common columns
    any_common_columns = any(
        column_name in merge_this_df.columns
        for column_name in with_this_df.columns
    )
    if any_common_columns:
        # With no explicit key, merge() joins on every shared column
        merged_df = merge_this_df.merge(with_this_df, how="outer")
        print(merged_df)
        return merged_df

    concatenated_df = pd.concat([merge_this_df, with_this_df])
    print(concatenated_df)
    return concatenated_df

# Sample frames: the first two share the "name" column, the third has no
# column in common with either of them
all_dfs = [
    pd.DataFrame({
        "name": ["Tom", "David", "Helen"],
        "age": ["30", "40", "50"],
    }),
    pd.DataFrame({
        "name": ["Tom", "Juan", "Maria", "Julia"],
        "occupation": ["Plumber", "Chef", "Astronaut", "Teacher"],
    }),
    pd.DataFrame({
        "animal": ["Cat", "Platypus"],
        "food": ["Catfoot", "Platypus-food"],
    }),
]
df, df2, df3 = all_dfs

# Fold every sample frame into a single result, starting from an empty frame
final_df = pd.DataFrame()
for dataframe in all_dfs:
    final_df = merge_dataframes(final_df, dataframe)

打印输出：

-----------------------------------------------------
Merging this:
Empty DataFrame
Columns: []
Index: []

With this:
  age   name
0  30    Tom
1  40  David
2  50  Helen

Result:
  age   name
0  30    Tom
1  40  David
2  50  Helen
-----------------------------------------------------
Merging this:
  age   name
0  30    Tom
1  40  David
2  50  Helen

With this:
    name occupation
0    Tom    Plumber
1   Juan       Chef
2  Maria  Astronaut
3  Julia    Teacher

Result:
   age   name occupation
0   30    Tom    Plumber
1   40  David        NaN
2   50  Helen        NaN
3  NaN   Juan       Chef
4  NaN  Maria  Astronaut
5  NaN  Julia    Teacher
-----------------------------------------------------
Merging this:
   age   name occupation
0   30    Tom    Plumber
1   40  David        NaN
2   50  Helen        NaN
3  NaN   Juan       Chef
4  NaN  Maria  Astronaut
5  NaN  Julia    Teacher

With this:
     animal           food
0       Cat        Catfoot
1  Platypus  Platypus-food

Result:
   age    animal           food   name occupation
0   30       NaN            NaN    Tom    Plumber
1   40       NaN            NaN  David        NaN
2   50       NaN            NaN  Helen        NaN
3  NaN       NaN            NaN   Juan       Chef
4  NaN       NaN            NaN  Maria  Astronaut
5  NaN       NaN            NaN  Julia    Teacher
0  NaN       Cat        Catfoot    NaN        NaN
1  NaN  Platypus  Platypus-food    NaN        NaN

EDIT2：另一种方法：将sqlite数据库读入pandas dataframe -> 修复与列相关的内容 -> 将pandas dataframe写入sqlite数据库（覆盖之前的数据库）：

import sqlite3 as sql
import pandas as pd
import os

def obtain_data(df_to_add):
    """Fold ``df_to_add`` into table ``my_database`` of ``my_database.sqlite``.

    If the table cannot be read yet, it is created from ``df_to_add``.
    Otherwise the stored rows are read, combined with ``df_to_add`` (outer
    merge when any column name is shared, plain concatenation when none
    is), and the table is overwritten with the combined frame.

    Args:
        df_to_add: DataFrame whose rows and columns should be added.

    Returns:
        None. The result is written to the database and printed.
    """
    # Connect to database
    connect = sql.connect("my_database.sqlite")
    try:
        print("--------------------------------------")
        # Only the initial read is guarded: a DatabaseError raised while
        # writing or re-reading must not be mistaken for a missing table.
        try:
            current_df = pd.read_sql_query("SELECT * FROM my_database", connect)
        except pd.io.sql.DatabaseError:
            # The table could not be read, so simply insert our dataframe
            print("Creating initial database named my_database")
            df_to_add.to_sql("my_database", connect, index=False)
            print("Current database:")
            print(df_to_add)
            return

        print("Database currently looks like:")
        print(current_df)
        # Merge when the stored table and the new frame share a column,
        # otherwise just stack the rows
        if any(c in current_df.columns for c in df_to_add.columns):
            new_df = current_df.merge(df_to_add, how="outer")
        else:
            new_df = pd.concat([current_df, df_to_add])
        # Overwrite the table with the combined frame
        print("Adding to database")
        new_df.to_sql("my_database", connect, if_exists="replace", index=False)
        # For good measure, read the table again and print it out
        database_df = pd.read_sql_query("SELECT * FROM my_database", connect)
        print("Database now looks like:")
        print(database_df)
    finally:
        # Release the connection on every path, including errors
        connect.close()


# Sample frames: the first two share the "name" column, the third has no
# column in common with either of them
df1, df2, df3 = (
    pd.DataFrame({
        "name": ["Tom", "David", "Helen"],
        "age": ["30", "40", "50"],
    }),
    pd.DataFrame({
        "name": ["Tom", "Juan", "Maria", "Julia"],
        "occupation": ["Plumber", "Chef", "Astronaut", "Teacher"],
    }),
    pd.DataFrame({
        "animal": ["Cat", "Platypus"],
        "food": ["Catfoot", "Platypus-food"],
    }),
)

# Push each sample frame into the database in turn
for df in (df1, df2, df3):
    obtain_data(df)

输出：

--------------------------------------
Creating initial database named my_database
Current database:
  age   name
0  30    Tom
1  40  David
2  50  Helen
--------------------------------------
Database currently looks like:
  age   name
0  30    Tom
1  40  David
2  50  Helen
Adding to database
Database now looks like:
    age   name occupation
0    30    Tom    Plumber
1    40  David       None
2    50  Helen       None
3  None   Juan       Chef
4  None  Maria  Astronaut
5  None  Julia    Teacher
--------------------------------------
Database currently looks like:
    age   name occupation
0    30    Tom    Plumber
1    40  David       None
2    50  Helen       None
3  None   Juan       Chef
4  None  Maria  Astronaut
5  None  Julia    Teacher
Adding to database
Database now looks like:
    age    animal           food   name occupation
0    30      None           None    Tom    Plumber
1    40      None           None  David       None
2    50      None           None  Helen       None
3  None      None           None   Juan       Chef
4  None      None           None  Maria  Astronaut
5  None      None           None  Julia    Teacher
6  None       Cat        Catfoot   None       None
7  None  Platypus  Platypus-food   None       None

如果这不是你想要的，请告诉我。

Python Pandas DataFrame匹配表或创建新表

1 个答案: