Splitting pandas column values

Date: 2016-07-18 13:41:29

Tags: python-3.x pandas

I have a dataframe that looks like this. I want to flatten the Final_Column data so that if a cell holds two or more values (separated by commas), the row is duplicated, with the second/third/fourth value going into the next duplicated row.

    Query_Name  Market  Details Final_Column
    0   dummy_queryname dummy_market    23.65316176 [{'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}]    
    1   dummy_queryname dummy_market    45.80583529 [{'name': 'dummy_data', 'parentName': 'irrelevant_scrape', 'parentId': '2662610', 'id': '2684157'}, {'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}] 
    2   dummy_queryname dummy_market    51.36167825 [{'name': 'dummy_data', 'parentName': 'irrelevant_scrape', 'parentId': '2662610', 'id': '2684157'}, {'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}] 
    3   dummy_queryname dummy_market    19.88567955 [{'name': 'relevant_data1', 'parentName': 'relevant_scrape', 'parentId': '2892458', 'id': '2892601'}, {'name': 'relevant_data2', 'parentName': 'relevant_scrape', 'parentId': '2892458', 'id': '2892718'}, {'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}]  relevant_data1,relevant_data2
    4   dummy_queryname dummy_market    35.27507755 [{'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}]    
    5   dummy_queryname dummy_market    61.93743196 [{'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}]    
    6   dummy_queryname dummy_market    36.13855036 [{'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}]    
    7   dummy_queryname dummy_market    14.48145401 [{'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}]    
    8   dummy_queryname dummy_market    71.63468683 [{'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}]    
    9   dummy_queryname dummy_market    54.11091504 [{'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}]    
    10  dummy_queryname dummy_market    75.50864821 [{'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}]    
    11  dummy_queryname dummy_market    99.03587932 [{'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}]    
    12  dummy_queryname dummy_market    90.00233695 [{'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}]    
    13  dummy_queryname dummy_market    71.77914123 [{'name': 'dummy_data', 'parentName': 'irrelevant_scrape', 'parentId': '2790116', 'id': '2790148'}, {'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}] 
    14  dummy_queryname dummy_market    84.74331618 [{'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}]    
    15  dummy_queryname dummy_market    22.85314775 [{'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}]    
    16  dummy_queryname dummy_market    38.54872031 [{'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}]    
    17  dummy_queryname dummy_market    79.04120263 [{'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}]    
    18  dummy_queryname dummy_market    92.68911593 [{'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}]    
    19  dummy_queryname dummy_market    34.7022886  [{'name': 'relevant_data3', 'parentName': 'relevant_scrape', 'parentId': '2892458', 'id': '2892659'}, {'name': 'relevant_data4', 'parentName': 'relevant_scrape', 'parentId': '2892458', 'id': '2892667'}, {'name': 'dummy_data', 'parentName': 'irrelevant_scrape', 'parentId': '2662610', 'id': '2684157'}, {'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}]   relevant_data3,relevant_data4

So that it ends up looking like this (with the duplicates at rows 4 and 21):

Query_Name  Market  Details Final_Column
0   dummy_queryname dummy_market    23.65316176 [{'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}]    
1   dummy_queryname dummy_market    45.80583529 [{'name': 'dummy_data', 'parentName': 'irrelevant_scrape', 'parentId': '2662610', 'id': '2684157'}, {'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}] 
2   dummy_queryname dummy_market    51.36167825 [{'name': 'dummy_data', 'parentName': 'irrelevant_scrape', 'parentId': '2662610', 'id': '2684157'}, {'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}] 
3   dummy_queryname dummy_market    19.88567955 [{'name': 'relevant_data1', 'parentName': 'relevant_scrape', 'parentId': '2892458', 'id': '2892601'}, {'name': 'relevant_data2', 'parentName': 'relevant_scrape', 'parentId': '2892458', 'id': '2892718'}, {'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}]  relevant_data1
4   dummy_queryname dummy_market    19.88567955 [{'name': 'relevant_data1', 'parentName': 'relevant_scrape', 'parentId': '2892458', 'id': '2892601'}, {'name': 'relevant_data2', 'parentName': 'relevant_scrape', 'parentId': '2892458', 'id': '2892718'}, {'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}]  relevant_data2
5   dummy_queryname dummy_market    35.27507755 [{'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}]    
6   dummy_queryname dummy_market    61.93743196 [{'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}]    
7   dummy_queryname dummy_market    36.13855036 [{'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}]    
8   dummy_queryname dummy_market    14.48145401 [{'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}]    
9   dummy_queryname dummy_market    71.63468683 [{'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}]    
10  dummy_queryname dummy_market    54.11091504 [{'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}]    
11  dummy_queryname dummy_market    75.50864821 [{'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}]    
12  dummy_queryname dummy_market    99.03587932 [{'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}]    
13  dummy_queryname dummy_market    90.00233695 [{'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}]    
14  dummy_queryname dummy_market    71.77914123 [{'name': 'dummy_data', 'parentName': 'irrelevant_scrape', 'parentId': '2790116', 'id': '2790148'}, {'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}] 
15  dummy_queryname dummy_market    84.74331618 [{'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}]    
16  dummy_queryname dummy_market    22.85314775 [{'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}]    
17  dummy_queryname dummy_market    38.54872031 [{'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}]    
18  dummy_queryname dummy_market    79.04120263 [{'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}]    
19  dummy_queryname dummy_market    92.68911593 [{'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}]    
20  dummy_queryname dummy_market    34.7022886  [{'name': 'relevant_data3', 'parentName': 'relevant_scrape', 'parentId': '2892458', 'id': '2892659'}, {'name': 'relevant_data4', 'parentName': 'relevant_scrape', 'parentId': '2892458', 'id': '2892667'}, {'name': 'dummy_data', 'parentName': 'irrelevant_scrape', 'parentId': '2662610', 'id': '2684157'}, {'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}]   relevant_data3
21  dummy_queryname dummy_market    34.7022886  [{'name': 'relevant_data3', 'parentName': 'relevant_scrape', 'parentId': '2892458', 'id': '2892659'}, {'name': 'relevant_data4', 'parentName': 'relevant_scrape', 'parentId': '2892458', 'id': '2892667'}, {'name': 'dummy_data', 'parentName': 'irrelevant_scrape', 'parentId': '2662610', 'id': '2684157'}, {'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}]   relevant_data4

1 answer:

Answer 0 (score: 0)

If I understand your question correctly, you want to find the rows whose Final_Column holds more than one piece of relevant data, and duplicate those rows as many times as there are relevant entries in them.

For simplicity, let's assume the dictionary data in your Final_Column has the structure [{'relevantness': <0 or 1>, 'data': <any number>}, ...]
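
For illustration, a minimal toy dataframe with that assumed structure (the column names and values here are made up) could look like this:

import pandas

# Two rows: one with a single irrelevant entry, one with two relevant ones.
toy = pandas.DataFrame({
    'Details': [23.65, 19.89],
    'Final_Column': [
        [{'relevantness': 0, 'data': 12.3}],
        [{'relevantness': 1, 'data': 4.5}, {'relevantness': 1, 'data': 6.7}],
    ],
})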

I suggest you first write a function that will operate on each element of Final_Column and find the number of relevant entries, e.g.:

def find_relevants(dctlist):
    # Count the entries in the list whose 'relevantness' flag is set.
    count = 0
    for it in dctlist:
        relness = it['relevantness']
        if relness > 0:
            count = count + 1
    return count
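
For example, a list containing one relevant and one irrelevant entry yields 1:

>>> find_relevants([{'relevantness': 1, 'data': 2.0},
...                 {'relevantness': 0, 'data': 5.0}])
1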

Then, use it to create another column containing the extracted relevant-data counts:

df['relcount'] = df['Final_Column'].map(find_relevants)
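
(Series.map applies the function element-wise here; df['Final_Column'].apply(find_relevants) would work just as well.)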

After that, it is just a matter of constructing another dataframe by duplicating the rows according to their relcount. (See the DataFrame docs.)

Here, I do it by iterating over the dataframe, building a list of rows, and then using that list to build the final dataframe. This may not be efficient at all, so if that is one of your concerns, you may want to consider another approach.

Assuming you already have the input dataframe df:

import pandas

# temporary accumulation list initialization
rowlist = []
for ind, row in df.iterrows():
    # df.iterrows() lets you traverse rows as pandas Series.
    count = row['relcount']
    if count > 1:
        # [it]*i -> [it,it,it,...] : has i elements
        rowlist.extend([row] * count)
    else:
        rowlist.append(row)
# reset_index(drop=True) fixes the duplicate index labels without
# keeping the old index around as an extra column.
finaldf = pandas.DataFrame(rowlist).reset_index(drop=True)

# we drop the relcount column as we no longer need it
finaldf.drop('relcount',axis=1,inplace=True)
print(finaldf)
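
If the Python-level loop turns out to be too slow, a vectorized sketch is also possible; this assumes the relcount column from above and keeps rows with a count below 1 as single rows:

# Repeat each index label max(relcount, 1) times, then select by label;
# this duplicates the rows without an explicit Python loop.
repeats = df['relcount'].clip(lower=1)
finaldf = df.loc[df.index.repeat(repeats)].reset_index(drop=True)
finaldf = finaldf.drop('relcount', axis=1)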

编辑1:

After reading the comments, I made these modifications. I don't know how you obtain the dataframe, but this looks like something that should be done while the dataframe is being built. Anyway:

Assuming df is your pre-built dataframe, you just need to iterate over it and construct the rows again. There might be higher-level ways to do this, but this way is really straightforward:

import pandas

# Accumulator for new rows
rowlist = []
# Iterating over rows
for ind, row in df.iterrows():
    # list in question
    data = row['Final_Column']
    # data count in the list
    datacount = len(data)
    if datacount < 2:
        # we pass the row as is and continue iteration
        rowlist.append(row)
        continue

    for dat in data:
        # make a duplicate
        newrow = row.copy()
        # change the relevant column with extracted data
        newrow['Final_Column'] = dat
        rowlist.append(newrow)

# Create final dataframe
finaldf = pandas.DataFrame(rowlist)

# Reset the index, note that drop=True drops the old index instead of
# creating a new column and moving it there.
finaldf.reset_index(inplace=True, drop=True)

print(finaldf)
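
For what it's worth, pandas 0.25 and later ship a built-in for this exact reshape; a minimal sketch, assuming Final_Column holds the lists:

# DataFrame.explode turns each list element into its own row,
# repeating the values of the other columns (pandas >= 0.25).
finaldf = df.explode('Final_Column').reset_index(drop=True)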

Addendum

Here are the full scripts I wrote to test the solution:

Data generation

# prep.py
import pandas
import random

headers = "Query_Name Market Details Final_Column".split()

arbnumgen = lambda: random.normalvariate(50,25)

datagen = lambda relev: {'relevantness': relev, 'data': arbnumgen()}

def genfinalcolumn(relcount, irrelcount):
    relevants = [datagen(1) for i in range(relcount)]
    irrelevants = [datagen(0) for i in range(irrelcount)]
    shuffled = relevants + irrelevants
    random.shuffle(shuffled)

    return shuffled


def generate_data():
    dummy_qname = "dummy_qn"
    dummy_market = "dummy_m"

    irrels = [genfinalcolumn(0,1) for i in range(10)]
    rels1 = [genfinalcolumn(1,1) for i in range(3)] # 1 relevant
    rels2 = [genfinalcolumn(2,1) for i in range(4)] # 2 relevant
    rels3 = [genfinalcolumn(2,2) for i in range(3)] # 2 relevant

    data = irrels + rels1 + rels2 + rels3
    random.shuffle(data)

    result = []
    for fincol in data:
        qname = dummy_qname
        market = dummy_market
        details = arbnumgen()
        result.append(dict(zip(headers, (qname,market,details,fincol))))

    return result


def generate_dataframe():
    data = generate_data()
    return pandas.DataFrame.from_records(data)


if __name__ == '__main__':
    df = generate_dataframe()
    df.to_csv('test.csv')

    # from pandas.util.testing import assert_frame_equal
    # parsed = pandas.DataFrame.from_csv('test.csv')
    # assert_frame_equal(parsed, df)

Testing the solution

import ast
import pandas

if __name__ == '__main__':
    # Read the dataframe
    typedict = { 'Query_Name' : str
               , 'Market' : str
               , 'Details': float }
    def fincolconverter(string):
        # literal_eval safely parses the stored list-of-dicts string
        # (a safer alternative to plain eval).
        return ast.literal_eval(string)
    df = pandas.read_csv('test.csv',dtype=typedict, index_col=0,
                         converters={'Final_Column': fincolconverter})

    # Accumulator for new rows
    rowlist = []
    # Iterating over rows
    for ind, row in df.iterrows():
        # list in question
        data = row['Final_Column']
        # data count in the list
        datacount = len(data)
        if datacount < 2:
            # we pass the row as is and continue iteration
            rowlist.append(row)
            continue

        for dat in data:
            # make a duplicate
            newrow = row.copy()
            # change the relevant column with extracted data
            newrow['Final_Column'] = dat
            rowlist.append(newrow)

    # Create final dataframe
    finaldf = pandas.DataFrame(rowlist)

    # Reset the index, note that drop=True drops the old index instead of
    # creating a new column and moving it there.
    finaldf.reset_index(inplace=True, drop=True)

    print(finaldf)