I have a dataframe that looks like this. I want to flatten the Final_Column data, so that if a cell holds two values (comma-separated), the row is duplicated, with the second/third/fourth value placed in the next duplicated row.
Query_Name Market Details Final_Column
0 dummy_queryname dummy_market 23.65316176 [{'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}]
1 dummy_queryname dummy_market 45.80583529 [{'name': 'dummy_data', 'parentName': 'irrelevant_scrape', 'parentId': '2662610', 'id': '2684157'}, {'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}]
2 dummy_queryname dummy_market 51.36167825 [{'name': 'dummy_data', 'parentName': 'irrelevant_scrape', 'parentId': '2662610', 'id': '2684157'}, {'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}]
3 dummy_queryname dummy_market 19.88567955 [{'name': 'relevant_data1', 'parentName': 'relevant_scrape', 'parentId': '2892458', 'id': '2892601'}, {'name': 'relevant_data2', 'parentName': 'relevant_scrape', 'parentId': '2892458', 'id': '2892718'}, {'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}] relevant_data1,relevant_data2
4 dummy_queryname dummy_market 35.27507755 [{'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}]
5 dummy_queryname dummy_market 61.93743196 [{'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}]
6 dummy_queryname dummy_market 36.13855036 [{'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}]
7 dummy_queryname dummy_market 14.48145401 [{'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}]
8 dummy_queryname dummy_market 71.63468683 [{'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}]
9 dummy_queryname dummy_market 54.11091504 [{'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}]
10 dummy_queryname dummy_market 75.50864821 [{'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}]
11 dummy_queryname dummy_market 99.03587932 [{'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}]
12 dummy_queryname dummy_market 90.00233695 [{'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}]
13 dummy_queryname dummy_market 71.77914123 [{'name': 'dummy_data', 'parentName': 'irrelevant_scrape', 'parentId': '2790116', 'id': '2790148'}, {'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}]
14 dummy_queryname dummy_market 84.74331618 [{'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}]
15 dummy_queryname dummy_market 22.85314775 [{'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}]
16 dummy_queryname dummy_market 38.54872031 [{'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}]
17 dummy_queryname dummy_market 79.04120263 [{'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}]
18 dummy_queryname dummy_market 92.68911593 [{'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}]
19 dummy_queryname dummy_market 34.7022886 [{'name': 'relevant_data3', 'parentName': 'relevant_scrape', 'parentId': '2892458', 'id': '2892659'}, {'name': 'relevant_data4', 'parentName': 'relevant_scrape', 'parentId': '2892458', 'id': '2892667'}, {'name': 'dummy_data', 'parentName': 'irrelevant_scrape', 'parentId': '2662610', 'id': '2684157'}, {'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}] relevant_data3,relevant_data4
So it would end up looking like this (duplicated at rows 4 and 21):
Query_Name Market Details Final_Column
0 dummy_queryname dummy_market 23.65316176 [{'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}]
1 dummy_queryname dummy_market 45.80583529 [{'name': 'dummy_data', 'parentName': 'irrelevant_scrape', 'parentId': '2662610', 'id': '2684157'}, {'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}]
2 dummy_queryname dummy_market 51.36167825 [{'name': 'dummy_data', 'parentName': 'irrelevant_scrape', 'parentId': '2662610', 'id': '2684157'}, {'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}]
3 dummy_queryname dummy_market 19.88567955 [{'name': 'relevant_data1', 'parentName': 'relevant_scrape', 'parentId': '2892458', 'id': '2892601'}, {'name': 'relevant_data2', 'parentName': 'relevant_scrape', 'parentId': '2892458', 'id': '2892718'}, {'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}] relevant_data1
4 dummy_queryname dummy_market 19.88567955 [{'name': 'relevant_data1', 'parentName': 'relevant_scrape', 'parentId': '2892458', 'id': '2892601'}, {'name': 'relevant_data2', 'parentName': 'relevant_scrape', 'parentId': '2892458', 'id': '2892718'}, {'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}] relevant_data2
5 dummy_queryname dummy_market 35.27507755 [{'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}]
6 dummy_queryname dummy_market 61.93743196 [{'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}]
7 dummy_queryname dummy_market 36.13855036 [{'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}]
8 dummy_queryname dummy_market 14.48145401 [{'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}]
9 dummy_queryname dummy_market 71.63468683 [{'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}]
10 dummy_queryname dummy_market 54.11091504 [{'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}]
11 dummy_queryname dummy_market 75.50864821 [{'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}]
12 dummy_queryname dummy_market 99.03587932 [{'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}]
13 dummy_queryname dummy_market 90.00233695 [{'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}]
14 dummy_queryname dummy_market 71.77914123 [{'name': 'dummy_data', 'parentName': 'irrelevant_scrape', 'parentId': '2790116', 'id': '2790148'}, {'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}]
15 dummy_queryname dummy_market 84.74331618 [{'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}]
16 dummy_queryname dummy_market 22.85314775 [{'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}]
17 dummy_queryname dummy_market 38.54872031 [{'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}]
18 dummy_queryname dummy_market 79.04120263 [{'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}]
19 dummy_queryname dummy_market 92.68911593 [{'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}]
20 dummy_queryname dummy_market 34.7022886 [{'name': 'relevant_data3', 'parentName': 'relevant_scrape', 'parentId': '2892458', 'id': '2892659'}, {'name': 'relevant_data4', 'parentName': 'relevant_scrape', 'parentId': '2892458', 'id': '2892667'}, {'name': 'dummy_data', 'parentName': 'irrelevant_scrape', 'parentId': '2662610', 'id': '2684157'}, {'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}] relevant_data3
21 dummy_queryname dummy_market 34.7022886 [{'name': 'relevant_data3', 'parentName': 'relevant_scrape', 'parentId': '2892458', 'id': '2892659'}, {'name': 'relevant_data4', 'parentName': 'relevant_scrape', 'parentId': '2892458', 'id': '2892667'}, {'name': 'dummy_data', 'parentName': 'irrelevant_scrape', 'parentId': '2662610', 'id': '2684157'}, {'name': 'dummy_data', 'parentName': 'dummy_data', 'parentId': '2517840', 'id': '2565351'}] relevant_data4
Answer (score: 0)
If I understand your question correctly, you want to find the rows whose 'Final_Column' holds more than one relevant entry, and multiply those rows by the number of relevant entries present in each.
For simplicity, assume that the dictionary data in your Final_Column has the structure [{'relevantness': <0 or 1>, 'data': <any number>}, ...].
I suggest you first write a function that operates on each element of Final_Column and finds the number of relevant entries, e.g.:
def find_relevants(dctlist):
    # Count the entries in the list whose 'relevantness' flag is set.
    count = 0
    for it in dctlist:
        relness = it['relevantness']
        if relness > 0:
            count = count + 1
    return count
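For a quick sanity check, it behaves like this on a made-up list that follows the assumed structure (the values below are purely illustrative):

sample = [{'relevantness': 1, 'data': 42.0},
          {'relevantness': 0, 'data': 7.5},
          {'relevantness': 1, 'data': 3.1}]
print(find_relevants(sample))  # prints 2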
Then use it to create another column containing the extracted relevant-entry counts:
df['relcount'] = df['Final_Column'].map(find_relevants)
After that, all that is left is to build another dataframe by duplicating each row according to its repetition count (see the DataFrame docs).
Here, I did it by iterating over the dataframe, building a list of rows, and then using that list to construct the final dataframe. This is probably not efficient at all, so if efficiency is one of your concerns, you may want to consider another approach.
Assuming you already have your input dataframe df:
import pandas

# temporary accumulation list initialization
rowlist = []
for ind, row in df.iterrows():
    # df.iterrows() lets you traverse rows as pandas Series.
    count = row['relcount']
    if count > 1:
        # [it]*i -> [it,it,it,...] : has i elements
        rowlist.extend([row] * count)
    else:
        rowlist.append(row)
# reset_index(drop=True) fixes the duplicate index values without
# moving the old index into a new column.
finaldf = pandas.DataFrame(rowlist).reset_index(drop=True)
# we drop the relcount column as we no longer need it
finaldf.drop('relcount', axis=1, inplace=True)
print(finaldf)
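If the loop above ever becomes a bottleneck, a vectorized sketch of the same duplication using Index.repeat should also work; treat this as a sketch assuming the relcount column from above, not something tested against your exact data:

# Repeat each row label max(relcount, 1) times, then select the rows
# in a single vectorized .loc call instead of looping.
repeats = df['relcount'].clip(lower=1)
finaldf = df.loc[df.index.repeat(repeats)].reset_index(drop=True)
finaldf = finaldf.drop('relcount', axis=1)
print(finaldf)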
After reading the comments, I made the following modifications. I don't know how you obtain the dataframe, but this looks like something that should be done while the dataframe is being built. Anyway:
Assuming df is your pre-built dataframe, you just need to iterate over the dataframe and construct the rows again. There may be higher-level ways to do this, but doing it this way is really simple:
import pandas

# Accumulator for new rows
rowlist = []
# Iterating over rows
for ind, row in df.iterrows():
    # list in question
    data = row['Final_Column']
    # data count in the list
    datacount = len(data)
    if datacount < 2:
        # we pass the row as is and continue iteration
        rowlist.append(row)
        continue
    for dat in data:
        # make a duplicate
        newrow = row.copy()
        # change the relevant column with extracted data
        newrow['Final_Column'] = dat
        rowlist.append(newrow)
# Create final dataframe
finaldf = pandas.DataFrame(rowlist)
# Reset the index, note that drop=True drops the old index instead of
# creating a new column and moving it there.
finaldf.reset_index(inplace=True, drop=True)
print(finaldf)
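As one of those higher-level ways: newer pandas versions (0.25 and up) provide DataFrame.explode, which performs this duplication in a single call. A minimal sketch, assuming Final_Column holds lists; note that, unlike the loop above, explode also unwraps single-element lists into their lone element:

# Each list element in Final_Column becomes its own row, with the
# remaining columns duplicated (requires pandas >= 0.25).
finaldf = df.explode('Final_Column').reset_index(drop=True)
print(finaldf)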
Here are the complete scripts I wrote to test the solution (first a data generator, then the solution itself):
# prep.py
import pandas
import random

headers = "Query_Name Market Details Final_Column".split()

arbnumgen = lambda: random.normalvariate(50, 25)
datagen = lambda relev: {'relevantness': relev, 'data': arbnumgen()}

def genfinalcolumn(relcount, irrelcount):
    relevants = [datagen(1) for i in range(relcount)]
    irrelevants = [datagen(0) for i in range(irrelcount)]
    shuffled = relevants + irrelevants
    random.shuffle(shuffled)
    return shuffled

def generate_data():
    dummy_qname = "dummy_qn"
    dummy_market = "dummy_m"
    irrels = [genfinalcolumn(0, 1) for i in range(10)]
    rels1 = [genfinalcolumn(1, 1) for i in range(3)]  # 1 relevant
    rels2 = [genfinalcolumn(2, 1) for i in range(4)]  # 2 relevant
    rels3 = [genfinalcolumn(2, 2) for i in range(3)]  # 2 relevant
    data = irrels + rels1 + rels2 + rels3
    random.shuffle(data)
    result = []
    for fincol in data:
        qname = dummy_qname
        market = dummy_market
        details = arbnumgen()
        result.append(dict(zip(headers, (qname, market, details, fincol))))
    return result

def generate_dataframe():
    data = generate_data()
    return pandas.DataFrame.from_records(data)

if __name__ == '__main__':
    df = generate_dataframe()
    df.to_csv('test.csv')
    # from pandas.util.testing import assert_frame_equal
    # parsed = pandas.read_csv('test.csv', index_col=0)
    # assert_frame_equal(parsed, df)
import pandas

if __name__ == '__main__':
    # Read the dataframe
    typedict = { 'Query_Name' : str
               , 'Market' : str
               , 'Details': float }
    def fincolconverter(string):
        return eval(string)
    df = pandas.read_csv('test.csv', dtype=typedict, index_col=0,
                         converters={'Final_Column': fincolconverter})
    # Accumulator for new rows
    rowlist = []
    # Iterating over rows
    for ind, row in df.iterrows():
        # list in question
        data = row['Final_Column']
        # data count in the list
        datacount = len(data)
        if datacount < 2:
            # we pass the row as is and continue iteration
            rowlist.append(row)
            continue
        for dat in data:
            # make a duplicate
            newrow = row.copy()
            # change the relevant column with extracted data
            newrow['Final_Column'] = dat
            rowlist.append(newrow)
    # Create final dataframe
    finaldf = pandas.DataFrame(rowlist)
    # Reset the index, note that drop=True drops the old index instead of
    # creating a new column and moving it there.
    finaldf.reset_index(inplace=True, drop=True)
    print(finaldf)
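One caveat worth flagging: eval will execute arbitrary Python found in the CSV, so for anything beyond a throwaway test you may prefer ast.literal_eval, which only parses literals. A drop-in sketch for the converter above:

import ast

# Safer converter sketch: ast.literal_eval parses Python literal
# syntax (dicts, lists, strings, numbers) but refuses to execute code.
def fincolconverter(string):
    return ast.literal_eval(string)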