I have a CSV file in the following format:
id @ word @ information @ other information
Sometimes, the value in the second column repeats:
001 @ cat @ makes a great pet @ mice
002 @ rat @ makes a great friend @ cheese
003 @ dog @ can guard the house @ chicken
004 @ cat @ can jump very high @ fish
As you can see, the first and last rows have duplicate data in the second column. I want to remove these duplicates (when the second column is exactly the same) and merge the information contained in the third column, as well as the information contained in the fourth column, like this:
001 @ cat @ ① makes a great pet ② can jump very high @ ① mice ② fish
002 @ rat @ makes a great friend @ cheese
003 @ dog @ can guard the house @ chicken
How can I merge the data in the cells so that all of the data from the third column ends up in a single cell, and the data from the fourth column in another?
Answer 0 (score: 5)
The task described is rather tricky and cannot be done without some handiwork. Using the technique described by mouviciel, I have a solution.
Here is funkychicken.awk:
BEGIN { FS = "@" }

# Join the first `len` entries of `values` with `sep`. The extra
# parameters after `len` are the usual awk idiom for local variables,
# so `result` does not leak between calls.
function joinArray(values, sep, len,    result, actualSep, i) {
    actualSep = "";
    for (i = 1; i <= len; i++) {
        result = result actualSep values[i];
        actualSep = sep;
    }
    return result;
}

function resetFunkyToken() {
    ftok = 0;
}

function funkyToken() {
    return "(" (++ftok) ")";
}

# Strip trailing spaces.
function trim(text) {
    sub(/ *$/, "", text);
    return text;
}

{
    if ($2 in data) {
        # Second occurrence of this word: number the stored fields 3
        # and 4 and append the current row's fields to them.
        resetFunkyToken();
        split(data[$2], existingValues, "@");
        for (f = 3; f <= 4; f++)
            existingValues[f] = " " funkyToken() trim(existingValues[f]) " " funkyToken() $f;
        data[$2] = joinArray(existingValues, "@", NF);
    }
    else {
        data[$2] = $0;
    }
}

END {
    for (item in data)
        print data[item];
}
Then run funkychicken.awk on the data as follows and sort the output:
$ awk -f funkychicken.awk data.txt | sort
001 @ cat @ (1) makes a great pet (2) can jump very high @ (3) mice (4) fish
002 @ rat @ makes a great friend @ cheese
003 @ dog @ can guard the house @ chicken
Instead of using your funky tokens ①②③④⑤⑥⑦⑧⑨⑩, I went with the less funky (1) (2) ....
Answer 1 (score: 4)
First, use sort to order the lines on the second column. Second, use awk to print consecutive lines that share the same second column as a single line, with the third and fourth columns joined as required. A sketch of this approach is shown below.
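A minimal sketch of that pipeline, to make the idea concrete (this is an illustration under assumptions, not the answerer's own code: it hard-codes the id @ word @ info @ other layout from the question, assumes the input file is called data.txt, and borrows the plain (1) (2) numbering used by the other answers):

sort -t@ -k2,2 data.txt |
awk -F' *@ *' -v OFS=' @ ' '
$2 != prev {                  # new word: flush the previous group
    if (prev != "") print id, prev, c3, c4
    id = $1; prev = $2; c3 = $3; c4 = $4; n = 1
    next
}
{                             # duplicate word: append fields 3 and 4 with (n) markers
    if (n == 1) { c3 = "(1) " c3; c4 = "(1) " c4 }
    n++
    c3 = c3 " (" n ") " $3
    c4 = c4 " (" n ") " $4
}
END { if (prev != "") print id, prev, c3, c4 }' |
sort -n

The trailing sort -n restores the original id order after the grouping by word.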
Answer 2 (score: 4)
I work in Ruby (doing this in bash would be a bit painful).
First I wrote a spec describing the problem:
require 'rubygems'
require 'rspec'
require './chew'

describe 'indentation' do
  it "should calculate appropriate padding (minimum 3)" do
    indentation(1).should == 3
    indentation(99).should == 3
    indentation(999).should == 3
    indentation(1000).should == 4
    indentation(1500).should == 4
    indentation(10000).should == 5
  end
end

describe 'chew' do
  it "should merge duplicate entries in a csv file" do
    input = <<-TEXT
001 @ cat @ makes a great pet @ mice
002 @ rat @ makes a great friend @ cheese
003 @ dog @ can guard the house @ chicken
004 @ cat @ can jump very high @ fish
    TEXT
    output = <<-TEXT
001 @ cat @ (1) makes a great pet (2) can jump very high @ (1) mice (2) fish
002 @ rat @ makes a great friend @ cheese
003 @ dog @ can guard the house @ chicken
    TEXT
    chew(input).should == output
  end
end
And here is a solution:
#!/usr/bin/env ruby

def merged_values(values)
  return values[0] if values.size == 1
  merged = []
  values.each_with_index do |value, i|
    merged << "(#{i+1}) #{value}"
  end
  merged.join(" ")
end

# Zero-padding width for the row numbers: at least 3 digits.
def indentation(count)
  [3, Math.log10(count) + 1].max.to_i
end

def chew(input)
  # For each word, collect the column 3 and column 4 values.
  records = Hash.new { |hash, key| hash[key] = [[], []] }
  input.split(/\n/).each do |row|
    _row_number, key, first_value, second_value = row.split(/\s*@\s*/)
    records[key][0] << first_value
    records[key][1] << second_value
  end
  row_number_format = "%0.#{indentation(records.size)}d"
  result = ""
  records.each_with_index do |record, i|
    key, values = record
    result << [
      row_number_format % (i + 1),
      key,
      merged_values(values[0]),
      merged_values(values[1])
    ].join(" @ ") << "\n"
  end
  result
end

if $0 == __FILE__
  abort "usage: ruby chew.rb input_file" unless ARGV.size == 1
  puts chew(File.read(ARGV[0]))
end
I went with the simpler numbering scheme, because what happens if there are more than 50 values to merge? http://en.wikipedia.org/wiki/Enclosed_alphanumerics
I also took the liberty of increasing the padding on the left when there are a large number of records.
Answer 3 (score: 3)
Ah, you want to flatten multiple records into one. I have a python script to do just that, available here. That version is set up to convert Excel files to csv, along with some other things specific to that use case. For you, I would do it like this:
import os
import sys
import csv
import argparse
from collections import defaultdict
from itertools import izip_longest

def getunique(reader, uniqueFields, mergeFields):
    """Find all unique rows in the csv file, based on the unique fields given."""
    rows = defaultdict(list)
    for row in reader:
        unique = '|'.join([row[f] for f in reader.fieldnames if f in uniqueFields])
        merge = [row[f] for f in reader.fieldnames if f in mergeFields]
        rows[unique].append(merge)
    return rows

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Process a csv file, converting multiple rows to one.', version='%(prog)s 1.0')
    parser.add_argument('infile', type=str, help='excel input file')
    args = parser.parse_args()
    reader = csv.DictReader(open(args.infile, "rb"), dialect='excel')
    uniqueFields = []
    mergeFields = []
    for field in reader.fieldnames:
        tmp = raw_input("Is field {0} a: \nunique field? (1)\nignored field? (2)\nmerged field? (3)\n>> ".format(field))
        if tmp == '1':
            uniqueFields.append(field)
        elif tmp == '2':
            pass
        else:
            mergeFields.append(field)
    unique = getunique(reader, uniqueFields, mergeFields)
    fieldnames = uniqueFields
    # Add numbered copies of each merged field: field_1, field_2, ...
    lengths = [len(merge) for merge in unique.itervalues()]
    for i in range(1, max(lengths) + 1):
        fieldnames.extend(['_'.join((field, str(i))) for field in mergeFields])
    writer = csv.DictWriter(open("export.csv", "wb"), fieldnames, dialect='excel')
    writer.writeheader()
    for unique, merge in unique.iteritems():
        currData = unique.split("|")
        for drug in merge:
            currData.extend(drug)
        currRow = izip_longest(fieldnames, currData, fillvalue='')
        writer.writerow(dict(currRow))
    ## clean up and finishing section
    del reader
    del writer
Edit: This second version doesn't add extra fields, and inserts the requested (1) markers. However, it implicitly assumes that the id field is ignored and replaced by the entry's current position in the (unsorted) dictionary. That could of course be changed, but there is no information about which of the ids fits a row merged on the same field 2. It also assumes the id field is called id.
import os
import sys
import csv
import argparse
from collections import defaultdict
from itertools import izip_longest

def getunique(reader, uniqueFields, mergeFields):
    """Find all unique rows in the csv file, based on the unique fields given."""
    rows = defaultdict(list)
    for row in reader:
        unique = '|'.join([row[f] for f in reader.fieldnames if f in uniqueFields])
        merge = [(f, row[f]) for f in reader.fieldnames if f in mergeFields]
        rows[unique].append(merge)
    return rows

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Process a csv file, converting multiple rows to one.', version='%(prog)s 1.0')
    parser.add_argument('infile', type=str, help='excel input file')
    args = parser.parse_args()
    reader = csv.DictReader(open(args.infile, "rb"), dialect='excel')
    uniqueFields = []
    mergeFields = []
    for field in reader.fieldnames:
        tmp = raw_input("Is field {0} a: \nunique field? (1)\nignored field? (2)\nmerged field? (3)\n>> ".format(field))
        if tmp == '1':
            uniqueFields.append(field)
        elif tmp == '2':
            pass
        else:
            mergeFields.append(field)
    unique = getunique(reader, uniqueFields, mergeFields)
    writer = csv.DictWriter(open("export.csv", "wb"), reader.fieldnames, dialect='excel')
    writer.writeheader()
    for rowID, (unique, merge) in enumerate(unique.iteritems()):
        currData = defaultdict(list)
        # Restore the unique field values from the joined key.
        for field, data in izip_longest(uniqueFields, unique.split("|"), fillvalue=''):
            currData[field].append(data)
        # Tag each merged row's values with (1), (2), ... markers
        # (skipped when there is nothing to merge).
        for n, rowdata in enumerate(merge):
            for field, value in rowdata:
                marker = "({0}) ".format(n + 1) if len(merge) > 1 else ""
                currData[field].append(marker + value)
        # The id field is replaced by the entry's position in the dict.
        currData['id'] = [str(rowID + 1)]
        currRow = {}
        for key, value in currData.iteritems():
            currRow[key] = ' '.join(value)
        writer.writerow(currRow)
    ## clean up and finishing section
    del reader
    del writer
Answer 4 (score: 3)
This might work for you:
sort -k3,3 -k1,1n file |
sed ':a;$!N;s/^\(\S*\s\)\(@[^@]*@\)\( +\)*\([^@]*\)@\( +\)*\([^\n]*\)\n\S*\s\2\([^@]*@\)\(.*\)/\1\2 +\4+\7 +\6 +\8/;ta;P;D' |
sort -n |
awk '{for(i=1;i<=NF;i++){if($i=="@")n=0;if($i=="+")$i="("++n")"}}1'
001 @ cat @ (1) makes a great pet (2) can jump very high @ (1) mice (2) fish
002 @ rat @ makes a great friend @ cheese
003 @ dog @ can guard the house @ chicken
Explanation:
The first sort brings lines with the same second column together (breaking ties numerically on the id). sed then merges columns 3 and 4 of adjacent duplicate lines, using + as a field marker. sort -n restores the original id order, and the final awk pass converts the + field markers into (1), (2), ... numbers.

Answer 5 (score: 1)
Here is a shorter solution in Ruby. (This script requires Ruby 1.9; it does not work with Ruby 1.8.)
filename = "filename.txt" # change as appropriate

# Per-word lists of column 3 and column 4 values, plus the first id seen.
info, other = 2.times.map { Hash.new { |h, k| h[k] = [] } }
ids = {}

File.readlines(filename).each do |line|
  id, word, i, o = line.split("@").map(&:strip)
  info[word] << i
  other[word] << o
  ids[word] ||= id
end

ids.sort_by { |k, v| v }.each do |(word, id)|
  i = info[word].size > 1 ? (info[word].map.with_index { |x, idx| "(#{idx+1}) #{x}" }.join(" ")) : info[word].first
  o = other[word].size > 1 ? (other[word].map.with_index { |x, idx| "(#{idx+1}) #{x}" }.join(" ")) : other[word].first
  puts "#{id} @ #{word} @ #{i} @ #{o}"
end
Someone commented that parsing a CSV file is not as simple as splitting on a delimiter, but the format you show in your question is not CSV. I am following the format you showed in the question.
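To illustrate that commenter's point (a made-up one-liner, not part of the script above): in real CSV a quoted field may itself contain the delimiter, so naive splitting miscounts the fields:

$ echo 'id,"a, quoted field",x' | awk -F, '{ print NF }'
4

A CSV-aware parser would report three fields here, since the second comma sits inside quotes.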