我有一个看起来像这样的数据框
id1 id2 attr
------------------
11 a
11 a
11 a
11 b
11 c
22 a
22 a
22 a
22 a
33 d
44 e
我希望它变成下面这样,其中 id1 和 id2 两列的值是计数(频率):
id1 id2 attr
------------------
2 a
1 a
1 b
1 c
2 a
2 a
1 d
1 e
空白处没有值,如果需要,我可以用 NA 填充。我尝试过使用聚合函数,但无法得到所需的输出。感谢您的帮助。
答案 0 :(得分:3)
这是您的数据
# Example data: each row carries its id in exactly one of id1/id2 (the other
# is NA), together with a factor attribute.
dat <- data.frame(
  id1 = c(11L, 11L, NA, NA, NA, 22L, 22L, NA, NA, 33L, 44L),
  id2 = c(NA, NA, 11L, 11L, 11L, NA, NA, 22L, 22L, NA, NA),
  attr = factor(
    c("a", "a", "a", "b", "c", "a", "a", "a", "a", "d", "e"),
    levels = c("a", "b", "c", "d", "e")
  )
)
所需的输出格式并不常见,但使用 'plyr' 包似乎可以实现:
library(plyr)
# Count how often each distinct (id1, id2, attr) row occurs. Calling count()
# on the whole data frame returns one row per combination plus a `freq`
# column, so no per-group transform()/unique() step is needed. Counting
# inside transform() only works when the per-group count table happens to
# recycle cleanly against the group's rows and to share their order.
temp <- count(dat, c("id1", "id2", "attr"))
# Recover the original id (11, 22, ...) from whichever of id1/id2 holds it
temp$id <- pmax(temp$id1, temp$id2, na.rm = TRUE)
# Sort by id, then attr; on ties put the row whose id came from id1 first so
# the result does not depend on the row order returned by count()
temp <- temp[order(temp$id, temp$attr, is.na(temp$id1)), ]
# Overwrite the populated id column with the frequency; keep NA elsewhere
temp$id1 <- ifelse(is.na(temp$id1), NA, temp$freq)
temp$id2 <- ifelse(is.na(temp$id2), NA, temp$freq)
# Keep only the requested columns, selected by name rather than position
temp <- temp[c("id1", "id2", "attr")]
temp
id1 id2 attr
1 2 NA a
7 NA 1 a
8 NA 1 b
9 NA 1 c
3 2 NA a
10 NA 2 a
5 1 NA d
6 1 NA e
答案 1 :(得分:2)
使用 @jfreels 答案中的 dat,以及 dplyr 中的 tally:
library(dplyr)
# Frequency of every distinct (id1, id2, attr) combination, stored in column n
dat1 <- tally(group_by(dat, id1, id2, attr))
# Distinct rows of the input, in order of first appearance
dat2 <- unique(dat)
# Attach the counts to the distinct rows, then overwrite whichever id column
# is populated with the count and drop the helper column
dat2 %>%
  left_join(dat1) %>%
  mutate(
    id1 = ifelse(is.na(id1), NA, n),
    id2 = ifelse(is.na(id2), NA, n)
  ) %>%
  select(-n)
#Joining by: c("id1", "id2", "attr")
# id1 id2 attr
#1 2 NA a
#2 NA 1 a
#3 NA 1 b
#4 NA 1 c
#5 2 NA a
#6 NA 2 a
#7 1 NA d
#8 1 NA e
答案 2 :(得分:1)
此方法的结果并不是您想要的格式,但可能更容易理解。
# dplyr provides group_by() and tally()
library(dplyr)
# Example data: the id sits in either id1 or id2, the other column is NA
dat <- data.frame(
  id1 = c(11L, 11L, NA, NA, NA, 22L, 22L, NA, NA, 33L, 44L),
  id2 = c(NA, NA, 11L, 11L, 11L, NA, NA, 22L, 22L, NA, NA),
  attr = factor(
    c("a", "a", "a", "b", "c", "a", "a", "a", "a", "d", "e"),
    levels = c("a", "b", "c", "d", "e")
  )
)
# Number of observations for each (id1, id2, attr) combination
tally(group_by(dat, id1, id2, attr))
# output
Source: local data frame [8 x 4]
Groups: id1, id2
id1 id2 attr n
1 11 NA a 2
2 22 NA a 2
3 33 NA d 1
4 44 NA e 1
5 NA 11 a 1
6 NA 11 b 1
7 NA 11 c 1
8 NA 22 a 2
答案 3 :(得分:0)
请原谅我拙劣的 R 代码:为了实现您想要的效果,我不得不采用一些非常规的做法。遗憾的是,这段代码的可扩展性不高,肯定还有改进的空间,但它能生成示例中的输出。唯一的区别是,我假设您的输入数据在空白处的值为 NA。
# Build a frequency table of the rows of `rawdata`, placing each row's count
# in the id column (id1 or id2) that was non-NA in that row.
# NOTE(review): `rawdata` is not defined in this snippet; presumably it is the
# input data frame with NA in the blank cells -- confirm.
#
# Paste each de-duplicated row into a single "-"-separated key string
unique.pasted<-apply(rawdata[!duplicated(rawdata),],1,paste,collapse="-")
# Paste every row (duplicates included) into the same kind of key string
pasted.rows<-apply(rawdata,1,paste,collapse="-")
# Tabulate the keys, then index by the unique keys so the counts follow the
# order of the de-duplicated rows
frequencies<-table(pasted.rows)[unique.pasted]
# Make one copy of the counts per id column and blank out (NA) the entries
# whose corresponding id column is NA in the de-duplicated rows
id1.freq<-frequencies
id1.freq[is.na(rawdata[!duplicated(rawdata),"id1"])]<-NA
id2.freq<-frequencies
id2.freq[is.na(rawdata[!duplicated(rawdata),"id2"])]<-NA
# Combine the two count columns with the attr values of the de-duplicated rows
final.table<-data.frame(id1=id1.freq,id2=id2.freq,attr=rawdata[!duplicated(rawdata),"attr"])
# Reset row names to the default sequence
row.names(final.table)<-NULL
# Replace NA with empty strings
final.table[is.na(final.table)]<-""
final.table
id1 id2 attr
1 2 a
2 1 a
3 1 b
4 1 c
5 2 a
6 2 a
7 1 d
8 1 e