我有一个如下数据框
df<- structure(list(s1 = structure(1:3, .Label = c("3-4", "4-1", "5-4"
), class = "factor"), s2 = structure(1:3, .Label = c("2-4", "3-15",
"7-16"), class = "factor")), .Names = c("s1", "s2"), row.names = c(NA,
-3L), class = "data.frame")
如下所示
> df
# s1 s2
#1 3-4 2-4
#2 4-1 3-15
#3 5-4 7-16
我想要做的是首先搜索并找到之后类似的值 - 例如,这里4位于s1的第一行,s2的第一行和s1的第三行
- 第二列表示找到这些值的次数
- 第三列显示其中有多少来自df的第一列
- 第四列显示了其中有多少来自df的第二列
- 第五个是来自第一列的字符串
- 第六个是来自第二列的字符串
输出看起来像这样
Value repeated s1N s1N ss1 ss2
4 3 2 1 3,5 2
1 1 1 - 4 -
15 1 - 1 - 3
16 1 - 1 - 7
答案 0 :(得分:1)
您需要做的第一件事是从字符串中提取数字。运行:
newdfstring <- apply(df,1:2, function(v) unlist(strsplit(v,"-")))
newdf <- apply(newdfstring,1:3, as.numeric)
在第一行中分割字符串,并在第二行中将它们转换为数值。结果是一个三维矩阵,您可以使用它来提取您的值。
首先创建一个新的数据框:
#length of the columns in the new frame = number of unique values
dflength <- length(unique(array(newdf[2,,])))
dfout <- data.frame(Value=rep(0,dflength),repeated=rep(0,dflength),s1N=rep(0,dflength),s2N=rep(0,dflength),ss1=rep(0,dflength),ss2=rep(0,dflength))
最明显的方式(但可能不是最有效的方法)就是循环匹配你需要的任何东西:
dfout$Value <- unique(array(newdf[2,,]))
for(i in 1:dflength){
getID <- which(as.data.frame(table(newdf[2,,]))$Var1==dfout$Value[i])
dfout$repeated[i] <- as.data.frame(table(newdf[2,,]))$Freq[getID]
dfout$s1N[i] <- as.data.frame(table(newdf[2,,1]))$Freq[getID]
if(is.na(dfout$s1N[i])){
dfout$s1N[i] <- 0
}
dfout$s2N[i] <- as.data.frame(table(newdf[2,,2]))$Freq[getID]
if(is.na(dfout$s2N[i])){
dfout$s2N[i] <- 0
}
getID <- which(newdf[2,,1]==dfout$Value[i])
if(length(getID)>0){
dfout$ss1[i] <- toString(newdf[1,,1][getID])
} else {
dfout$ss1[i] <- 0
}
getID <- which(newdf[2,,2]==dfout$Value[i])
if(length(getID)>0){
dfout$ss2[i] <- toString(newdf[1,,2][getID])
} else {
dfout$ss2[i] <- 0
}
}
dfout
# Value repeated s1N s2N ss1 ss2
#1 4 3 2 1 3, 5 2
#2 1 1 1 1 4 0
#3 15 1 0 1 0 3
#4 16 1 0 0 0 7
编辑以循环n个值的值
newdfstring <- apply(df,1:2, function(v) unlist(strsplit(v,"-")))
newdf <- apply(newdfstring,1:3, as.numeric)
dflength <- length(unique(array(newdf[2,,])))
#find the number of s variables
slength <- length(newdf[1,1,])
#create a matrix of appropriate size
dfout <- matrix(data=NA,nrow=dflength,ncol=(2+2*slength))
#create a (near)-empty names array, we will fill it in later
names <- c("Value","repeated",rep("",2*slength))
#fill in the Values column
dfout[,1] <- unique(array(newdf[2,,]))
#loop for every s variable
for(j in 1:slength){
#get their names, paste N or s and add them to the names array
names[2+j] <- paste(names(df)[j],"N",sep="")
names[2+j+slength] <- paste("s",names(df)[j],sep="")
#loop to get the other values
for(i in 1:dflength){
getID <- which(as.data.frame(table(newdf[2,,]))$Var1==dfout[i,1])
dfout[i,2] <- as.data.frame(table(newdf[2,,]))$Freq[getID]
dfout[i,2+j] <- as.data.frame(table(newdf[2,,j]))$Freq[getID]
if(is.na(dfout[i,2+j])){
dfout[i,2+j] <- 0
}
getID <- which(newdf[2,,j]==dfout[i,1])
if(length(getID)>0){
dfout[i,2+j+slength] <- toString(newdf[1,,j][getID])
} else {
dfout[i,2+j+slength] <- 0
}
}
}
colnames(dfout)<-names
as.data.frame(dfout)
# Value repeated s1N s2N ss1 ss2
#1 4 3 2 1 3, 5 2
#2 1 1 1 1 4 0
#3 15 1 0 1 0 3
#4 16 1 0 0 0 7
答案 1 :(得分:1)
出乎意料的难题。将其分解为几个合乎逻辑的步骤是很好的:
## 1: split into (val,ss) pairs, and capture ci (column index) association
res <- setNames(do.call(rbind,lapply(seq_along(df),function(ci)
do.call(rbind,lapply(strsplit(as.character(df[[ci]]),'-'),function(x)
data.frame(x[2L],x[1L],ci,stringsAsFactors=F)
))
)),c('val','ss','ci'));
res;
## val ss ci
## 1 4 3 1
## 2 1 4 1
## 3 4 5 1
## 4 4 2 2
## 5 15 3 2
## 6 16 7 2
## 2: aggregate ss (joining on comma) by (val,ci), and capture record count as n
res <- do.call(rbind,by(res,res[c('val','ci')],function(x)
data.frame(val=x$val[1L],ci=x$ci[1L],n=nrow(x),ss=paste(x$ss,collapse=','),stringsAsFactors=F)
));
res;
## val ci n ss
## 1 1 1 1 4
## 2 4 1 2 3,5
## 3 15 2 1 3
## 4 16 2 1 7
## 5 4 2 1 2
## 3: reshape to wide format
res <- reshape(res,idvar='val',timevar='ci',dir='w');
res;
## val n.1 ss.1 n.2 ss.2
## 1 1 1 4 NA <NA>
## 2 4 2 3,5 1 2
## 3 15 NA <NA> 1 3
## 4 16 NA <NA> 1 7
## 4: add repeated column; can be calculated by summing all n.* columns
## note: leveraging psum() from <http://stackoverflow.com/questions/12139431/add-variables-whilst-ignoring-nas-using-transform-function>
psum <- function(...,na.rm=F) { x <- list(...); rowSums(matrix(unlist(x),ncol=length(x)),na.rm=na.rm); };
res$repeated <- do.call(psum,c(res[grep('^n\\.[0-9]+$',names(res))],na.rm=T));
res;
## val n.1 ss.1 n.2 ss.2 repeated
## 1 1 1 4 NA <NA> 1
## 2 4 2 3,5 1 2 3
## 3 15 NA <NA> 1 3 1
## 4 16 NA <NA> 1 7 1
关于NA,如果需要,可以在以后修复它们。但是,我建议n.*
列的正确类型是整数,因为它们代表计数,因此使用'-'
(如示例输出中)来表示空单元格是不合适的。我会建议零。破折号适用于ss.*
列,因为它们是字符串。以下是如何做到这一点:
n.cis <- grep('^n\\.[0-9]+$',names(res));
ss.cis <- grep('^ss\\.[0-9]+$',names(res));
res[n.cis][is.na(res[n.cis])] <- 0L;
res[ss.cis][is.na(res[ss.cis])] <- '-';
res;
## val n.1 ss.1 n.2 ss.2 repeated
## 1 1 1 4 0 - 1
## 2 4 2 3,5 1 2 3
## 3 15 0 - 1 3 1
## 4 16 0 - 1 7 1
答案 2 :(得分:1)
我已根据您的实际数据完全重写了所有代码,并且已在我的机器上对其进行了测试。因为它是一个非常大的数据帧,所以运行需要一些时间,在我看来,循环是不可避免的。
# function to split the strings
myfun<-function(x){
x<-strsplit(as.character(x), '-')
x1<-unlist(x)
x.new<-as.data.frame(matrix(x1, byrow = T, length(x)))
return(x.new)
}
# this returns a list of dataframes
list.v<-lapply(df[1:dim(df)[2]], myfun)
# like this
head(list.v[[17]])
# try to combine all the dfs, produced an error of mismatching # of columns
df2<-do.call(rbind, list.v)
# some of the dfs in list.v are all NA's, they should be dropped
sum<-summary(list.v)
list.v<-list.v[-which(sum[,1] != "2")] # this excludes those all-NA datafrmes in list.v
# now combine all dfs for indexing purposes
df2<-do.call(rbind, list.v)
# create "value", "repeated" column in the desired result df.
# These codes are same as my previous answer
value<-names(table(df2[,2]))
repeated<-as.vector(table(df2[,2]))
# create an empty list to store the counts columns
list.count<-vector("list", length = length(list.v))
# every df in list.v has same number of rows, get the row number
rownum<-nrow(list.v[[1]])
# use a for loop to fill out list.count
for(i in 0:(length(list.count)-1)){
row.start<-i*rownum+1 # it is kind of tricky here
row.end<-(i+1)*rownum # same as above
list.count[[i + 1]]<-as.vector(table(df2[,2][row.start:row.end]))
}
# combine the vectors in list.count and assing names
count.df<-do.call(cbind, list.count)
count.df<-as.data.frame(count.df)
# create & assign colum names in the format of "s_n", and "_" is filled with corresponding original column name
names.cnt<-character()
for(i in 1:length(names(list.v))){
names.cnt[i]<-paste("s", names(list.v)[i], "n", sep="")
}
names(count.df)<-names.cnt
# this is a very long loop to concatenate the strings and store them into a matrix, but it gets the job done here.
ss.store<-matrix(,nrow = length(value), ncol = length(list.v), byrow = FALSE)
for(i in 1:length(list.v)){
for(j in 1:length(value)){
ss.store[j,i]<-paste(list.v[[i]][,1][which(list.v[[i]][,2] == value[j])], collapse =",")
}
}
# create a df for strings
string.df<-as.data.frame(ss.store, stringsAsFactors = FALSE)
# create & assign names to the df
names.str<-character()
for(i in 1:length(names(list.v))){
names.str[i]<-paste("s", "s", names(list.v)[i], sep="")
}
names(string.df)<-names.str
# combine everything and form the new data frame
new.df<-cbind(value, repeated, count.df, string.df, stringAsFactors = FALSE)
new.df[1:10, 1:15]
value repeated sAn sF1n sF2n sF3n sF4n sF5n sF6n sF7n sF8n sF9n sF10n sF11n sF12n
1 100 155 3 0 0 0 0 0 0 0 0 0 0 0 0
2 1005 14 1 0 0 0 0 0 0 0 0 0 0 0 0
3 1006 50 1 0 0 0 0 0 0 0 0 0 0 0 0
4 1023 1 1 0 0 0 0 0 0 0 0 0 0 0 0
5 1025 38 1 0 0 0 0 0 0 0 0 0 0 0 0
6 1030 624 1 0 1 2 0 0 0 0 0 0 1 0 0
7 1035 1 1 0 0 0 0 0 0 0 0 0 0 0 0
8 104 165 2 0 0 0 0 0 0 0 0 0 0 0 0
9 1076 186 2 0 0 0 0 0 0 0 0 0 0 0 0
10 1078 333 3 0 0 0 0 0 0 0 0 0 0 0 0
答案 3 :(得分:1)
df <-
structure(
list(
s1 = structure(1:3, .Label = c("3-4", "4-1", "5-4"), class = "factor"),
s2 = structure(1:3, .Label = c("2-4", "3-15", "7-16"), class = "factor"
)
), .Names = c("s1", "s2"), row.names = c(NA,-3L), class = "data.frame"
)
library(tidyr)
library(dplyr)
# Split columns at "-" and add to data.frame
splitCols <- function(df) {
new_headers <- paste("s1", c("1st", "2nd"), sep = "_")
split_1 <- (separate(df, s1, into = new_headers, sep = "-"))[,new_headers]
split_1$s1_1st <- as.integer(split_1$s1_1st)
split_1$s1_2nd <- as.integer(split_1$s1_2nd)
new_headers <- paste("s2", c("1st", "2nd"), sep = "_")
split_2 <- (separate(df, s2, into = new_headers, sep = "-"))[,new_headers]
split_2$s2_1st <- as.integer(split_2$s2_1st)
split_2$s2_2nd <- as.integer(split_2$s2_2nd)
cbind(df, split_1, split_2)
}
# given a df outputted from splitCols return final df
analyzeDF <- function(df) {
target_vals <- unique(c(df$s1_2nd, df$s2_2nd)) # for each uniq val compute stuff
out_df <- data.frame(Value = integer(0),
repeated = integer(0),
s1N = integer(0),
s2N = integer(0),
ss1 = character(0),
ss2 = character(0))
# iterate through target_vals, create a row of output,
# and append to out_df
for (val in target_vals) {
s1_match <- val == df$s1_2nd
s2_match <- val == df$s2_2nd
total_cnt <- sum(s1_match, s2_match)
s1_firstcol <- paste(df$s1_1st[s1_match], collapse = ",")
s2_firstcol <- paste(df$s2_1st[s2_match], collapse = ",")
# coerce empty string to "-"
if (s1_firstcol == "") s1_firstcol <- "-"
if (s2_firstcol == "") s2_firstcol <- "-"
row_df <- data.frame(Value = val,
repeated = total_cnt,
s1N = sum(s1_match),
s2N = sum(s2_match),
ss1 = s1_firstcol,
ss2 = s2_firstcol)
out_df <- rbind(out_df, row_df)
}
return(out_df)
}
(df_split <- splitCols(df))
analyzeDF(df_split)
## Value repeated s1N s2N ss1 ss2
## 1 4 3 2 1 3,5 2
## 2 1 1 1 0 4 -
## 3 15 1 0 1 - 3
## 4 16 1 0 1 - 7