首先让一些随机数据
A <- c(1:5)
score_one <- c(123.5, 223.1, 242.2, 351.8, 123.1)
score_two <- c(324.2, 568.2, 124.9, 323.1, 213.4)
score_three <- c(553.1, 412.3, 435.7, 523.1, 365.4)
score_four <- c(123.2, 225.1, 243.6, 741.1, 951.2)
df1 <- data.frame(A, score_one, score_two, score_three, score_four)
library(dplyr)
library(tidyr)
df2 <- df1 %>%
group_by(A) %>%
mutate_each(funs(substr(.,1,1))) %>%
ungroup %>%
gather(variable, type, -c(A)) %>%
select(-variable) %>%
mutate(type = paste0("type_",type),
value = 1) %>%
group_by(A,type) %>%
summarise(value = sum(value)) %>%
ungroup %>%
spread(type, value, fill=0) %>%
inner_join(df1, by=c("A")) %>%
select(A, starts_with("score_"), starts_with("type_"))
这为每个score_
引入了一个摘要变量
并计算每个唯一第一位数
因此我们在第一行看到,type_1 == 2.因为在相应的score_
列中,我们有2次出现,其中数字1是第一个数字
问题陈述
现在我们要引入一个调用type_n
列的变量。
score_
列/ s type_n == 0
,我们要分配0 $type_n_G2
这样所需的输出应该看起来像1
以type_1_G2
type_1 == 2
score_one
和score_four
type_1_G2==1
答案 0 :(得分:7)
在我看来,df1
并不需要复杂的构造。将重新整形的data.table
转换为长格式是一个更好的起点,可以用更少的步骤获得所需的最终结果。
使用library(data.table)
# melting the original dataframe 'df1' to a long format datatable
dt <- melt(setDT(df1), "A")
# creating two type variables & a logical vector indicating whether
# the decimal for a specific type is equal or above .2
dt[, `:=` (type1=paste0("type_",substr(value,1,1)),
type2=paste0("type_",substr(value,1,1),"_g2"))
][, g2 := +(+(value - floor(value) >= 0.2)==1), .(A,type1)]
# creating separate wide datatables for the variable & two type columns
dt1 <- dcast(dt, A ~ variable)
dt2 <- dcast(dt, A ~ type1)
dt3 <- dcast(dt, A ~ type2, fun=sum, value.var="g2")[, lapply(.SD, function(x) +(x>=1)), A]
# two options for merging the wide datatables together into one
dtres <- dt1[dt2[dt3, on = "A"], on = "A"]
dtres <- Reduce(function(...) merge(..., all = TRUE, by = "A"), list(dt1, dt2, dt3))
# or in one go without creating intermediate datatables
dtres <- dcast(dt, A ~ variable)[dcast(dt, A ~ type1)[dcast(dt, A ~ type2, fun=sum, value.var = "g2")[, lapply(.SD, function(x) +(x>=1)) , A], on = "A"], on = "A"]
包的方法:
> dtres
A score_one score_two score_three score_four type_1 type_2 type_3 type_4 type_5 type_7 type_9 type_1_g2 type_2_g2 type_3_g2 type_4_g2 type_5_g2 type_7_g2 type_9_g2
1: 1 123.5 324.2 553.1 123.2 2 0 1 0 1 0 0 1 0 0 0 0 0 0
2: 2 223.1 568.2 412.3 225.1 0 2 0 1 1 0 0 0 0 0 1 1 0 0
3: 3 242.2 124.9 435.7 243.6 1 2 0 1 0 0 0 1 1 0 1 0 0 0
4: 4 351.8 323.1 523.1 741.1 0 0 2 0 1 1 0 0 0 1 0 0 0 0
5: 5 123.1 213.4 365.4 951.2 1 1 1 0 0 0 1 0 1 1 0 0 0 1
这导致:
dplyr
此方法可以转换为tidyr
/ library(dplyr)
library(tidyr)
df <- df1 %>% gather(variable, value,-A) %>%
mutate(type1 = paste0("type_",substr(value,1,1)),
type2 = paste0("type_",substr(value,1,1),"_g2")) %>%
group_by(A,type1) %>%
mutate(g2 = +(+(value - floor(value) >= 0.2)==1),
type1n = n()) %>%
ungroup()
d1 <- df %>% select(1:3) %>% spread(variable, value)
d2 <- df %>% group_by(A, type1) %>% tally() %>% spread(type1, n, fill=0)
d3 <- df %>% group_by(A, type2) %>% summarise(g = any(g2==1)) %>% spread(type2, g, fill=0)
dfres <- left_join(d1, d2, by = "A") %>% left_join(., d3, by = "A")
实现,如下所示:
> dfres
A score_one score_two score_three score_four type_1 type_2 type_3 type_4 type_5 type_7 type_9 type_1_g2 type_2_g2 type_3_g2 type_4_g2 type_5_g2 type_7_g2 type_9_g2
1 1 123.5 324.2 553.1 123.2 2 0 1 0 1 0 0 1 0 0 0 0 0 0
2 2 223.1 568.2 412.3 225.1 0 2 0 1 1 0 0 0 0 0 1 1 0 0
3 3 242.2 124.9 435.7 243.6 1 2 0 1 0 0 0 1 1 0 1 0 0 0
4 4 351.8 323.1 523.1 741.1 0 0 2 0 1 1 0 0 0 1 0 0 0 0
5 5 123.1 213.4 365.4 951.2 1 1 1 0 0 0 1 0 1 1 0 0 0 1
给出相同的结果:
{
Title: "Cheese Fondling",
Body: "I love cheese, especially paneer mozzarella. Roquefort cheeseburger cut the cheese fondue edam taleggio cheese slices gouda. Dolcelatte croque monsieur cottage cheese camembert de normandie cheese slices st. agur blue cheese bavarian bergkase swiss. Edam cheesecake parmesan.",
}
答案 1 :(得分:5)
以下是使用melt
包对数据进行首次dcast
然后data.table
的向量化尝试。它需要一些润色,但我现在没有时间
library(data.table) # v >= 1.9.6
# melt and order by "A"
temp <- setorder(melt(df2, id = 1:5), A)
# Create the "type_n_G2" column names
temp$Var <- paste0(temp$variable, "_G2")
# Selecting only the "score_one", "score_two", "score_three" and "score_four"
indx1 <- indx2 <- temp[2:5]
# Finding the first integer within each number
indx2[] <- sub("(^.{1}).*", "\\1", as.matrix(indx2))
# The works horse: simultaneously compare `indx2` against `type_n` and extract decimals
indx3 <- indx1 * (indx2 == as.numeric(sub(".*_", "", temp$variable))) - floor(indx1)
# Compare the result against 0.2, sum the rows and see if any is greater than 0
temp$res<- +(rowSums(indx3 >= 0.2) > 0)
# Convert back to wide format
dcast(temp, A ~ Var, value.var = "res")
# A type_1_G2 type_2_G2 type_3_G2 type_4_G2 type_5_G2 type_7_G2 type_9_G2
# 1 1 1 0 0 0 0 0 0
# 2 2 0 1 0 1 1 0 0
# 3 3 1 1 0 1 0 0 0
# 4 4 0 0 1 0 0 0 0
# 5 5 0 1 1 0 0 0 1
现在您可以cbind
将结果df2
<intent-filter>
<action android:name="android.intent.action.VIEW" />
<category android:name="android.intent.category.DEFAULT" />
<category android:name="android.intent.category.BROWSABLE" />
<data android:scheme="fivos"
android:host="book"
/>
</intent-filter>
(这与您的结果不完全匹配,因为您提供的数据也不匹配)
答案 2 :(得分:3)
以下是尝试,将您的数据转换为long
格式,以便为每个值保留type
变量。这样就可以更容易地计算第二步中有多少小数>=2
。
library(tidyr)
#transform df1 to the long format
df <- df1 %>% gather(key, value, -A)
#calculate the type for each line
#this can be done by extracting the first digit and pasting
# "_type" in front of it
df$type <- as.factor(paste("type",sapply(strsplit(as.character(df$value),""),function(x) x[[1]]),sep="_"))
#expand the levels to add missing types
levels(df$type) <- c(levels(df$type),setdiff(paste("type",1:9,sep="_"),levels(df$type)))
#create a new column that holds the first decimal
#I assumed there was only one decimal for each number
#but you can adapt this
df$first_decimal <- as.numeric(sapply(strsplit(as.character(df$value),"[.]"),function(x) x[[2]]))
#group by A and type, if any first_decimal is bigger than 2
#G2 will be set to one for that group
df <- df %>% group_by(A,type) %>% mutate(G2=any(first_decimal>=2)*1)
#create a type_G2 column to hold the final column labels
df$type_G2 <- paste0(df$type,"_G2")
#this cbind creates the final result
cbind(df1,as.data.frame.matrix(table(df[,c("A","type")])),spread(unique(df[,c("A","type_G2","G2")]),key=type_G2,value=G2,drop=FALSE,fill=0)[,-1])
最后一个cbind的细分:
df1
是原始数据框
as.data.frame.matrix(table(df[,c("A","type")]))
是一个数据框,其中包含每个type
spread(unique(df[,c("A","type_G2","G2")]),key=type_G2,value=G2,drop=FALSE,fill=0)[,-1]
包含type_G2
信息。我对子集化的df是唯一的,因为有一些冗余信息(例如type_1_G2
对于第一行的值123.5和123.1是相同的。)
答案 3 :(得分:1)
免责声明:再次重新阅读问题之后,我的回答是错误的(至少在结果中过于复杂),如果您希望将十进制值与每个第一个数字的出现次数进行比较,请将其保留。
如果您愿意将得分十进制与此行中的每个type_N值进行比较,这是一种方式,希望这里的聪明人能够改善这一点:
decimalscores <- (df2[grepl("score_*",colnames(df2))] - floor(df2[grepl("score_*",colnames(df2))]))*10 # Get the decimal, as per the sample only one digit
typesindex <- as.numeric(sub("type_","",colnames(df2[grepl("type_*",colnames(df2))]))) # get the type_"n" columns names to reuse later
res <- t(sapply(1:nrow(df2),function(x) { # loop over the dataframe rows
sapply(typesindex,function(y) { # For each type index
colname <- paste0("type_",y)
cmptype <- unlist(unname(df2[x,colname]))
# create the result if type_n is above 0
ifelse(cmptype > 0,
any(unlist(unname(decimalscores[x,])) >= cmptype)+0L, # If one score is above the value return 1
0) # Else return 0
})
}))
colnames(res) <- paste0("type_",typesindex,"_G2") # Name the resulting columns by adding _G2 to ouptut
res <- as.data.frame(res) # turn matrix into dataframe
df3 <- cbind(df2,res) # bind them to get expected output
我希望评论足够解释,如果有什么不清楚,请告诉我。