字符位置标识以创建新变量

时间:2015-09-29 13:29:09

标签: r dplyr tidyr

首先让一些随机数据

A <- c(1:5)
score_one <- c(123.5, 223.1, 242.2, 351.8, 123.1)
score_two <- c(324.2, 568.2, 124.9, 323.1, 213.4)
score_three <- c(553.1, 412.3, 435.7, 523.1, 365.4)
score_four <- c(123.2, 225.1, 243.6, 741.1, 951.2)


df1 <- data.frame(A, score_one, score_two, score_three, score_four)

library(dplyr)
library(tidyr)

df2 <- df1 %>% 
  group_by(A) %>% 
  mutate_each(funs(substr(.,1,1))) %>%                
  ungroup %>%
  gather(variable, type, -c(A)) %>%                     
  select(-variable) %>%
  mutate(type = paste0("type_",type),
         value = 1) %>%
  group_by(A,type) %>%                                     
  summarise(value = sum(value)) %>% 
  ungroup %>%
  spread(type, value, fill=0) %>%                       
  inner_join(df1, by=c("A")) %>%                            
  select(A, starts_with("score_"), starts_with("type_")) 

这为每个score_引入了一个摘要变量 并计算每个唯一第一位数

的频率

因此我们在第一行看到,type_1 == 2.因为在相应的score_列中,我们有2次出现,其中数字1是第一个数字

问题陈述
现在我们要引入一个调用type_n列的变量。

  • 检查值&gt; 0
  • 在这种情况下,我们要检查相应的score_列/ s
  • 这里我们分析小数之后的数字是否>&gt; =而不是值2
  • 现在,如果小数点后面的一个或所有相应行的值为&gt; = 2,我们要指定值1
  • 如果小数点后的所有相应行的值都是&lt; 2,我们要分配值0
  • 并且相反,如果type_n == 0我们要分配0
  • 说我们将此变量命名为$type_n_G2

这样所需的输出应该看起来像1

enter image description here

type_1_G2

为例
  • 我们有type_1 == 2
  • 我们在score_onescore_four
  • 处拥有相应的身份
  • 小数点后的两个值都是&gt; = 2,因此我们分配type_1_G2==1

4 个答案:

答案 0 :(得分:7)

在我看来,df1并不需要复杂的构造。将重新整形的data.table转换为长格式是一个更好的起点,可以用更少的步骤获得所需的最终结果。

使用library(data.table) # melting the original dataframe 'df1' to a long format datatable dt <- melt(setDT(df1), "A") # creating two type variables & a logical vector indicating whether # the decimal for a specific type is equal or above .2 dt[, `:=` (type1=paste0("type_",substr(value,1,1)), type2=paste0("type_",substr(value,1,1),"_g2")) ][, g2 := +(+(value - floor(value) >= 0.2)==1), .(A,type1)] # creating separate wide datatables for the variable & two type columns dt1 <- dcast(dt, A ~ variable) dt2 <- dcast(dt, A ~ type1) dt3 <- dcast(dt, A ~ type2, fun=sum, value.var="g2")[, lapply(.SD, function(x) +(x>=1)), A] # two options for merging the wide datatables together into one dtres <- dt1[dt2[dt3, on = "A"], on = "A"] dtres <- Reduce(function(...) merge(..., all = TRUE, by = "A"), list(dt1, dt2, dt3)) # or in one go without creating intermediate datatables dtres <- dcast(dt, A ~ variable)[dcast(dt, A ~ type1)[dcast(dt, A ~ type2, fun=sum, value.var = "g2")[, lapply(.SD, function(x) +(x>=1)) , A], on = "A"], on = "A"] 包的方法:

> dtres
   A score_one score_two score_three score_four type_1 type_2 type_3 type_4 type_5 type_7 type_9 type_1_g2 type_2_g2 type_3_g2 type_4_g2 type_5_g2 type_7_g2 type_9_g2
1: 1     123.5     324.2       553.1      123.2      2      0      1      0      1      0      0         1         0         0         0         0         0         0
2: 2     223.1     568.2       412.3      225.1      0      2      0      1      1      0      0         0         0         0         1         1         0         0
3: 3     242.2     124.9       435.7      243.6      1      2      0      1      0      0      0         1         1         0         1         0         0         0
4: 4     351.8     323.1       523.1      741.1      0      0      2      0      1      1      0         0         0         1         0         0         0         0
5: 5     123.1     213.4       365.4      951.2      1      1      1      0      0      0      1         0         1         1         0         0         0         1

这导致:

dplyr

此方法可以转换为tidyr / library(dplyr) library(tidyr) df <- df1 %>% gather(variable, value,-A) %>% mutate(type1 = paste0("type_",substr(value,1,1)), type2 = paste0("type_",substr(value,1,1),"_g2")) %>% group_by(A,type1) %>% mutate(g2 = +(+(value - floor(value) >= 0.2)==1), type1n = n()) %>% ungroup() d1 <- df %>% select(1:3) %>% spread(variable, value) d2 <- df %>% group_by(A, type1) %>% tally() %>% spread(type1, n, fill=0) d3 <- df %>% group_by(A, type2) %>% summarise(g = any(g2==1)) %>% spread(type2, g, fill=0) dfres <- left_join(d1, d2, by = "A") %>% left_join(., d3, by = "A") 实现,如下所示:

> dfres
  A score_one score_two score_three score_four type_1 type_2 type_3 type_4 type_5 type_7 type_9 type_1_g2 type_2_g2 type_3_g2 type_4_g2 type_5_g2 type_7_g2 type_9_g2
1 1     123.5     324.2       553.1      123.2      2      0      1      0      1      0      0         1         0         0         0         0         0         0
2 2     223.1     568.2       412.3      225.1      0      2      0      1      1      0      0         0         0         0         1         1         0         0
3 3     242.2     124.9       435.7      243.6      1      2      0      1      0      0      0         1         1         0         1         0         0         0
4 4     351.8     323.1       523.1      741.1      0      0      2      0      1      1      0         0         0         1         0         0         0         0
5 5     123.1     213.4       365.4      951.2      1      1      1      0      0      0      1         0         1         1         0         0         0         1

给出相同的结果:

{
Title: "Cheese Fondling",

Body: "I love cheese, especially paneer mozzarella. Roquefort cheeseburger cut the cheese fondue edam taleggio cheese slices gouda. Dolcelatte croque monsieur cottage cheese camembert de normandie cheese slices st. agur blue cheese bavarian bergkase swiss. Edam cheesecake parmesan.",
}

答案 1 :(得分:5)

以下是使用melt包对数据进行首次dcast然后data.table的向量化尝试。它需要一些润色,但我现在没有时间

library(data.table) # v >= 1.9.6
# melt and order by "A" 
temp <- setorder(melt(df2, id = 1:5), A)

# Create the "type_n_G2" column names
temp$Var <- paste0(temp$variable, "_G2")

# Selecting only the "score_one", "score_two", "score_three" and "score_four"
indx1 <- indx2 <- temp[2:5]

# Finding the first integer within each number
indx2[] <- sub("(^.{1}).*", "\\1", as.matrix(indx2))

# The works horse: simultaneously compare `indx2` against `type_n` and extract decimals
indx3 <- indx1 * (indx2 == as.numeric(sub(".*_", "", temp$variable))) - floor(indx1)

# Compare the result against 0.2, sum the rows and see if any is greater than 0
temp$res<- +(rowSums(indx3 >= 0.2) > 0)

# Convert back to wide format
dcast(temp, A ~ Var, value.var = "res")
#   A type_1_G2 type_2_G2 type_3_G2 type_4_G2 type_5_G2 type_7_G2 type_9_G2
# 1 1         1         0         0         0         0         0         0
# 2 2         0         1         0         1         1         0         0
# 3 3         1         1         0         1         0         0         0
# 4 4         0         0         1         0         0         0         0
# 5 5         0         1         1         0         0         0         1

现在您可以cbind将结果df2 <intent-filter> <action android:name="android.intent.action.VIEW" /> <category android:name="android.intent.category.DEFAULT" /> <category android:name="android.intent.category.BROWSABLE" /> <data android:scheme="fivos" android:host="book" /> </intent-filter> (这与您的结果不完全匹配,因为您提供的数据也不匹配)

答案 2 :(得分:3)

以下是尝试,将您的数据转换为long格式,以便为每个值保留type变量。这样就可以更容易地计算第二步中有多少小数>=2

library(tidyr)

#transform df1 to the long format
df <- df1 %>% gather(key, value, -A)

 #calculate the type for each line
 #this can be done by extracting the first digit and pasting 
 # "_type" in front of it
df$type <- as.factor(paste("type",sapply(strsplit(as.character(df$value),""),function(x) x[[1]]),sep="_"))

 #expand the levels to add missing types
levels(df$type) <- c(levels(df$type),setdiff(paste("type",1:9,sep="_"),levels(df$type)))

#create a new column that holds the first decimal
#I assumed there was only one decimal for each number 
#but you can adapt this
df$first_decimal <- as.numeric(sapply(strsplit(as.character(df$value),"[.]"),function(x) x[[2]]))

#group by A and type, if any first_decimal is bigger than 2
#G2 will be set to one for that group
df <- df %>% group_by(A,type) %>% mutate(G2=any(first_decimal>=2)*1)

#create a type_G2 column to hold the final column labels
df$type_G2 <- paste0(df$type,"_G2")

#this cbind creates the final result
cbind(df1,as.data.frame.matrix(table(df[,c("A","type")])),spread(unique(df[,c("A","type_G2","G2")]),key=type_G2,value=G2,drop=FALSE,fill=0)[,-1])

最后一个cbind的细分:

df1是原始数据框

as.data.frame.matrix(table(df[,c("A","type")]))是一个数据框,其中包含每个type

的数量

spread(unique(df[,c("A","type_G2","G2")]),key=type_G2,value=G2,drop=FALSE,fill=0)[,-1]包含type_G2信息。我对子集化的df是唯一的,因为有一些冗余信息(例如type_1_G2对于第一行的值123.5和123.1是相同的。)

答案 3 :(得分:1)

免责声明:再次重新阅读问题之后,我的回答是错误的(至少在结果中过于复杂),如果您希望将十进制值与每个第一个数字的出现次数进行比较,请将其保留。

如果您愿意将得分十进制与此行中的每个type_N值进行比较,这是一种方式,希望这里的聪明人能够改善这一点:

decimalscores <- (df2[grepl("score_*",colnames(df2))] - floor(df2[grepl("score_*",colnames(df2))]))*10 # Get the decimal, as per the sample only one digit 
typesindex <- as.numeric(sub("type_","",colnames(df2[grepl("type_*",colnames(df2))]))) # get  the type_"n" columns names to reuse later
res <- t(sapply(1:nrow(df2),function(x) { # loop over the dataframe rows
    sapply(typesindex,function(y) { # For each type index  
        colname <- paste0("type_",y)
        cmptype <- unlist(unname(df2[x,colname]))
        # create the result if type_n is above 0 
        ifelse(cmptype > 0,
               any(unlist(unname(decimalscores[x,])) >= cmptype)+0L, # If one score is above the value return 1
               0) # Else return 0
     })
  }))
colnames(res) <- paste0("type_",typesindex,"_G2") # Name the resulting columns by adding _G2 to ouptut
res <- as.data.frame(res) # turn matrix into dataframe
df3 <- cbind(df2,res) # bind them to get expected output

我希望评论足够解释,如果有什么不清楚,请告诉我。