R 数据框与二进制列

时间:2018-04-22 22:03:40

标签: r

我有一个像这样的数据框

# Example data: one free-text column plus a row identifier.
df <- data.frame(
  text = c("test and run", "rest and sleep", "test", "test of course"),
  id = c("a", "b", "c", "d")
)
#            text id
#1   test and run  a
#2 rest and sleep  b
#3           test  c
#4 test of course  d

我想

  1. 以紧凑的方式(不使用循环)获得 text 列中出现次数最多的前 2 个单词("test" 出现 3 次,"and" 出现 2 次)

  2. 创建/添加与前2个值匹配的二进制列。

  3. topTextBinary 
    1, 1
    0, 1
    1, 0
    1, 0
    

    分别对应 "test" 和 "and"

                text id topTextBinary
    1   test and run  a          1, 1
    2 rest and sleep  b          0, 1
    3           test  c          1, 0
    4 test of course  d          1, 0
    

    谢谢

    R studio version

    platform       x86_64-w64-mingw32          
    arch           x86_64                      
    os             mingw32                     
    system         x86_64, mingw32             
    status                                     
    major          3                           
    minor          4.3                         
    year           2017                        
    month          11                          
    day            30                          
    svn rev        73796                       
    language       R                           
    version.string R version 3.4.3 (2017-11-30)
    nickname       Kite-Eating Tree      
    

2 个答案:

答案 0 :(得分:1)

我们可以做到以下几点:

# Word frequency table. Each text is trimmed and split on runs of whitespace
# so that repeated or leading spaces do not produce empty "words".
row_words <- strsplit(trimws(as.character(df$text)), "\\s+")
tbl <- table(unlist(row_words))

# Top 2 words. Indexing with seq_len(min(...)) instead of 1:2 avoids NA
# entries when the text contains fewer than two distinct words.
top <- tbl[order(tbl, decreasing = TRUE)][seq_len(min(2L, length(tbl)))]

# Flag top2 words per row.
# Each flag is 1 when the word is one of the row's tokens and 0 otherwise.
# Comparing against the tokens (instead of grepl(), which matches substrings
# and interprets the word as a regex) keeps "and" from matching "sand".
# Columns are added one at a time to df itself, so no join is needed and
# duplicated rows in df cannot multiply.
library(tidyverse)
reduce(
  names(top),
  function(acc, word) {
    flag <- as.numeric(map_lgl(row_words, ~ word %in% .x))
    # `!!flag` inlines the vector so a column called "flag" cannot mask it.
    mutate(acc, !!word := !!flag)
  },
  .init = df
)
#            text id test and
#1   test and run  a    1   1
#2 rest and sleep  b    0   1
#3           test  c    1   0
#4 test of course  d    1   0

使用 unite 将 2 个二进制列中的条目合并为一列:

# Add one 0/1 column per top word, then collapse those columns into a single
# "topTextBinary" column.
# A word is flagged only when it is one of the row's whitespace-separated
# tokens; grepl() would also match substrings and treat the word as a regex.
# Columns are added directly to df, so no join is involved and duplicated
# rows cannot multiply.
row_words <- strsplit(trimws(as.character(df$text)), "\\s+")
reduce(
  names(top),
  function(acc, word) {
    flag <- as.numeric(map_lgl(row_words, ~ word %in% .x))
    # `!!flag` inlines the vector so a column called "flag" cannot mask it.
    mutate(acc, !!word := !!flag)
  },
  .init = df
) %>%
  # Everything after the first two columns (text, id) is a flag column.
  unite(topTextBinary, -(1:2), sep = ", ")
#            text id topTextBinary
#1   test and run  a          1, 1
#2 rest and sleep  b          0, 1
#3           test  c          1, 0
#4 test of course  d          1, 0

答案 1 :(得分:1)

使用Base R:

# Two most frequent words in df$text.
# Trimming and splitting on runs of whitespace avoids counting empty tokens,
# and head() returns fewer than two names (rather than padding with NA) when
# the text has fewer than two distinct words.
words <- unlist(strsplit(trimws(as.character(df$text)), "\\s+"))
top2 <- head(names(sort(table(words), decreasing = TRUE)), 2L)

# Flag, per row, whether each of the two top words occurs as a whole word.
# The word is compared against the row's whitespace-separated tokens; grepl()
# would treat it as a regex and also match substrings (e.g. "and" in "sand").
row_words <- strsplit(trimws(as.character(df$text)), "\\s+")
has_word <- function(word) {
  as.integer(vapply(row_words, function(w) word %in% w, logical(1)))
}
transform(df, m = paste(has_word(top2[1]), has_word(top2[2]), sep = ","))
            text id   m
1   test and run  a 1,1
2 rest and sleep  b 0,1
3           test  c 1,0
4 test of course  d 1,0

如果目的是将其用于3,4或甚至前10个单词,那么您可能会考虑做类似的事情:

# Same flags for any number of top words: build one 0/1 vector per word in
# top2, then paste the vectors together row-wise.
# Words are compared against each row's whitespace-separated tokens, so a
# word never matches as a substring or as a regex pattern.
row_words <- strsplit(trimws(as.character(df$text)), "\\s+")
flags <- lapply(top2, function(word) {
  as.integer(vapply(row_words, function(w) word %in% w, logical(1)))
})
transform(df, m = do.call(paste, c(flags, sep = ",")))
            text id   m
1   test and run  a 1,1
2 rest and sleep  b 0,1
3           test  c 1,0
4 test of course  d 1,0