在 R 中创建一个多列汇总表,统计字符串(主题标签)随时间的出现频率

时间:2018-02-20 16:06:17

标签: r frequency hashtag summary

我的数据框看起来像这样:

     Day           text          place    gender
Feb 20 2016   #geom and #stats      SP          M
Feb 20 2016   #geom and #stats      SP          F
Feb 20 2016   #bio and #stats       SP          M

我想从“text”列中提取主题标签,然后用这些信息构建一个汇总表(摘要):

Day          Hashtag    Daily_Freq  %men    %women  Freq_UK Freq_SP
Feb 20 2016   #stats      2              0.5      0.5     1       1
Feb 20 2016   #maths      1              1        0       1       0
Feb 20 2016   #geom       1              0        1       0       1

我不知道如何做到这一点!谁能帮我?

1 个答案:

答案 0 :(得分:0)

# Kept for R < 4.0, where data.frame()/read.table() would otherwise turn
# strings into factors; it is already the default from R 4.0 onwards.
options(stringsAsFactors = FALSE)

# Sample input. colClasses = "character" stops read.table() from guessing
# types: a gender column holding only "F" (or "T") would otherwise be read
# as logical and every `gender == "F"` comparison would silently fail.
df <- read.table(text = " Day           text            place           gender
                        'Feb 20 2016'   '#stats and #maths'      UK          M
                        'Feb 20 2016'   '#geom and #stats'       SP          F", 
                 header = TRUE,
                 colClasses = "character")

# Extract the hashtags found in each element of `text`.
#
# `text` is a character vector. Each element is split on runs of whitespace
# and/or commas, and only the tokens starting with "#" are kept.
# Returns a list with one character vector per element of `text`
# (character(0) when an element contains no hashtag).
extract_hashtags <- function(text) {
  tokens <- strsplit(text, split = "[\\s,]+", perl = TRUE)
  lapply(tokens, function(item) item[startsWith(item, "#")])
}

# Build the long form: one row per (input row, hashtag) pair.
#
# `df` must have the columns Day, place and gender; `tags` is the list
# returned by extract_hashtags(df$text). Rows without any hashtag simply
# contribute no output rows (building one data.frame per row fails for them
# with "arguments imply differing number of rows").
expand_hashtags <- function(df, tags) {
  # Index of the source row for every extracted hashtag.
  row_id <- rep(seq_len(nrow(df)), lengths(tags))
  data.frame(
    Day = df[["Day"]][row_id],
    # as.character() keeps the column when no hashtag exists at all
    # (unlist() of an empty list is NULL, which data.frame() would drop).
    Hashtag = as.character(unlist(tags)),
    place = df[["place"]][row_id],
    gender = df[["gender"]][row_id],
    stringsAsFactors = FALSE
  )
}

# Compute one summary row per (Day, Hashtag) combination present in `long`.
#
# Returns a data.frame with the columns Day, Hashtag, Daily_Freq, %men,
# %women, Freq_UK and Freq_SP. The two Freq_* columns are tied to the places
# "UK" and "SP" requested in the question.
summarise_hashtags <- function(long) {
  # drop = TRUE discards Day x Hashtag combinations that never occur;
  # without it each of them becomes an empty group and ends up as a bogus
  # row (Day = NA, Daily_Freq = 0, NaN shares) as soon as there are several
  # days in the data.
  groups <- split(long, list(long$Day, long$Hashtag), drop = TRUE)

  if (length(groups) == 0L) {
    # No hashtags at all: return an empty table with the expected columns.
    return(data.frame(
      Day = character(0),
      Hashtag = character(0),
      Daily_Freq = integer(0),
      "%men" = numeric(0),
      "%women" = numeric(0),
      Freq_UK = integer(0),
      Freq_SP = integer(0),
      check.names = FALSE,
      stringsAsFactors = FALSE
    ))
  }

  per_group <- lapply(groups, function(item) {
    data.frame(
      Day = item$Day[1],
      Hashtag = item$Hashtag[1],
      Daily_Freq = NROW(item),
      "%men" = mean(item$gender == "M"),
      "%women" = mean(item$gender == "F"),
      Freq_UK = sum(item$place == "UK"),
      Freq_SP = sum(item$place == "SP"),
      # check.names = FALSE keeps the non-syntactic "%men"/"%women" names.
      check.names = FALSE,
      stringsAsFactors = FALSE
    )
  })

  do.call(rbind, c(per_group, make.row.names = FALSE))
}

# extract tags
tags <- extract_hashtags(df$text)

# long form - each hashtag on its own row
long <- expand_hashtags(df, tags)

# combine result
res <- summarise_hashtags(long)
res
# 
#         Day Hashtag Daily_Freq %men %women Freq_UK Freq_SP
# 1 Feb 20 2016   #geom          1  0.0    1.0       0       1
# 2 Feb 20 2016  #maths          1  1.0    0.0       1       0
# 3 Feb 20 2016  #stats          2  0.5    0.5       1       1