我的数据框看起来像这样:
Day text place gender
Feb 20 2016 #stats and #maths UK M
Feb 20 2016 #geom and #stats SP F
我想从“text”列中提取主题标签(hashtag),然后按日期和标签汇总,构建如下的摘要表:
Day Hashtag Daily_Freq %men %women Freq_UK Freq_SP
Feb 20 2016 #stats 2 0.5 0.5 1 1
Feb 20 2016 #maths 1 1 0 1 0
Feb 20 2016 #geom 1 0 1 0 1
我不知道如何做到这一点!谁能帮我?
答案 0 :(得分:0)
# Legacy guard: R < 4.0 turns strings into factors by default, which would
# break strsplit() on the text column. On R >= 4.0 this is already the default.
options(stringsAsFactors = FALSE)

# Extract the hashtags of every text.
#
# text: character vector (one element per post).
# Returns a list as long as `text`; each element is the character vector of
# whitespace/comma separated tokens that start with "#" (possibly empty).
extract_hashtags <- function(text) {
  tokens <- strsplit(as.character(text), split = "[\\s,\\t]+", perl = TRUE)
  # The is.na() guard keeps a missing text from producing an NA "hashtag".
  lapply(tokens, function(item) item[!is.na(item) & startsWith(item, "#")])
}

# Reshape the posts to long form: one row per (post, hashtag) pair.
#
# df:   data.frame with columns Day, text, place and gender.
# tags: list of hashtags per row of `df`, as returned by extract_hashtags().
# Returns a data.frame with columns Day, Hashtag, place and gender. Posts
# without any hashtag contribute no rows (building one data.frame per post
# would fail for them with "arguments imply differing number of rows").
hashtags_long <- function(df, tags = extract_hashtags(df[["text"]])) {
  # Index of the source row, repeated once per hashtag found in that row.
  src_row <- rep(seq_len(NROW(df)), times = lengths(tags))
  data.frame(
    Day = df[["Day"]][src_row],
    # unlist() of an all-empty list is NULL; as.character() keeps the column.
    Hashtag = as.character(unlist(tags, use.names = FALSE)),
    place = df[["place"]][src_row],
    gender = df[["gender"]][src_row],
    stringsAsFactors = FALSE
  )
}

# Summarise the long table per Day and Hashtag.
#
# long: data.frame as returned by hashtags_long().
# Returns one row per (Day, Hashtag) pair that actually occurs, with columns
# Day, Hashtag, Daily_Freq (number of posts), %men / %women (share of posts
# with gender "M" / "F"), Freq_UK and Freq_SP (posts per place).
summarise_hashtags <- function(long) {
  if (NROW(long) == 0L) {
    # Nothing to group: return an empty table with the documented columns
    # instead of letting rbind() run on no data.frames at all.
    return(data.frame(
      Day = character(0),
      Hashtag = character(0),
      Daily_Freq = integer(0),
      "%men" = numeric(0),
      "%women" = numeric(0),
      Freq_UK = integer(0),
      Freq_SP = integer(0),
      check.names = FALSE,
      stringsAsFactors = FALSE
    ))
  }

  # drop = TRUE discards Day x Hashtag combinations that never occur; without
  # it every such combination yields an empty group and a bogus NA/NaN row as
  # soon as the data spans more than one day.
  groups <- split(long, list(long$Day, long$Hashtag), drop = TRUE)

  per_group <- lapply(groups, function(item) {
    data.frame(
      Day = item$Day[1],
      Hashtag = item$Hashtag[1],
      Daily_Freq = NROW(item),
      "%men" = mean(item$gender == "M"),
      "%women" = mean(item$gender == "F"),
      Freq_UK = sum(item$place == "UK"),
      Freq_SP = sum(item$place == "SP"),
      check.names = FALSE,
      stringsAsFactors = FALSE
    )
  })

  # make.row.names = FALSE gives plain 1..n row names instead of group labels.
  do.call(rbind, c(per_group, make.row.names = FALSE))
}

# Example input: one row per post. colClasses keeps every column as character,
# so a gender column that happens to contain only "F" is not read as logical.
df <- read.table(text = " Day text place gender
'Feb 20 2016' '#stats and #maths' UK M
'Feb 20 2016' '#geom and #stats' SP F",
header = TRUE, colClasses = "character")

# extract tags
tags <- extract_hashtags(df$text)

# long form - each hashtag on its own row
long <- hashtags_long(df, tags)

# per-day, per-hashtag statistics
res <- summarise_hashtags(long)
res
#
#           Day Hashtag Daily_Freq %men %women Freq_UK Freq_SP
# 1 Feb 20 2016   #geom          1  0.0    1.0       0       1
# 2 Feb 20 2016  #maths          1  1.0    0.0       1       0
# 3 Feb 20 2016  #stats          2  0.5    0.5       1       1