我有一个关于函数构建的问题。我想完成以下任务:
我有一个数据集 df,其中包含若干变量以及一个文本向量。该文本向量需要先进行清理(删除数字和标点、删除停用词等),再用有监督的分类算法进行处理,并保存结果(readme.results$est.CSMF[[2]])。
这个函数本身并不难写,让我头疼的是如何重复调用它。我想针对唯一发言人列表中的每一位发言人,以及不同的时间段,重复(循环)执行该函数。
# Attempt from the question: run the pipeline once per speaker and collect the results.
# Keep only the rows of df whose date lies between 1990-12-20 and 1994-11-10 (inclusive).
df_select <- df[df$date >= "1990-12-20" & df$date <= "1994-11-10",]
# List that is meant to receive one result per loop iteration.
liste = list()
# NOTE(review): the upper bound after `1:` is missing, so this line does not parse as written.
for(i in 1:)
{
# Keep only the rows of the i-th speaker; this overwrites df_select.
df_select <- subset(df_select, df_select$speaker == paste(speaker_list[i]))
{
# Redefined on every iteration; takes no arguments and reads df_select from the enclosing scope.
# NOTE(review): complete_fun is only defined here, it is never called in this snippet.
complete_fun <- function(){
# Draw 200 rows with replacement.
df_sample <- sample_n(df_select , 200, replace = T)
# Clean the text: character type, lower case, digits removed, punctuation removed.
df_sample$text <- as.character(df_sample$text)
df_sample$text <- tolower(df_sample$text)
df_sample$text <- tm::removeNumbers(df_sample$text)
df_sample$text <- tm::removePunctuation(df_sample$text)
# Replaces every " " with "".
# NOTE(review): this joins the words together before stop-word removal; confirm the intended pattern.
df_sample$text <- str_replace_all(df_sample$text, " ", "")
# Remove German stop words.
df_sample$text <- tm::removeWords(x = df_sample$text, stopwords(kind = "german"))
# Changes the working directory; the relative file names used below resolve against it.
setwd("~/test")
# Helper: writes element i of N to the file data<i>.txt with write.table().
fn1 <- function(N) {
for(i in 1:length(N)) {
file.out <- paste("data", i, ".txt", sep = "")
write.table(N[i], file.out)
}
}
# Write one file per sampled text.
fn1(df_sample$text)
#### README - Classification Algorithm ####
# NOTE(review): undergrad(), preprocess() and readme() are not defined in this snippet;
# presumably they come from the ReadMe package and read the files written above. Confirm.
undergrad.results = undergrad(sep = ";", ignore.case=T, stem=T)
undergrad.preprocess <- preprocess(undergrad.results)
readme.results <- readme(undergrad.preprocess,n.subset=300)
# Last expression of the function body, i.e. the value complete_fun() returns.
readme.results$est.CSMF[[2]]
}
# Append a result to the list.
# NOTE(review): readme.results is assigned only inside complete_fun, so it is not visible here.
liste[[length(liste)+1]] = readme.results$est.CSMF[[2]]
}
# Reset for the next speaker.
# NOTE(review): this restores the full df, not the date-filtered subset built before the loop.
df_select <- df
}
答案 0 :(得分:0)
由于没有您的数据,我无法进行测试,但下面这种写法应该会更好。
先把函数单独定义出来。
# The relative file names used by fn1 (data<i>.txt) resolve against this directory.
# NOTE(review): setwd() changes the working directory for the rest of the session.
setwd("~/test")
# Write every element of N to its own file in the current working directory.
#
# Element i is written with write.table() to "data<i>.txt"
# (data1.txt, data2.txt, ...).
#
# N: a vector; one output file is produced per element.
# Returns NULL invisibly; the function is called for its side effect.
fn1 <- function(N) {
  # seq_along() gives zero iterations for an empty N, whereas 1:length(N)
  # would run for i = 1 and i = 0 and write bogus files.
  for (i in seq_along(N)) {
    file.out <- paste0("data", i, ".txt")
    write.table(N[i], file.out)
  }
}
# Run the whole pipeline for one data frame: sample rows, clean their text,
# write the texts to disk and run the classification steps.
#
# x: data frame with a `text` column.
# Returns readme.results$est.CSMF[[2]], the second element of the est.CSMF
# component of the readme() result.
#
# Side effect: fn1() writes one data<i>.txt file per sampled text into the
# current working directory.
complete_fun <- function(x) {
  # Draw 200 rows with replacement. TRUE is spelled out because T is an
  # ordinary variable that can be reassigned.
  df_sample <- sample_n(x, 200, replace = TRUE)

  # Clean the text: character type, lower case, no digits, no punctuation.
  df_sample$text <- as.character(df_sample$text)
  df_sample$text <- tolower(df_sample$text)
  df_sample$text <- tm::removeNumbers(df_sample$text)
  df_sample$text <- tm::removePunctuation(df_sample$text)
  # NOTE(review): this removes every " ", which joins the words together
  # before the stop-word step below; confirm the intended pattern.
  df_sample$text <- str_replace_all(df_sample$text, " ", "")
  df_sample$text <- tm::removeWords(
    x = df_sample$text,
    stopwords(kind = "german")
  )

  # One file per sampled text.
  fn1(df_sample$text)

  #### README - Classification Algorithm ####
  # NOTE(review): undergrad(), preprocess() and readme() are not defined in
  # this file; presumably they come from the ReadMe package and pick up the
  # files written by fn1(). Confirm.
  undergrad.results <- undergrad(sep = ";", ignore.case = TRUE, stem = TRUE)
  undergrad.preprocess <- preprocess(undergrad.results)
  readme.results <- readme(undergrad.preprocess, n.subset = 300)
  readme.results$est.CSMF[[2]]
}
然后创建一个调用该函数的循环,并将每次的输出存储在列表中:
# Restrict the data to the period of interest once, before the loop.
df_select <- df[df$date >= "1990-12-20" & df$date <= "1994-11-10", ]

# One result slot per speaker, allocated up front.
liste <- vector("list", length(speaker_list))

# seq_along() gives zero iterations when speaker_list is empty.
for (i in seq_along(speaker_list)) {
  current_speaker <- as.character(speaker_list[i])
  # Filter into a separate variable: assigning the subset back to df_select
  # would leave only the first speaker's rows for every later iteration.
  # which() drops rows whose speaker is NA, as subset() does.
  speaker_rows <- which(df_select$speaker == current_speaker)
  df_speaker <- df_select[speaker_rows, , drop = FALSE]
  liste[[i]] <- complete_fun(df_speaker)
}