在特定阈值下使用来自另一个数据帧的术语频率列表

时间:2017-06-11 08:38:55

标签: r

我的数据集是:

    # Example data frame written as a structure() literal: 10 factor columns
    # (colname1 ... colname13, not consecutively numbered) and 27 rows.
    # The "" factor level marks blank cells in the shorter columns.
    sparsedf <- structure(list(colname1 = structure(c(8L, 3L, 4L, 7L, 2L, 6L, 
5L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L), .Label = c("", "price106", "price142", "price185", 
"price655", "price67", "price753", "price99"), class = "factor"), 
    colname2 = structure(c(2L, 3L, 8L, 15L, 5L, 4L, 12L, 9L, 
    10L, 7L, 11L, 6L, 13L, 14L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L), .Label = c("", "price100", "price143", 
    "price16", "price271", "price29", "price3", "price36", "price391", 
    "price433", "price505", "price56", "price578", "price655", 
    "price753"), class = "factor"), colname3 = structure(c(2L, 
    8L, 4L, 5L, 6L, 7L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 
    17L, 19L, 20L, 3L, 18L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("", 
    "price101", "price106", "price186", "price228", "price272", 
    "price314", "price33", "price354", "price392", "price434", 
    "price469", "price506", "price541", "price579", "price615", 
    "price652", "price67", "price686", "price720"), class = "factor"), 
    colname4 = structure(c(2L, 3L, 8L, 5L, 9L, 6L, 18L, 7L, 13L, 
    10L, 19L, 12L, 14L, 16L, 11L, 15L, 4L, 17L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L), .Label = c("", "price102", "price144", 
    "price20", "price229", "price315", "price393", "price4", 
    "price46", "price470", "price52", "price542", "price55", 
    "price580", "price6", "price616", "price655", "price7", "price753"
    ), class = "factor"), colname6 = structure(c(1L, 2L, 3L, 
    4L, 6L, 7L, 8L, 9L, 10L, 11L, 5L, 12L, 13L, 14L, 15L, 16L, 
    17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L, 27L), .Label = c("price104", 
    "price146", "price188", "price231", "price25", "price274", 
    "price317", "price356", "price395", "price436", "price472", 
    "price544", "price582", "price618", "price654", "price687", 
    "price722", "price752", "price779", "price809", "price835", 
    "price857", "price881", "price904", "price926", "price947", 
    "price966"), class = "factor"), colname7 = structure(c(2L, 
    4L, 5L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("", 
    "price105", "price106", "price147", "price189"), class = "factor"), 
    colname9 = structure(c(2L, 3L, 4L, 5L, 6L, 7L, 11L, 8L, 9L, 
    10L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L), .Label = c("", "price107", "price149", "price191", 
    "price233", "price276", "price319", "price397", "price438", 
    "price474", "price57"), class = "factor"), colname11 = structure(c(2L, 
    3L, 4L, 5L, 6L, 7L, 8L, 9L, 11L, 12L, 13L, 14L, 15L, 16L, 
    19L, 17L, 10L, 18L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("", 
    "price109", "price12", "price193", "price235", "price278", 
    "price321", "price359", "price399", "price40", "price440", 
    "price475", "price511", "price547", "price585", "price621", 
    "price689", "price754", "price78"), class = "factor"), colname12 = structure(c(2L, 
    3L, 4L, 5L, 6L, 7L, 8L, 9L, 11L, 12L, 13L, 14L, 15L, 16L, 
    17L, 18L, 19L, 20L, 21L, 22L, 23L, 10L, 24L, 25L, 26L, 27L, 
    1L), .Label = c("", "price110", "price150", "price194", "price236", 
    "price279", "price322", "price360", "price400", "price42", 
    "price441", "price476", "price512", "price548", "price586", 
    "price622", "price656", "price690", "price725", "price755", 
    "price782", "price812", "price838", "price884", "price907", 
    "price929", "price950"), class = "factor"), colname13 = structure(c(3L, 
    5L, 6L, 7L, 8L, 9L, 10L, 11L, 4L, 17L, 12L, 13L, 14L, 15L, 
    16L, 18L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("", 
    "price106", "price11", "price12", "price13", "price15", "price237", 
    "price280", "price323", "price361", "price401", "price513", 
    "price549", "price587", "price623", "price657", "price67", 
    "price753"), class = "factor")), .Names = c("colname1", "colname2", 
"colname3", "colname4", "colname6", "colname7", "colname9", "colname11", 
"colname12", "colname13"), class = "data.frame", row.names = c(NA, 
-27L))

我想使用下面的列表(frequency_term_df):它的第一列是每个术语的名称,第二列是该术语出现的频率。我想用这个列表处理上面的数据帧,删除其中频率小于或等于 2 的术语。

有可能成功吗?

    # Term-frequency lookup table: `name` holds each term and `Number` holds
    # how many times it occurs (150 rows; the empty string "" occurs 110 times).
    # The original literal ended with `.internal.selfref = <pointer: ...>`,
    # which is printed output from data.table and not valid R, so the statement
    # could not be parsed. That attribute is dropped here; the data is unchanged.
    # NOTE(review): data.table users may want to call
    # data.table::setDT(frequency_term_df) afterwards to restore the
    # self-reference -- confirm against the data.table documentation.
    frequency_term_df <- structure(list(name = c("price99", "price100", "price101", "price102", 
"price104", "price105", "price107", "price109", "price110", "price11", 
"price142", "price143", "price33", "price144", "price146", "price147", 
"price149", "price12", "price150", "price13", "price185", "price36", 
"price186", "price4", "price188", "price189", "price191", "price193", 
"price194", "price15", "price753", "price228", "price229", "price231", 
"price106", "price233", "price235", "price236", "price237", "price271", 
"price272", "price46", "price274", "", "price276", "price278", 
"price279", "price280", "price67", "price16", "price314", "price315", 
"price317", "price319", "price321", "price322", "price323", "price655", 
"price56", "price354", "price7", "price356", "price57", "price359", 
"price360", "price361", "price391", "price392", "price393", "price395", 
"price397", "price399", "price400", "price401", "price433", "price434", 
"price55", "price436", "price438", "price440", "price441", "price3", 
"price469", "price470", "price472", "price474", "price475", "price476", 
"price505", "price506", "price25", "price511", "price512", "price513", 
"price29", "price541", "price542", "price544", "price547", "price548", 
"price549", "price578", "price579", "price580", "price582", "price585", 
"price586", "price587", "price615", "price616", "price618", "price621", 
"price622", "price623", "price652", "price52", "price654", "price78", 
"price656", "price657", "price686", "price6", "price687", "price689", 
"price690", "price720", "price20", "price722", "price40", "price725", 
"price752", "price754", "price755", "price779", "price782", "price809", 
"price812", "price835", "price838", "price857", "price42", "price881", 
"price884", "price904", "price907", "price926", "price929", "price947", 
"price950", "price966"), Number = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 4L, 1L, 1L, 1L, 4L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 110L, 1L, 1L, 1L, 1L, 3L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
)), row.names = c(NA, -150L), class = c("data.table", "data.frame"
), .Names = c("name", "Number"))

1 个答案:

答案 0 :(得分:1)

我假设你所说的"删除"是指把这些值替换为空字符串。首先,找出频率小于或等于 2 的术语名称。

# Names of the terms whose frequency is 2 or lower; these are the ones to blank.
# `[[` extracts each column by its exact name (no partial matching).
freq2 <- frequency_term_df[["name"]][frequency_term_df[["Number"]] <= 2]

然后逐列处理数据帧(例如用 ifelse 配合 sapply),把不需要的术语替换为空字符串。

# Blank out every low-frequency term, column by column.
# lapply() plus `sparsedf2[] <-` is used instead of as.data.frame(sapply(...)):
# sapply() simplifies to a matrix, or to a plain vector when the data frame
# has a single row, so the result could lose its shape. This form keeps the
# dimensions, column names and row names of `sparsedf`, and always yields
# character columns regardless of the R version.
sparsedf2 <- sparsedf
sparsedf2[] <- lapply(sparsedf, function(column) {
  values <- as.character(column)
  # %in% never returns NA, so missing values are left untouched.
  values[values %in% freq2] <- ""
  values
})