关于加权数据的结果不一致

时间:2016-04-14 17:25:36

标签: r statistics

我得到的结果似乎与数据不符。我的数据如下:

# Example data (dput output): a data.frame declared with 215 rows
# (row.names = c(NA, -215L)) and two columns:
#   Age            - integer ages
#   Sous_Categorie - character; each entry holds one or more category codes
#                    separated by commas (e.g. "7", "7,9", "10,8,9")
DF=structure(list(Age = c(16L, 29L, 22L, 64L, 42L, 46L, 30L, 37L, 
31L, 52L, 44L, 54L, 23L, 22L, 42L, 39L, 39L, 51L, 25L, 64L, 55L, 
56L, 27L, 31L, 39L, 22L, 54L, 33L, 34L, 18L, 39L, 41L, 52L, 41L, 
27L, 36L, 64L, 42L, 21L, 44L, 50L, 35L, 22L, 65L, 53L, 18L, 25L, 
59L, 56L, 52L, 39L, 40L, 25L, 63L, 43L, 23L, 52L, 48L, 24L, 45L, 
27L, 42L, 56L, 43L, 28L, 51L, 54L, 16L, 65L, 56L, 47L, 45L, 29L, 
41L, 52L, 50L, 26L, 44L, 35L, 55L, 57L, 43L, 52L, 28L, 33L, 20L, 
39L, 15L, 55L, 20L, 30L, 10L, 54L, 51L, 47L, 36L, 42L, 33L, 26L, 
29L, 19L, 22L, 22L, 22L, 40L, 33L, 20L, 43L, 53L, 25L, 25L, 49L, 
25L, 31L, 45L, 51L, 60L, 54L, 20L, 25L, 60L, 48L, 35L, 42L, 14L, 
28L, 55L, 20L, 35L, 17L, 46L, 20L, 45L, 37L, 33L, 36L, 60L, 47L, 
27L, 25L, 51L, 32L, 19L, 25L, 19L, 60L, 18L, 17L, 33L, 26L, 33L, 
32L, 33L, 22L, 17L, 24L, 43L, 38L, 27L, 40L, 42L, 41L, 31L, 43L, 
34L, 33L, 42L, 37L, 24L, 50L, 53L, 35L, 50L, 37L, 46L, 39L, 33L, 
56L, 58L, 23L, 31L, 52L, 50L, 33L, 56L, 55L, 20L, 22L, 44L, 50L, 
30L, 58L, 59L, 16L, 33L, 53L, 50L, 20L, 31L, 22L, 38L, 59L, 38L, 
62L, 52L, 30L, 18L, 53L, 38L, 41L, 44L, 53L, 19L, 53L, 57L), 
    Sous_Categorie = c("7", "7", "7", "7", "7", "7", "7", "7", 
    "7", "7", "7", "7", "7", "7", "7", "7", "7", "7", "7", "7", 
    "7", "7", "7", "7", "7", "7", "7", "7", "7", "7", "7", "7", 
    "7", "7", "7", "7,9", "8", "8", "8", "8", "8", "9", "9", 
    "11", "10,7", "10,8,9", "7", "7", "7", "7", "7", "7,8", "8", 
    "7", "7", "7", "7", "7", "7", "7", "7", "7", "7", "7", "8", 
    "11", "7", "12", "12", "12", "12", "12", "12", "12", "12", 
    "12", "12", "12", "12", "12", "12", "12", "12", "13", "13", 
    "13", "13", "13", "14", "14", "14", "14", "14", "14", "14", 
    "14", "14", "14", "14", "14", "14", "14", "14", "14", "14", 
    "14", "14", "14", "15", "15", "15", "15", "15", "17", "17", 
    "17", "17", "17", "17", "17", "17", "17", "17", "17", "17", 
    "17", "17", "17", "17", "17", "17", "17", "17", "17", "17", 
    "17", "17", "17", "17", "17", "17", "17", "17", "17", "17", 
    "17", "17", "17", "17", "17", "17", "17", "17", "17", "17", 
    "17", "17", "17", "17", "17", "17", "17", "17", "17", "17", 
    "17", "17", "17", "17", "17", "17", "17", "17", "17", "17", 
    "17", "17", "17", "17", "17", "17", "17", "17", "17", "17", 
    "17", "17", "17", "17", "17", "18", "18", "18", "7,12", "7,12", 
    "7,12", "7,12", "7,13,17", "7,16", "7,17", "7,17", "7,17", 
    "7,17", "7,17", "8,17", "8,17", "11,17", "7,17", "7,17", 
    "8,17", "7,17", "7,17", "12,14", "12,15", "17,18")), .Names = c("Age", 
"Sous_Categorie"), row.names = c(NA, -215L), class = "data.frame")

我使用的是在 Stack Overflow 成员帮助下得到的代码(再次感谢):

    # Distinct category labels found anywhere in the comma-separated column
    cats <- unique(unlist(strsplit(DF$Sous_Categorie, split = ",")))

# Tally how many entries of `vec` each label in `cats` is found in, and
# return those tallies as shares of the total, rescaled by length(vec).
#
# Args:
#   cats: character vector of category labels; each one is handed to grepl()
#         as the pattern.
#   vec:  character vector to search.
# Returns:
#   A one-row data.frame with one column per label, named after `cats`.
cat_perc <- function(cats, vec) {
  # For each label, the number of elements of `vec` that grepl() reports
  # as containing it
  hits <- sapply(cats, function(label) sum(grepl(label, vec)))
  # Share of all hits, multiplied by the number of entries searched
  scaled <- (hits / sum(hits)) * length(vec)
  out <- as.data.frame(as.list(scaled))
  # as.data.frame() mangles non-syntactic names, so restore the raw labels
  names(out) <- cats
  out
}

# Run the tally on the sample column, then rescale the row to sum to 100
a <- cat_perc(cats, DF$Sous_Categorie)
a <- a / sum(a) * 100

我获得的结果:

 7           9        8        11        10      12       13       14
46.20061 1.215805 4.863222 0.9118541 0.6079027 6.68693 1.823708 6.382979
    15       17       18        16
1.823708 27.96353 1.215805 0.3039514

与原始数据 DF 对照,这些结果并不一致:很明显,数据中 17 出现的次数比 7 多。为什么会得到这样的结果?这是代码问题还是统计问题?

非常感谢

1 个答案:

答案 0 :(得分:1)

grepl 更适合处理字符串而不是数字;在这里,它会把 17 也当作 7 的匹配项,而这并不是您想要的。您可以编写更复杂的正则表达式,但把数据当作数字来处理会更简单。

稍微改造一下,砍掉一些不必要的东西:

# Percentage share of each category code among all codes listed in `vec`.
#
# Args:
#   vec: character vector (or factor) whose elements each hold one or more
#        comma-separated numeric category codes, e.g. "7" or "7,17".
# Returns:
#   A one-dimensional table ordered by numeric code; when at least one code
#   is present its values sum to 100.
cat_perc <- function(vec) {
    # strsplit() rejects factors, so coerce first. Splitting on the comma
    # (instead of pattern matching) keeps "17" from being counted as a "7".
    codes <- strsplit(as.character(vec), ",", fixed = TRUE)
    # table() is given an expression, not a bare variable, on purpose: a
    # variable name would become the dimnames label of the result.
    nums <- table(as.numeric(unlist(codes)))
    # Share of each code among all codes, as a percentage. Rescaling by
    # length(vec) before normalising would cancel out, so it is omitted.
    nums / sum(nums) * 100
}

# Apply cat_perc() to the Sous_Categorie column of DF; the commented lines
# below record the printed result (category code above, percentage below).
cat_perc(DF$Sous_Categorie) 
#          7          8          9         10         11         12         13         14 
# 28.8065844  4.9382716  1.6460905  0.8230453  1.2345679  9.0534979  2.4691358  8.6419753 
#        15         16         17         18 
# 2.4691358  0.4115226 37.8600823  1.6460905 

或者,不使用函数:

# Tally every individual category code across all rows of the column
nums <- table(
  as.numeric(unlist(strsplit(DF$Sous_Categorie, ",")))
)
# Shares of the total, rescaled by the number of rows, as a data frame
# with columns Var1 (code) and Freq
a <- data.frame(nums / sum(nums) * length(DF$Sous_Categorie))
# Normalise Freq so that the column sums to 100
a[["Freq"]] <- a[["Freq"]] / sum(a[["Freq"]]) * 100
a
#    Var1       Freq
# 1     7 28.8065844
# 2     8  4.9382716
# 3     9  1.6460905
# 4    10  0.8230453
# 5    11  1.2345679
# 6    12  9.0534979
# 7    13  2.4691358
# 8    14  8.6419753
# 9    15  2.4691358
# 10   16  0.4115226
# 11   17 37.8600823
# 12   18  1.6460905

您可以根据想要的输出格式,自行添加或去掉 data.frame 转换以及取子集的操作。