假定数据框存储为 fruit,并且格式如下:
State Fruit Category Fruit Type Gross Value
ACT CitrusFruit Mandarins $4,500,000
ACT CitrusFruit Oranges
NSW PomeFruit Apple $139,130,203.50
NSW Grapes Wine Production $50,000,000
NSW OrchardStoneFruit Avocados $10,031,123
QLD CitrusFruit Oranges
dput(fruit) 的输出如下:
structure(list(State = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 3L,
3L, 3L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 7L, 7L, 7L, 8L, 8L,
8L, 2L, 2L, 3L, 3L, 4L, 4L, 5L, 5L, 6L, 6L, 7L, 7L, 8L, 8L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 8L, 8L, 8L, 8L,
2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L,
6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 8L, 2L, 2L, 2L, 3L,
3L, 3L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 7L, 7L, 7L, 8L, 8L,
8L), .Label = c("ACT", "NSW", "NT", "QLD", "SA", "TAS", "VIC",
"WA"), class = "factor"), Fruit.Category = structure(c(6L, 6L,
6L, 8L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 4L, 4L,
4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 7L, 7L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L,
9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L,
9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L,
9L, 9L, 9L, 9L, 9L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L,
10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L,
10L, 10L, 10L, 10L, 10L, 10L, 11L, 11L, 11L, 11L, 11L, 11L, 11L,
11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L,
11L), .Label = c(" Grapes ", " OrchardStoneFruit ", " OtherFruit ",
" PomeFruit ", " CitrusFruit ", " CitrusFruit ", " Grapes ",
" Grapes ", " OrchardStoneFruit ", " OtherFruit ", " PomeFruit "
), class = "factor"), Fruit.Type = structure(c(5L, 8L, 13L, 18L,
31L, 2L, 4L, 6L, 7L, 9L, 14L, 17L, 3L, 11L, 12L, 15L, 1L, 10L,
16L, 13L, 23L, 26L, 13L, 23L, 26L, 13L, 23L, 26L, 13L, 23L, 26L,
13L, 23L, 26L, 13L, 23L, 26L, 13L, 23L, 26L, 18L, 31L, 18L, 31L,
18L, 31L, 18L, 31L, 18L, 31L, 18L, 31L, 18L, 31L, 14L, 17L, 20L,
22L, 24L, 25L, 27L, 14L, 17L, 20L, 22L, 24L, 25L, 27L, 14L, 17L,
20L, 22L, 24L, 25L, 27L, 14L, 17L, 20L, 22L, 24L, 25L, 27L, 14L,
17L, 20L, 22L, 24L, 25L, 27L, 14L, 17L, 20L, 22L, 24L, 25L, 27L,
14L, 17L, 20L, 22L, 24L, 25L, 27L, 15L, 21L, 29L, 30L, 15L, 21L,
29L, 30L, 15L, 21L, 29L, 30L, 15L, 21L, 29L, 30L, 15L, 21L, 29L,
30L, 15L, 21L, 29L, 30L, 15L, 21L, 29L, 30L, 16L, 19L, 28L, 16L,
19L, 28L, 16L, 19L, 28L, 16L, 19L, 28L, 16L, 19L, 28L, 16L, 19L,
28L, 16L, 19L, 28L), .Label = c(" Apples ", " Avocados ",
" Bananas ", " Cherries ", " Mandarins ", " Mangoes ",
" Nectarines ", " Oranges ", " Peaches ", " Pears ",
" Pineapples ", " Strawberries ", " AllOtherCitrusFruit ",
" AllOtherOrchardFruit ", " AllOtherOtherFruit ", " AllOtherPomeFruit ",
" AllOtherStoneFruit ", " AllOtherUses ", " Apples ", " Avocados ",
" Bananas ", " Cherries ", " Mandarins ", " Mangoes ", " Nectarines ",
" Oranges ", " Peaches ", " Pears ", " Pineapples ", " Strawberries ",
" WineProduction "), class = "factor"), Gross.Value = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 60L, 97L, 23L, 104L, 1L, 1L, 56L, 98L, 36L, 101L, 68L,
11L, 1L, 1L, 1L, 91L, 96L, 57L, 99L, 92L, 21L, 71L, 29L, 48L,
1L, 76L, 51L, 46L, 58L, 1L, 34L, 37L, 14L, 22L, 70L, 18L, 59L,
28L, 32L, 41L, 83L, 61L, 69L, 30L, 1L, 1L, 26L, 1L, 1L, 25L,
35L, 19L, 2L, 80L, 9L, 8L, 7L, 102L, 47L, 31L, 1L, 85L, 75L,
1L, 88L, 93L, 52L, 1L, 66L, 50L, 100L, 43L, 89L, 95L, 2L, 82L,
65L, 5L, 24L, 94L, 33L, 64L, 10L, 90L, 78L, 84L, 62L, 3L, 86L,
20L, 73L, 1L, 38L, 67L, 72L, 15L, 63L, 1L, 1L, 39L, 17L, 1L,
1L, 16L, 40L, 1L, 1L, 103L, 79L, 49L, 1L, 44L, 6L, 105L, 53L,
1L, 1L, 1L, 1L, 81L, 54L, 27L, 87L, 13L, 1L, 55L, 106L, 4L, 42L,
12L, 45L, 77L, 74L), .Label = c("", "$0.00", "$1,025,861.63",
"$1,107,476.82", "$1,135,055.74", "$1,148,385.97", "$1,514,089.93",
"$1,539,762.85", "$1,565,234.83", "$10,469,580.98", "$100,622,922.20",
"$106,039,956.40", "$11,648,561.35", "$113,930,475.80", "$114,195,162.80",
"$12,169,338.44", "$12,492,792.64", "$12,843,528.01", "$120,877,197.60",
"$13,245.08", "$13,331,668.11", "$13,981,075.51", "$130,258,416.50",
"$14,203,578.43", "$14,697,408.09", "$15,085,825.24", "$15,196.71",
"$15,246,349.76", "$154,858,589.30", "$168,325.78", "$17,661,100.37",
"$18,278,371.16", "$188,414.59", "$19,896,312.15", "$2,370,402.03",
"$2,557,589.86", "$209,648,663.50", "$21,426,350.11", "$22,482,034.46",
"$23,929,331.35", "$238,668.61", "$249,675,376.10", "$26,669,599.23",
"$27,540,236.71", "$270,903.84", "$3,485,520.14", "$3,520,605.89",
"$3,659,706.68", "$3,829,198.67", "$301,644.66", "$301,976.25",
"$31,133,715.88", "$313,144.86", "$334,363.30", "$35,212,772.81",
"$37,927,507.70", "$38,989,343.33", "$385,858,491.60", "$4,447,813.26",
"$4,549,208.46", "$4,569,373.00", "$4,702.20", "$4,712,329.56",
"$4,995,833.14", "$40,133,037.39", "$40,481.05", "$435,712,531.70",
"$44,434,103.55", "$443,017.10", "$45,665,029.35", "$45,888,545.67",
"$46,638,011.92", "$47,589.51", "$5,793,841.42", "$5,854,982.37",
"$51,534,636.09", "$53,367,548.56", "$53,377,925.45", "$555,799.71",
"$57,522,144.94", "$57,930,562.37", "$58,316,912.75", "$6,170,170.78",
"$6,791,088.95", "$6,824,520.08", "$623,030.52", "$63,493,163.21",
"$664,237.23", "$7,066,407.60", "$7,168,380.92", "$7,364,245.36",
"$7,426,224.28", "$7,894.54", "$70,218,810.35", "$76,591,000.57",
"$8,596,626.45", "$8,713,417.54", "$85,876,834.41", "$873,748.40",
"$9,262,889.69", "$9,731,658.36", "$9,991,440.81", "$91,781,453.44",
"$92,299.72", "$95,677,012.68", "$983,780.33"), class = "factor")), class = "data.frame", row.names = c(NA,
-152L))
我正在尝试按“水果类别”(Fruit Category)对总值(Gross Value)求和,为此使用了以下代码:
# Sum Gross.Value within each Fruit.Category.
fruit %>%
# Drop every character that is not a digit or a dot (e.g. "$" and ","),
# then parse the remaining text as numeric.
mutate(Gross.Value = as.numeric(gsub("[^0-9.]", "", as.character(Gross.Value)))) %>%
# Groups are formed from the raw Fruit.Category labels; no whitespace
# trimming is applied to them in this pipeline.
group_by(Fruit.Category) %>%
# na.rm=TRUE ignores NA values (for instance, those produced from empty
# strings by the numeric conversion above) when summing.
summarize(Gross.Value = sum(Gross.Value, na.rm=TRUE))
但是,这会导致输出看起来像这样:
A tibble: 11 x 2
Fruit.Category Gross.Value
<fct> <dbl>
1 " Grapes " 0
2 " OrchardStoneFruit " 0
3 " OtherFruit " 0
4 " PomeFruit " 0
5 " CitrusFruit " 501345814.
6 " CitrusFruit " 0
7 " Grapes " 1048709022.
8 " Grapes " 0
9 " OrchardStoneFruit " 679997807.
10 " OtherFruit " 879348015.
11 " PomeFruit " 683012047.
如何更改输出,以便删除引号以及所有前导或尾随空格?本质上,只是想把文本整理干净。
此外,如果有人能就如何按总值降序显示输出给出建议,我将不胜感激。我知道的唯一方法是在代码末尾添加:
%>% arrange(desc(n))
但是,这似乎不起作用。
答案 0 :(得分:3)
这是上一个问题的延续 :-)
# Clean the text columns, convert Gross.Value to numeric, then total it per
# Fruit.Category and sort the totals from largest to smallest.
fruit %>%
# For every factor or character column: convert to character and strip
# leading/trailing whitespace.
mutate_if(~is.factor(.) | is.character(.), ~trimws(as.character(.))) %>%
# Remove everything except digits and dots, then parse as numeric.
mutate(Gross.Value = as.numeric(gsub("[^0-9.]", "", Gross.Value))) %>%
group_by(Fruit.Category) %>%
# na.rm=TRUE ignores NA values when summing.
summarize(Gross.Value = sum(Gross.Value, na.rm=TRUE)) %>%
# Order rows by the summed Gross.Value, descending.
arrange(desc(Gross.Value))
# # A tibble: 5 x 2
# Fruit.Category Gross.Value
# <chr> <dbl>
# 1 Grapes 1048709022.
# 2 OtherFruit 879348015.
# 3 PomeFruit 683012047.
# 4 OrchardStoneFruit 679997807.
# 5 CitrusFruit 501345814.
由于我们在汇总之前先修剪掉了多余的空格,那些仅因空格不同而被误判为不同的类别就被合并到了一起。
答案的实质在管道的第一行:
mutate_if(~is.factor(.) | is.character(.), ~trimws(as.character(.))) %>%
mutate_if 的意思是“对满足特定条件的所有列进行变换”。在这种情况下,我将其限制为 character 列或类似字符的 factor 列(因为把已经是数值的列转换为 character 并不好)。
在此之后,只需再加上 arrange(desc(Gross.Value)) 即可。(不确定 arrange(desc(n)) 是从哪里来的……)