整理文本以删除输出中不需要的字符

时间:2018-10-18 22:25:20

标签: r

假设数据框(data frame)存储为 fruit,其格式如下:

State           Fruit Category         Fruit Type         Gross Value
ACT             CitrusFruit            Mandarins          $4,500,000
ACT             CitrusFruit            Oranges            
NSW             PomeFruit              Apple              $139,130,203.50
NSW             Grapes                 Wine Production    $50,000,000
NSW             OrchardStoneFruit      Avocados           $10,031,123
QLD             CitrusFruit            Oranges

dput(fruit)的输出

structure(list(State = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 
3L, 3L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 7L, 7L, 7L, 8L, 8L, 
8L, 2L, 2L, 3L, 3L, 4L, 4L, 5L, 5L, 6L, 6L, 7L, 7L, 8L, 8L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 
4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 
6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 
2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 
6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 8L, 2L, 2L, 2L, 3L, 
3L, 3L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 7L, 7L, 7L, 8L, 8L, 
8L), .Label = c("ACT", "NSW", "NT", "QLD", "SA", "TAS", "VIC", 
"WA"), class = "factor"), Fruit.Category = structure(c(6L, 6L, 
6L, 8L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 4L, 4L, 
4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 
5L, 5L, 5L, 5L, 5L, 5L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 
7L, 7L, 7L, 7L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 
9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 
9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 
9L, 9L, 9L, 9L, 9L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 
10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 
10L, 10L, 10L, 10L, 10L, 10L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 
11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 
11L), .Label = c("  Grapes  ", "  OrchardStoneFruit  ", "  OtherFruit  ", 
"  PomeFruit  ", " CitrusFruit ", " CitrusFruit  ", " Grapes ", 
" Grapes  ", " OrchardStoneFruit ", " OtherFruit ", " PomeFruit "
), class = "factor"), Fruit.Type = structure(c(5L, 8L, 13L, 18L, 
31L, 2L, 4L, 6L, 7L, 9L, 14L, 17L, 3L, 11L, 12L, 15L, 1L, 10L, 
16L, 13L, 23L, 26L, 13L, 23L, 26L, 13L, 23L, 26L, 13L, 23L, 26L, 
13L, 23L, 26L, 13L, 23L, 26L, 13L, 23L, 26L, 18L, 31L, 18L, 31L, 
18L, 31L, 18L, 31L, 18L, 31L, 18L, 31L, 18L, 31L, 14L, 17L, 20L, 
22L, 24L, 25L, 27L, 14L, 17L, 20L, 22L, 24L, 25L, 27L, 14L, 17L, 
20L, 22L, 24L, 25L, 27L, 14L, 17L, 20L, 22L, 24L, 25L, 27L, 14L, 
17L, 20L, 22L, 24L, 25L, 27L, 14L, 17L, 20L, 22L, 24L, 25L, 27L, 
14L, 17L, 20L, 22L, 24L, 25L, 27L, 15L, 21L, 29L, 30L, 15L, 21L, 
29L, 30L, 15L, 21L, 29L, 30L, 15L, 21L, 29L, 30L, 15L, 21L, 29L, 
30L, 15L, 21L, 29L, 30L, 15L, 21L, 29L, 30L, 16L, 19L, 28L, 16L, 
19L, 28L, 16L, 19L, 28L, 16L, 19L, 28L, 16L, 19L, 28L, 16L, 19L, 
28L, 16L, 19L, 28L), .Label = c("  Apples  ", "  Avocados  ", 
"  Bananas  ", "  Cherries  ", "  Mandarins  ", "  Mangoes  ", 
"  Nectarines  ", "  Oranges  ", "  Peaches  ", "  Pears  ", 
"  Pineapples  ", "  Strawberries  ", " AllOtherCitrusFruit ", 
" AllOtherOrchardFruit ", " AllOtherOtherFruit ", " AllOtherPomeFruit ", 
" AllOtherStoneFruit ", " AllOtherUses ", " Apples ", " Avocados ", 
" Bananas ", " Cherries ", " Mandarins ", " Mangoes ", " Nectarines ", 
" Oranges ", " Peaches ", " Pears ", " Pineapples ", " Strawberries ", 
" WineProduction "), class = "factor"), Gross.Value = structure(c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 60L, 97L, 23L, 104L, 1L, 1L, 56L, 98L, 36L, 101L, 68L, 
11L, 1L, 1L, 1L, 91L, 96L, 57L, 99L, 92L, 21L, 71L, 29L, 48L, 
1L, 76L, 51L, 46L, 58L, 1L, 34L, 37L, 14L, 22L, 70L, 18L, 59L, 
28L, 32L, 41L, 83L, 61L, 69L, 30L, 1L, 1L, 26L, 1L, 1L, 25L, 
35L, 19L, 2L, 80L, 9L, 8L, 7L, 102L, 47L, 31L, 1L, 85L, 75L, 
1L, 88L, 93L, 52L, 1L, 66L, 50L, 100L, 43L, 89L, 95L, 2L, 82L, 
65L, 5L, 24L, 94L, 33L, 64L, 10L, 90L, 78L, 84L, 62L, 3L, 86L, 
20L, 73L, 1L, 38L, 67L, 72L, 15L, 63L, 1L, 1L, 39L, 17L, 1L, 
1L, 16L, 40L, 1L, 1L, 103L, 79L, 49L, 1L, 44L, 6L, 105L, 53L, 
1L, 1L, 1L, 1L, 81L, 54L, 27L, 87L, 13L, 1L, 55L, 106L, 4L, 42L, 
12L, 45L, 77L, 74L), .Label = c("", "$0.00", "$1,025,861.63", 
"$1,107,476.82", "$1,135,055.74", "$1,148,385.97", "$1,514,089.93", 
"$1,539,762.85", "$1,565,234.83", "$10,469,580.98", "$100,622,922.20", 
"$106,039,956.40", "$11,648,561.35", "$113,930,475.80", "$114,195,162.80", 
"$12,169,338.44", "$12,492,792.64", "$12,843,528.01", "$120,877,197.60", 
"$13,245.08", "$13,331,668.11", "$13,981,075.51", "$130,258,416.50", 
"$14,203,578.43", "$14,697,408.09", "$15,085,825.24", "$15,196.71", 
"$15,246,349.76", "$154,858,589.30", "$168,325.78", "$17,661,100.37", 
"$18,278,371.16", "$188,414.59", "$19,896,312.15", "$2,370,402.03", 
"$2,557,589.86", "$209,648,663.50", "$21,426,350.11", "$22,482,034.46", 
"$23,929,331.35", "$238,668.61", "$249,675,376.10", "$26,669,599.23", 
"$27,540,236.71", "$270,903.84", "$3,485,520.14", "$3,520,605.89", 
"$3,659,706.68", "$3,829,198.67", "$301,644.66", "$301,976.25", 
"$31,133,715.88", "$313,144.86", "$334,363.30", "$35,212,772.81", 
"$37,927,507.70", "$38,989,343.33", "$385,858,491.60", "$4,447,813.26", 
"$4,549,208.46", "$4,569,373.00", "$4,702.20", "$4,712,329.56", 
"$4,995,833.14", "$40,133,037.39", "$40,481.05", "$435,712,531.70", 
"$44,434,103.55", "$443,017.10", "$45,665,029.35", "$45,888,545.67", 
"$46,638,011.92", "$47,589.51", "$5,793,841.42", "$5,854,982.37", 
"$51,534,636.09", "$53,367,548.56", "$53,377,925.45", "$555,799.71", 
"$57,522,144.94", "$57,930,562.37", "$58,316,912.75", "$6,170,170.78", 
"$6,791,088.95", "$6,824,520.08", "$623,030.52", "$63,493,163.21", 
"$664,237.23", "$7,066,407.60", "$7,168,380.92", "$7,364,245.36", 
"$7,426,224.28", "$7,894.54", "$70,218,810.35", "$76,591,000.57", 
"$8,596,626.45", "$8,713,417.54", "$85,876,834.41", "$873,748.40", 
"$9,262,889.69", "$9,731,658.36", "$9,991,440.81", "$91,781,453.44", 
"$92,299.72", "$95,677,012.68", "$983,780.33"), class = "factor")), class = "data.frame", row.names = c(NA, 
-152L))

我正在尝试按“水果类别”(Fruit.Category)对总值(Gross.Value)求和,为此使用了以下代码:

# Sum Gross.Value for each Fruit.Category.
fruit %>%
  # Strip every character except digits and "." (such as "$" and ","), then
  # parse the remainder as a number; blank entries become NA.
  mutate(Gross.Value = as.numeric(gsub("[^0-9.]", "", as.character(Gross.Value)))) %>%
  # Groups on the raw factor labels, so labels that differ only in their
  # surrounding spaces are treated as separate categories.
  group_by(Fruit.Category) %>%
  # na.rm=TRUE skips the NA values that come from blank Gross.Value entries.
  summarize(Gross.Value = sum(Gross.Value, na.rm=TRUE))

但是,这会导致输出看起来像这样:

  A tibble: 11 x 2
    Fruit.Category          Gross.Value
    <fct>                         <dbl>
  1 "  Grapes  "                     0 
  2 "  OrchardStoneFruit  "          0 
  3 "  OtherFruit  "                 0 
  4 "  PomeFruit  "                  0 
  5 " CitrusFruit "          501345814.
  6 " CitrusFruit  "                 0 
  7 " Grapes "              1048709022.
  8 " Grapes  "                      0 
  9 " OrchardStoneFruit "    679997807.
 10 " OtherFruit "           879348015.
 11 " PomeFruit "            683012047.

如何更改输出,才能去掉引号以及所有前导或尾随的空格?本质上,只是想把文本整理干净。

此外,如果能就如何按总值降序显示输出给出建议,我将不胜感激。我知道的唯一方法是在代码末尾添加:

%>% arrange(desc(n))

但是,这似乎不起作用。

1 个答案:

答案 0 :(得分:3)

接着你上一个问题继续 :-)

# Sum Gross.Value for each whitespace-trimmed Fruit.Category, largest total first.
fruit %>%
  # Convert every factor or character column to character and trim leading and
  # trailing whitespace. Gross.Value is a factor too, so it also becomes character here.
  mutate_if(~is.factor(.) | is.character(.), ~trimws(as.character(.))) %>%
  # Strip every character except digits and "." (such as "$" and ","), then
  # parse as a number; blank entries become NA.
  mutate(Gross.Value = as.numeric(gsub("[^0-9.]", "", Gross.Value))) %>%
  # Grouping now uses the trimmed labels, so variants that differed only in
  # spacing fall into the same group.
  group_by(Fruit.Category) %>%
  # na.rm=TRUE skips the NA values that come from blank Gross.Value entries.
  summarize(Gross.Value = sum(Gross.Value, na.rm=TRUE)) %>%
  # Sort by the summed column in descending order.
  arrange(desc(Gross.Value))
# # A tibble: 5 x 2
#   Fruit.Category    Gross.Value
#   <chr>                   <dbl>
# 1 Grapes            1048709022.
# 2 OtherFruit         879348015.
# 3 PomeFruit          683012047.
# 4 OrchardStoneFruit  679997807.
# 5 CitrusFruit        501345814.

由于我们在汇总之前先去掉了多余的空格,那些仅因空格不同而被误当作不同类别的值(例如 " Grapes " 与 "  Grapes  ")就会被合并到一起。

答案的实质在管道的第一行:

  mutate_if(~is.factor(.) | is.character(.), ~trimws(as.character(.))) %>%

mutate_if 的意思是“修改所有满足特定条件的列”。在这里,我把条件限制为 character 列或类似字符的 factor 列(因为把本来就是数值的列转换成 character 并不好)。

在此基础上,最后再加上 arrange(desc(Gross.Value)) 即可。(不确定 arrange(desc(n)) 是从哪里来的……)