根据其他列数据将新列添加到状态数据框

时间:2021-03-05 19:00:45

标签: r dataframe dplyr tidyverse

我正在尝试向 R 中的状态示例数据框添加一个新列。我希望此列将状态 ID 聚集到更广泛的类别 (1-4) 中。我的代码与我要查找的代码很接近,但我不太正确。我知道我可以逐行输入每个州 ID,但是有没有更快的方法?谢谢!

parameter = spark.read.option("multiline", "true").json("/at/dev_parameter.json")

kserver = parameter.select("broker")

ktopic = parameter.select("topicname")


df.selectExpr("CAST(id AS STRING) AS key", "to_json(struct(*)) AS value")
   
.write

   .format("kafka")

   .outputMode("append")

   .option("kafka.bootstrap.servers", "f"+ **kserver**)

   .option("topic", "josn_data_topic",**ktopic** )

   .save()

#添加一列来表示每个状态

library(tidyverse)

#为状态桶创建新变量

States=state.x77 
States=data.frame(States)
States <- tibble::rowid_to_column(States, "ID")
States

2 个答案:

答案 0 :(得分:1)

如果您的所有组都将使用连续的 cut 值,您可以使用 findIntervalID

findInterval(States$ID, c(0, 12, 24, 37, 51))
#  [1] 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 4 4 4 4 4 4 4 4 4 4 4 4 4 4

如果你想让它更详细一点,你可以在 dplyr::between 中使用 case_when

States %>%
  mutate(
    WAGE_BUCKET = case_when(
      between(ID, 1, 12) ~ "1",
      between(ID, 13, 24) ~ "2",
      between(ID, 25, 37) ~ "3",
      between(ID, 38, 50) ~ "4",
      TRUE ~ NA_character_)
  )
#    ID Population Income Illiteracy Life Exp Murder HS Grad Frost   Area WAGE_BUCKET
# 1   1       3615   3624        2.1    69.05   15.1    41.3    20  50708           1
# 2   2        365   6315        1.5    69.31   11.3    66.7   152 566432           1
# 3   3       2212   4530        1.8    70.55    7.8    58.1    15 113417           1
# 4   4       2110   3378        1.9    70.66   10.1    39.9    65  51945           1
# 5   5      21198   5114        1.1    71.71   10.3    62.6    20 156361           1
# 6   6       2541   4884        0.7    72.06    6.8    63.9   166 103766           1
# 7   7       3100   5348        1.1    72.48    3.1    56.0   139   4862           1
# 8   8        579   4809        0.9    70.06    6.2    54.6   103   1982           1
# 9   9       8277   4815        1.3    70.66   10.7    52.6    11  54090           1
# 10 10       4931   4091        2.0    68.54   13.9    40.6    60  58073           1
# 11 11        868   4963        1.9    73.60    6.2    61.9     0   6425           1
# 12 12        813   4119        0.6    71.87    5.3    59.5   126  82677           1
# 13 13      11197   5107        0.9    70.14   10.3    52.6   127  55748           2
# 14 14       5313   4458        0.7    70.88    7.1    52.9   122  36097           2
# 15 15       2861   4628        0.5    72.56    2.3    59.0   140  55941           2
# 16 16       2280   4669        0.6    72.58    4.5    59.9   114  81787           2
# 17 17       3387   3712        1.6    70.10   10.6    38.5    95  39650           2
# 18 18       3806   3545        2.8    68.76   13.2    42.2    12  44930           2
# 19 19       1058   3694        0.7    70.39    2.7    54.7   161  30920           2
# 20 20       4122   5299        0.9    70.22    8.5    52.3   101   9891           2
# 21 21       5814   4755        1.1    71.83    3.3    58.5   103   7826           2
# 22 22       9111   4751        0.9    70.63   11.1    52.8   125  56817           2
# 23 23       3921   4675        0.6    72.96    2.3    57.6   160  79289           2
# 24 24       2341   3098        2.4    68.09   12.5    41.0    50  47296           2
# 25 25       4767   4254        0.8    70.69    9.3    48.8   108  68995           3
# 26 26        746   4347        0.6    70.56    5.0    59.2   155 145587           3
# 27 27       1544   4508        0.6    72.60    2.9    59.3   139  76483           3
# 28 28        590   5149        0.5    69.03   11.5    65.2   188 109889           3
# 29 29        812   4281        0.7    71.23    3.3    57.6   174   9027           3
# 30 30       7333   5237        1.1    70.93    5.2    52.5   115   7521           3
# 31 31       1144   3601        2.2    70.32    9.7    55.2   120 121412           3
# 32 32      18076   4903        1.4    70.55   10.9    52.7    82  47831           3
# 33 33       5441   3875        1.8    69.21   11.1    38.5    80  48798           3
# 34 34        637   5087        0.8    72.78    1.4    50.3   186  69273           3
# 35 35      10735   4561        0.8    70.82    7.4    53.2   124  40975           3
# 36 36       2715   3983        1.1    71.42    6.4    51.6    82  68782           3
# 37 37       2284   4660        0.6    72.13    4.2    60.0    44  96184           3
# 38 38      11860   4449        1.0    70.43    6.1    50.2   126  44966           4
# 39 39        931   4558        1.3    71.90    2.4    46.4   127   1049           4
# 40 40       2816   3635        2.3    67.96   11.6    37.8    65  30225           4
# 41 41        681   4167        0.5    72.08    1.7    53.3   172  75955           4
# 42 42       4173   3821        1.7    70.11   11.0    41.8    70  41328           4
# 43 43      12237   4188        2.2    70.90   12.2    47.4    35 262134           4
# 44 44       1203   4022        0.6    72.90    4.5    67.3   137  82096           4
# 45 45        472   3907        0.6    71.64    5.5    57.1   168   9267           4
# 46 46       4981   4701        1.4    70.08    9.5    47.8    85  39780           4
# 47 47       3559   4864        0.6    71.72    4.3    63.5    32  66570           4
# 48 48       1799   3617        1.4    69.48    6.7    41.6   100  24070           4
# 49 49       4589   4468        0.7    72.48    3.0    54.5   149  54464           4
# 50 50        376   4566        0.6    70.29    6.9    62.9   173  97203           4

答案 1 :(得分:0)

它是长度 > 1 的 vector。比较运算符适用于单个向量。我们可以使用 between

library(dplyr)
States <- States %>% 
   mutate(WAGE_BUCKET=case_when(between(ID, 1, 12) ~ '1',
                           between(ID, 13,24) ~ '2',
                           between(ID, 25,37) ~ '3',
                           between(ID, 38,50) ~ '4',
                          TRUE ~ NA_character_))

或者另一种选择是将 &><= 结合使用

States %>% 
  mutate(WAGE_BUCKET=case_when(ID >= 1 & ID <=12  ~ '1',
                           ID >= 13 & ID <= 24) ~ '2',
                           ID >= 25 & ID <= 37 ~ '3',
                           ID >= 38 & ID <= 50 ~ '4',
                          TRUE ~ NA_character))

或者可能是使用 %in%

的 OP
States %>% 
  mutate(WAGE_BUCKET=case_when(ID %in% c(1,12) ~ '1',
                               ID  %in% c(13,24) ~ '2',
                               ID %in% c(25,37) ~ '3',
                               ID %in% c(38,50) ~ '4',
                              TRUE ~ NA_character_))