从现有变量创建新变量

时间:2017-10-25 15:00:53

标签: r dataframe

下面是数据。

  • ID 是为每个单位指定的识别号,
  • District(区域)是获取数据的区域的代码,
  • Value(值)是感兴趣的变量。

我想使用以下公式创建另一个变量:

  

Index_Value = 9.5 * sd(v)*(v-mvd)

其中

  • sd(v) = 该行数据所属区域内 Value 的标准差,
  • v = 该行的 Value,
  • mvd = 该行数据所属区域内 Value 的平均值。

我想知道用于创建新变量的r代码,这对我来说有点难度。

ID  District    Value
9003050 B   43500
8377717 E   83000
8377719 E   50000
8377720 E   82000
8377722 E   500
8377725 E   82000
8377729 E   89000
8378166 E   39000
8378169 E   136800
8378173 E   62000
8378178 E   75900
8378179 E   1950
8378182 E   27000
8378183 E   4000
8394732 A   283500
8394733 A   32000
8394735 C   16300
8394737 C   66700
8394738 C   53500
8394742 C   66000
8394745 C   31500
8394746 C   55000
8410620 D   50000
8410621 D   48000
8410623 D   205100
8410625 D   45000
8410627 D   41200
8410631 D   21500
8410927 A   4500
8410931 A   8000
8410932 A   15500
8410934 A   2000
9007646 B   31000
9007648 B   17000
9007651 B   103000
9007654 B   10000
9007656 B   30000
9007657 B   52000
9007659 B   28100
9008060 B   205000
9008065 B   14000
9008067 B   17000
9008071 B   5600
9008074 B   19000
9008077 B   9200
9008080 B   9000
9008084 B   109000
9008087 B   93000
9008089 B   80000
9008091 B   9000
9898771 A   28000
9898777 C   83000
9898779 C   11500
9898780 C   1500
9898781 C   1500
9898782 C   2000
9898783 C   1500
9898784 C   1500
9898785 C   2500
9898786 D   500
9898787 D   10800
9898788 D   500
9898789 D   38000
9898791 D   1500
9898792 D   18000
9898793 D   4000
9898794 D   3000
9898829 A   1500
8415275 D   300000
8377893 E   3000
8377912 E   3000
8378022 E   228000
8378023 E   300000
8379506 E   1000
8379507 E   50000
8379508 E   129000
8379509 E   540000
8379511 E   293000
8380087 E   0
8380100 E   73000
8380107 E   11500
8380108 E   800
8380110 E   100000
8386212 E   50000
8386214 E   26000
8386217 E   3000
8386219 E   4000
8386229 E   3000
8394347 A   560000
8394348 A   335500
8394349 C   120000
8394351 C   210500
8394352 C   105500
8394353 C   105500
8394354 C   50000
8394355 C   50000
8395148 A   16000
8395151 A   312000
8395154 C   200000
8395159 A   5000
8395164 A   4000
8395173 A   2600
8395185 C   5400
8395191 C   700
8398108 C   6000
8398131 C   7000
8398134 C   1000
8398139 C   408000
8398150 A   3000
8398159 C   207000
8398170 A   2000
8410267 A   120500
8410269 A   53000
8410271 A   153000
8410272 A   1000
8410274 A   15000
8410275 A   227000
8410276 A   123000
8410277 A   50000
8410278 A   10000
8410569 D   26000
8410571 D   104000
8410572 D   50000
8410573 D   1000
8410575 D   14000
8410584 D   50000
8410585 D   20000
8410586 D   5000
8410587 D   50000
8415282 D   100000
8415283 D   12000
8415285 D   1000
8415289 D   300000
8415295 D   2000
9007548 B   100000
9007550 B   0
9007553 B   5000
9007555 B   0
9007557 B   50500
9007560 B   150000
9007562 B   50500
9007565 B   50000
9007569 B   50000
9009540 D   10000
8410289     60000
8410290 A   0
8410291 A   0
8410293 A   0
8410295 A   0
8410296 A   300000
8410297 A   80000

2 个答案:

答案 0 :(得分:0)

使用 dplyr 可以轻松完成此操作:

library(dplyr)

# Within each District, multiply every row's deviation from the district mean
# of Value by 9.5 times the district standard deviation of Value. The result is
# not assigned, so it is printed as a tibble that is still grouped by District.
mutate(
  group_by(df, District),
  Index_Value = 9.5 * sd(Value) * (Value - mean(Value))
)

结果:

# A tibble: 151 x 4
# Groups:   District [5]
        ID District  Value  Index_Value
     <int>   <fctr>  <int>        <dbl>
 1 9003050        B  43500  -2065363210
 2 8377717        E  83000   3591979346
 3 8377719        E  50000 -32688558091
 4 8377720        E  82000   2492569120
 5 8377722        E    500 -87109364245
 6 8377725        E  82000   2492569120
 7 8377729        E  89000  10188440698
 8 8378166        E  39000 -44782070569
 9 8378169        E 136800  62740249469
10 8378173        E  62000 -19495635387
# ... with 141 more rows

数据:

# Reproducible copy of the example data in dput() form: a data.frame with
# 151 rows and three columns -- ID (integer), District (factor with levels
# "A" to "E", stored as integer codes 1 to 5) and Value (integer).
# NOTE(review): row 145 (ID 8410289, Value 60000) is coded 4L, i.e. "D", here,
# whereas the question's table shows no district for that ID -- confirm that
# assigning it to "D" is intended.
df = structure(list(ID = c(9003050L, 8377717L, 8377719L, 8377720L, 
8377722L, 8377725L, 8377729L, 8378166L, 8378169L, 8378173L, 8378178L, 
8378179L, 8378182L, 8378183L, 8394732L, 8394733L, 8394735L, 8394737L, 
8394738L, 8394742L, 8394745L, 8394746L, 8410620L, 8410621L, 8410623L, 
8410625L, 8410627L, 8410631L, 8410927L, 8410931L, 8410932L, 8410934L, 
9007646L, 9007648L, 9007651L, 9007654L, 9007656L, 9007657L, 9007659L, 
9008060L, 9008065L, 9008067L, 9008071L, 9008074L, 9008077L, 9008080L, 
9008084L, 9008087L, 9008089L, 9008091L, 9898771L, 9898777L, 9898779L, 
9898780L, 9898781L, 9898782L, 9898783L, 9898784L, 9898785L, 9898786L, 
9898787L, 9898788L, 9898789L, 9898791L, 9898792L, 9898793L, 9898794L, 
9898829L, 8415275L, 8377893L, 8377912L, 8378022L, 8378023L, 8379506L, 
8379507L, 8379508L, 8379509L, 8379511L, 8380087L, 8380100L, 8380107L, 
8380108L, 8380110L, 8386212L, 8386214L, 8386217L, 8386219L, 8386229L, 
8394347L, 8394348L, 8394349L, 8394351L, 8394352L, 8394353L, 8394354L, 
8394355L, 8395148L, 8395151L, 8395154L, 8395159L, 8395164L, 8395173L, 
8395185L, 8395191L, 8398108L, 8398131L, 8398134L, 8398139L, 8398150L, 
8398159L, 8398170L, 8410267L, 8410269L, 8410271L, 8410272L, 8410274L, 
8410275L, 8410276L, 8410277L, 8410278L, 8410569L, 8410571L, 8410572L, 
8410573L, 8410575L, 8410584L, 8410585L, 8410586L, 8410587L, 8415282L, 
8415283L, 8415285L, 8415289L, 8415295L, 9007548L, 9007550L, 9007553L, 
9007555L, 9007557L, 9007560L, 9007562L, 9007565L, 9007569L, 9009540L, 
8410289L, 8410290L, 8410291L, 8410293L, 8410295L, 8410296L, 8410297L
), District = structure(c(2L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 
5L, 5L, 5L, 5L, 5L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 
4L, 4L, 4L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 1L, 4L, 5L, 5L, 5L, 5L, 
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 1L, 
1L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 3L, 1L, 1L, 1L, 3L, 3L, 3L, 
3L, 3L, 3L, 1L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 4L, 
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 4L, 4L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("A", 
"B", "C", "D", "E"), class = "factor"), Value = c(43500L, 83000L, 
50000L, 82000L, 500L, 82000L, 89000L, 39000L, 136800L, 62000L, 
75900L, 1950L, 27000L, 4000L, 283500L, 32000L, 16300L, 66700L, 
53500L, 66000L, 31500L, 55000L, 50000L, 48000L, 205100L, 45000L, 
41200L, 21500L, 4500L, 8000L, 15500L, 2000L, 31000L, 17000L, 
103000L, 10000L, 30000L, 52000L, 28100L, 205000L, 14000L, 17000L, 
5600L, 19000L, 9200L, 9000L, 109000L, 93000L, 80000L, 9000L, 
28000L, 83000L, 11500L, 1500L, 1500L, 2000L, 1500L, 1500L, 2500L, 
500L, 10800L, 500L, 38000L, 1500L, 18000L, 4000L, 3000L, 1500L, 
300000L, 3000L, 3000L, 228000L, 300000L, 1000L, 50000L, 129000L, 
540000L, 293000L, 0L, 73000L, 11500L, 800L, 100000L, 50000L, 
26000L, 3000L, 4000L, 3000L, 560000L, 335500L, 120000L, 210500L, 
105500L, 105500L, 50000L, 50000L, 16000L, 312000L, 200000L, 5000L, 
4000L, 2600L, 5400L, 700L, 6000L, 7000L, 1000L, 408000L, 3000L, 
207000L, 2000L, 120500L, 53000L, 153000L, 1000L, 15000L, 227000L, 
123000L, 50000L, 10000L, 26000L, 104000L, 50000L, 1000L, 14000L, 
50000L, 20000L, 5000L, 50000L, 100000L, 12000L, 1000L, 300000L, 
2000L, 100000L, 0L, 5000L, 0L, 50500L, 150000L, 50500L, 50000L, 
50000L, 10000L, 60000L, 0L, 0L, 0L, 0L, 300000L, 80000L)), .Names = c("ID", 
"District", "Value"), class = "data.frame", row.names = c(NA, 
-151L))

答案 1 :(得分:-1)

所提供的数据读入起来有些困难。

# Placeholder, not runnable R: replace the bracketed text with the question's
# data table supplied as a single character string.
aa <- [data as text in question]

# Parse the text with a single space as the field separator, treating the
# first line as the header and keeping strings as character (not factor).
df <- read.delim(text = aa, sep = ' ', header = TRUE, stringsAsFactors = FALSE)

# Keep columns 1, 2 and 5 of the parsed result and give them the expected names.
# NOTE(review): presumably the skipped columns are empty fields created by the
# run of spaces between District and Value -- confirm against the raw text.
df <- df[,c(1,2,5)]
colnames(df) <- c('ID','District','Value')

# Line 145 is missing a District, so that row is dropped by position.
df <- df[-145,]

# Per-district summary table: one row per District holding the mean and the
# standard deviation of Value.
mean_sd_df <- df %>%
  group_by(District) %>%
  summarize(
    mean = mean(Value),
    sd = sd(Value)
  )

mean_sd_df

# # A tibble: 5 x 3
# District     mean        sd
# <chr>        <dbl>       <dbl>
# A            85862.50    136433.66
# B            47871.43    49733.55
# C            66807.14    92979.60
# D            51070.00    79781.11
# E            79732.81    115727.39

# Compute Index_Value = 9.5 * sd * (Value - mean) for every row, using the
# mean and sd of the row's own District from mean_sd_df.
#
# Indexing the summary tibble as mean_sd_df[cond, 'sd'] inside a row loop
# yields a 1 x 1 tibble rather than a scalar, so element-wise assignment ends
# up with a list column instead of a numeric vector, and the summary table is
# rescanned for every row. Here match() locates each row's district in the
# summary table once, and the statistics are pulled out with `$` so the result
# is a plain numeric column. Rows whose District is absent from mean_sd_df
# get NA.
district_row <- match(df$District, mean_sd_df$District)
df$Index_Value <- 9.5 * mean_sd_df$sd[district_row] *
  (df$Value - mean_sd_df$mean[district_row])

head(df)

# ID      District Value  Index_Value
# 9003050        B 43500  -2065363210
# 8377717        E 83000   3591979346
# 8377719        E 50000 -32688558091
# 8377720        E 82000   2492569120
# 8377722        E   500 -87109364245
# 8377725        E 82000   2492569120

要进行验证,可以手动计算 df 的第 1 行:

  

9.5 * 49733.55 * (43500 - 47871.43) = -2065363958.52675(与 R 给出的 -2065363210 略有出入,这属于舍入误差:R 计算时使用的是未打印出来的全部小数位)