下面是数据。
我想使用以下公式创建另一个变量:
Index_Value = 9.5 * sd(v)*(v-mvd)
其中 v 为 Value,sd(v) 为同一 District 内 Value 的标准差,mvd 为同一 District 内 Value 的均值(原文此处的定义缺失,按下方答案的计算方式补全)。
我想知道用于创建新变量的r代码,这对我来说有点难度。
ID District Value
9003050 B 43500
8377717 E 83000
8377719 E 50000
8377720 E 82000
8377722 E 500
8377725 E 82000
8377729 E 89000
8378166 E 39000
8378169 E 136800
8378173 E 62000
8378178 E 75900
8378179 E 1950
8378182 E 27000
8378183 E 4000
8394732 A 283500
8394733 A 32000
8394735 C 16300
8394737 C 66700
8394738 C 53500
8394742 C 66000
8394745 C 31500
8394746 C 55000
8410620 D 50000
8410621 D 48000
8410623 D 205100
8410625 D 45000
8410627 D 41200
8410631 D 21500
8410927 A 4500
8410931 A 8000
8410932 A 15500
8410934 A 2000
9007646 B 31000
9007648 B 17000
9007651 B 103000
9007654 B 10000
9007656 B 30000
9007657 B 52000
9007659 B 28100
9008060 B 205000
9008065 B 14000
9008067 B 17000
9008071 B 5600
9008074 B 19000
9008077 B 9200
9008080 B 9000
9008084 B 109000
9008087 B 93000
9008089 B 80000
9008091 B 9000
9898771 A 28000
9898777 C 83000
9898779 C 11500
9898780 C 1500
9898781 C 1500
9898782 C 2000
9898783 C 1500
9898784 C 1500
9898785 C 2500
9898786 D 500
9898787 D 10800
9898788 D 500
9898789 D 38000
9898791 D 1500
9898792 D 18000
9898793 D 4000
9898794 D 3000
9898829 A 1500
8415275 D 300000
8377893 E 3000
8377912 E 3000
8378022 E 228000
8378023 E 300000
8379506 E 1000
8379507 E 50000
8379508 E 129000
8379509 E 540000
8379511 E 293000
8380087 E 0
8380100 E 73000
8380107 E 11500
8380108 E 800
8380110 E 100000
8386212 E 50000
8386214 E 26000
8386217 E 3000
8386219 E 4000
8386229 E 3000
8394347 A 560000
8394348 A 335500
8394349 C 120000
8394351 C 210500
8394352 C 105500
8394353 C 105500
8394354 C 50000
8394355 C 50000
8395148 A 16000
8395151 A 312000
8395154 C 200000
8395159 A 5000
8395164 A 4000
8395173 A 2600
8395185 C 5400
8395191 C 700
8398108 C 6000
8398131 C 7000
8398134 C 1000
8398139 C 408000
8398150 A 3000
8398159 C 207000
8398170 A 2000
8410267 A 120500
8410269 A 53000
8410271 A 153000
8410272 A 1000
8410274 A 15000
8410275 A 227000
8410276 A 123000
8410277 A 50000
8410278 A 10000
8410569 D 26000
8410571 D 104000
8410572 D 50000
8410573 D 1000
8410575 D 14000
8410584 D 50000
8410585 D 20000
8410586 D 5000
8410587 D 50000
8415282 D 100000
8415283 D 12000
8415285 D 1000
8415289 D 300000
8415295 D 2000
9007548 B 100000
9007550 B 0
9007553 B 5000
9007555 B 0
9007557 B 50500
9007560 B 150000
9007562 B 50500
9007565 B 50000
9007569 B 50000
9009540 D 10000
8410289 60000
8410290 A 0
8410291 A 0
8410293 A 0
8410295 A 0
8410296 A 300000
8410297 A 80000
答案 0 :(得分:0)
使用 dplyr:
library(dplyr)
# Within each District, multiply every Value's deviation from the District
# mean by 9.5 times the District standard deviation and store the product as
# Index_Value. The result is printed rather than assigned, and it stays
# grouped by District.
df %>%
  group_by(District) %>%
  mutate(
    Index_Value = 9.5 * sd(Value) * (Value - mean(Value))
  )
**结果:**
# A tibble: 151 x 4
# Groups: District [5]
ID District Value Index_Value
<int> <fctr> <int> <dbl>
1 9003050 B 43500 -2065363210
2 8377717 E 83000 3591979346
3 8377719 E 50000 -32688558091
4 8377720 E 82000 2492569120
5 8377722 E 500 -87109364245
6 8377725 E 82000 2492569120
7 8377729 E 89000 10188440698
8 8378166 E 39000 -44782070569
9 8378169 E 136800 62740249469
10 8378173 E 62000 -19495635387
# ... with 141 more rows
**数据:**
# Example data used by the answer, written out as a literal so the code can be
# run as-is. `df` is a data.frame with 151 rows and three columns:
#   ID       - integer identifier
#   District - factor with levels "A", "B", "C", "D", "E" (stored as codes 1-5)
#   Value    - integer value that the index is computed from
df = structure(list(ID = c(9003050L, 8377717L, 8377719L, 8377720L,
8377722L, 8377725L, 8377729L, 8378166L, 8378169L, 8378173L, 8378178L,
8378179L, 8378182L, 8378183L, 8394732L, 8394733L, 8394735L, 8394737L,
8394738L, 8394742L, 8394745L, 8394746L, 8410620L, 8410621L, 8410623L,
8410625L, 8410627L, 8410631L, 8410927L, 8410931L, 8410932L, 8410934L,
9007646L, 9007648L, 9007651L, 9007654L, 9007656L, 9007657L, 9007659L,
9008060L, 9008065L, 9008067L, 9008071L, 9008074L, 9008077L, 9008080L,
9008084L, 9008087L, 9008089L, 9008091L, 9898771L, 9898777L, 9898779L,
9898780L, 9898781L, 9898782L, 9898783L, 9898784L, 9898785L, 9898786L,
9898787L, 9898788L, 9898789L, 9898791L, 9898792L, 9898793L, 9898794L,
9898829L, 8415275L, 8377893L, 8377912L, 8378022L, 8378023L, 8379506L,
8379507L, 8379508L, 8379509L, 8379511L, 8380087L, 8380100L, 8380107L,
8380108L, 8380110L, 8386212L, 8386214L, 8386217L, 8386219L, 8386229L,
8394347L, 8394348L, 8394349L, 8394351L, 8394352L, 8394353L, 8394354L,
8394355L, 8395148L, 8395151L, 8395154L, 8395159L, 8395164L, 8395173L,
8395185L, 8395191L, 8398108L, 8398131L, 8398134L, 8398139L, 8398150L,
8398159L, 8398170L, 8410267L, 8410269L, 8410271L, 8410272L, 8410274L,
8410275L, 8410276L, 8410277L, 8410278L, 8410569L, 8410571L, 8410572L,
8410573L, 8410575L, 8410584L, 8410585L, 8410586L, 8410587L, 8415282L,
8415283L, 8415285L, 8415289L, 8415295L, 9007548L, 9007550L, 9007553L,
9007555L, 9007557L, 9007560L, 9007562L, 9007565L, 9007569L, 9009540L,
8410289L, 8410290L, 8410291L, 8410293L, 8410295L, 8410296L, 8410297L
), District = structure(c(2L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L,
4L, 4L, 4L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 1L, 4L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 1L,
1L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 3L, 1L, 1L, 1L, 3L, 3L, 3L,
3L, 3L, 3L, 1L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 4L, 4L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("A",
"B", "C", "D", "E"), class = "factor"), Value = c(43500L, 83000L,
50000L, 82000L, 500L, 82000L, 89000L, 39000L, 136800L, 62000L,
75900L, 1950L, 27000L, 4000L, 283500L, 32000L, 16300L, 66700L,
53500L, 66000L, 31500L, 55000L, 50000L, 48000L, 205100L, 45000L,
41200L, 21500L, 4500L, 8000L, 15500L, 2000L, 31000L, 17000L,
103000L, 10000L, 30000L, 52000L, 28100L, 205000L, 14000L, 17000L,
5600L, 19000L, 9200L, 9000L, 109000L, 93000L, 80000L, 9000L,
28000L, 83000L, 11500L, 1500L, 1500L, 2000L, 1500L, 1500L, 2500L,
500L, 10800L, 500L, 38000L, 1500L, 18000L, 4000L, 3000L, 1500L,
300000L, 3000L, 3000L, 228000L, 300000L, 1000L, 50000L, 129000L,
540000L, 293000L, 0L, 73000L, 11500L, 800L, 100000L, 50000L,
26000L, 3000L, 4000L, 3000L, 560000L, 335500L, 120000L, 210500L,
105500L, 105500L, 50000L, 50000L, 16000L, 312000L, 200000L, 5000L,
4000L, 2600L, 5400L, 700L, 6000L, 7000L, 1000L, 408000L, 3000L,
207000L, 2000L, 120500L, 53000L, 153000L, 1000L, 15000L, 227000L,
123000L, 50000L, 10000L, 26000L, 104000L, 50000L, 1000L, 14000L,
50000L, 20000L, 5000L, 50000L, 100000L, 12000L, 1000L, 300000L,
2000L, 100000L, 0L, 5000L, 0L, 50500L, 150000L, 50500L, 50000L,
50000L, 10000L, 60000L, 0L, 0L, 0L, 0L, 300000L, 80000L)), .Names = c("ID",
"District", "Value"), class = "data.frame", row.names = c(NA,
-151L))
答案 1 :(得分:-1)
所提供的数据格式不规整,很难直接读入。
# Read the question's table into a data frame with columns ID, District, Value.
# NOTE: the right-hand side below is a placeholder, not valid R. Replace it
# with a character string containing the table text from the question.
aa <- [data as text in question]
# Parse the text as space-separated fields; the first line is the header.
df <- read.delim(text = aa, sep = ' ', header = TRUE, stringsAsFactors = FALSE)
# Keep only parsed columns 1, 2 and 5 (presumably the remaining columns are
# empty fields caused by repeated spaces in the pasted text -- TODO confirm).
df <- df[,c(1,2,5)]
colnames(df) <- c('ID','District','Value')
# Row 145 is missing a District, so that row is dropped.
df <- df[-145,]
# Per-District lookup table: one row per District with the mean and the
# standard deviation of Value. Printed below for inspection.
mean_sd_df <- df %>%
  group_by(District) %>%
  summarise(
    mean = mean(Value),
    sd = sd(Value)
  )
mean_sd_df
# # A tibble: 5 x 3
# District mean sd
# <chr> <dbl> <dbl>
# A 85862.50 136433.66
# B 47871.43 49733.55
# C 66807.14 92979.60
# D 51070.00 79781.11
# E 79732.81 115727.39
# Compute Index_Value = 9.5 * sd * (Value - mean), where sd and mean are the
# statistics of the row's District taken from mean_sd_df.
#
# match() finds, for every row of df, the position of its District in
# mean_sd_df, so the whole column is computed in one vectorised step. This
# replaces a row-by-row loop that
#   * iterated over 1:nrow(df), which misbehaves when df has zero rows,
#   * subset the tibble with [rows, "col"], which returns a 1x1 tibble and
#     therefore filled Index_Value as a list column rather than a numeric one,
#   * rescanned mean_sd_df twice for every row.
# Rows whose District is absent from mean_sd_df get NA.
stats_row <- match(df$District, mean_sd_df[["District"]])
df$Index_Value <- 9.5 * mean_sd_df[["sd"]][stats_row] *
  (df$Value - mean_sd_df[["mean"]][stats_row])
head(df)
# ID District Value Index_Value
# 9003050 B 43500 -2065363210
# 8377717 E 83000 3591979346
# 8377719 E 50000 -32688558091
# 8377720 E 82000 2492569120
# 8377722 E 500 -87109364245
# 8377725 E 82000 2492569120
要进行验证,请看 df 的第 1 行:
9.5 * 49733.55 * (43500 - 47871.43) = -2065363958.52675 [与 R 给出的结果略有差异,这是舍入误差:R 在计算时使用了未打印出来的全部小数位]