我从 CSV 文件创建了三个数据表,这些 CSV 文件位于此公共存储桶中:https://console.cloud.google.com/storage/browser/securitas
data_total = "source_year", "state", "is_male", "child_race", "weight_pounds"(超过 100 万行)
# Lookup table mapping the numeric child_race codes to their text labels.
race <- data.frame(
  id_race = c(1, 2, 3, 4, 5, 6, 7, 9, 18, 28, 39, 48),
  race = c(
    "white", "black", "american indian", "chinese", "japanese", "hawaiian",
    "filipino", "unknown/other", "asian indian", "korean", "samoan",
    "vietnamese"
  ),
  stringsAsFactors = FALSE
)
# Lookup table mapping the logical is_male flag to a text label.
sex <- data.frame(
  id_sex = c(FALSE, TRUE),
  de_sex = c("female", "male"),
  stringsAsFactors = FALSE
)
我需要使用以下各列创建一个新的数据表: -州 -每个州70年代的总出生人数 -每个州80年代的总出生人数 -每个州90年代的总出生人数 -每个州00年代的总出生人数 -每个州70年代出生人数最多的种族(文字) -每个州80年代出生人数最多的种族(文字) -每个州90年代出生人数最多的种族(文字) -每个州00年代出生人数最多的种族(文字) -每个州的男性总数 -每个州的女性总数 -每个州的平均体重(公斤)
我使用以下命令创建 data_total 数据表:
# Read every CSV in the working directory whose name starts with "natalidad",
# keeping only the five columns used later, and stack them into one table.
# list.files() interprets `pattern` as a regular expression, not a shell glob:
# "natalidad*" would match any file name containing "natalida", so the prefix
# is anchored explicitly instead.
natalidad_files <- list.files(path = ".", pattern = "^natalidad")
data_total <- rbindlist(lapply(
  natalidad_files,
  data.table::fread,
  header = TRUE,
  verbose = FALSE,
  sep = ",",
  select = c(
    "source_year", "state", "is_male", "child_race", "weight_pounds"
  )
))
然后创建新的数据表,以便之后合并:
# Per-state birth counts for each decade. Each result is keyed by `state` and
# has a single count column named after the decade (b70, b80, b90, b00).
b70 <- data_total[source_year %between% c(1970, 1979), .(b70 = .N),
                  keyby = state]
b80 <- data_total[source_year %between% c(1980, 1989), .(b80 = .N),
                  keyby = state]
b90 <- data_total[source_year %between% c(1990, 1999), .(b90 = .N),
                  keyby = state]
b00 <- data_total[source_year %between% c(2000, 2010), .(b00 = .N),
                  keyby = state]

# Per-state totals by sex over the whole 1970-2010 window. na.rm = TRUE keeps
# one missing is_male value from turning a state's whole count into NA.
male <- data_total[source_year %between% c(1970, 2010),
                   .(male = sum(is_male, na.rm = TRUE)),
                   keyby = state]
female <- data_total[source_year %between% c(1970, 2010),
                     .(female = sum(!is_male, na.rm = TRUE)),
                     keyby = state]

# Per-state mean birth weight, first in pounds and then converted to
# kilograms. The conversion constant is defined here because no
# pounds-to-kilograms factor is defined anywhere else in the script;
# 1 lb = 0.45359237 kg.
kgs_per_pound <- 0.45359237
avg_weight <- data_total[
  source_year %between% c(1970, 2010),
  .(avg_weight_pounds = mean(weight_pounds, na.rm = TRUE)),
  keyby = state
]
# avg_weight already has one row per state, so no `by` is needed here.
avg_weight[, avg_weight_kgs := avg_weight_pounds * kgs_per_pound]
avg_weight_kgs <- avg_weight[, c("state", "avg_weight_kgs"), with = FALSE]
但是我卡在了把种族 ID 替换为种族文字这一步。
我尝试过:
# Attempt 1: match() is called as match(x, table), so this looks up the 12
# race names inside the child_race id column -- the arguments are the wrong
# way round and names are being compared against ids.
data_total$test <- match(race$race,data_total$child_race)
# Attempt 2: the same reversed lookup, now used to index race$race.
data_total$test <- race$race[match(race$race,data_total$child_race)]
# Attempt 3: .N is a single number per group, so length(.N) is always 1 and
# r70 does not hold the birth count of each (state, child_race) pair.
r70 = data_total[source_year %between% c(1970,1979), .(r70 = length(.N)), keyby = .(state,child_race)]
# Looks up the r70 column (not child_race) in race$id_race.
r70$r70 <- race$race[match(r70$r70,race$id_race)]
# Joins r70$r70 against race$race instead of child_race against id_race.
r70[race, r70 := race, on = c(r70 = "race")]
我希望输出为:
state r70
AK "japanese"
AL "white"
AR "black"
. .
. .
. .
答案 0 :(得分:1)
我下载了您的两个数据文件,并执行了以下操作:
# Read each CSV whose name starts with "natalidad" (only the five columns
# needed) into a list, then stack the pieces row-wise into `data`.
# list.files() treats `pattern` as a regular expression rather than a glob,
# so the prefix is anchored with "^" instead of using a trailing "*".
l <- lapply(
  list.files(path = ".", pattern = "^natalidad"),
  data.table::fread,
  header = TRUE,
  verbose = FALSE,
  sep = ",",
  select = c(
    "source_year", "state", "is_male", "child_race", "weight_pounds"
  )
)
data <- do.call("rbind", l)
(由于某种原因,rbindlist 对我不起作用……)
我必须去掉 child_race 值为 NA 的行:
# Keep only the rows whose child_race is recorded (drops the NA entries).
has_race <- !is.na(data[["child_race"]])
data <- data[has_race, ]
然后您的数据如下所示:
head(data)
source_year state is_male child_race weight_pounds
1: 1998 AK FALSE 10 4.874421
2: 1969 CA TRUE 5 7.438397
3: 1971 CA TRUE 4 10.562347
4: 1974 CA FALSE 5 4.437905
5: 1981 CA TRUE 4 6.790238
6: 1981 CA TRUE 5 8.968405
然后我将您的 race 列表转换成了数据框:
# Convert the race lookup into a data frame, keeping the labels as character
# strings. FALSE is spelled out because `F` is an ordinary variable that can
# be reassigned.
race <- data.frame(race, stringsAsFactors = FALSE)
race
id_race race
1 1 white
2 2 black
3 3 american indian
4 4 chinese
5 5 japanese
6 6 hawaiian
7 7 filipino
8 9 unknown/other
9 18 asian indian
10 28 korean
11 39 samoan
12 48 vietnamese
然后,为了进行替换,我必须确保 data$child_race、race$id_race 和 race$race 的类型相互兼容:
# Bring the lookup key on both sides, and the replacement labels, to
# character so the id comparison and the substitution agree on type.
lookup_cols <- c("id_race", "race")
race[lookup_cols] <- lapply(race[lookup_cols], as.character)
data$child_race <- as.character(data$child_race)
然后我只使用了一个for循环:
# Replace each child_race id with its text label using one vectorised lookup
# instead of one full-column scan per race id.
# match() gives, for every row, the position of its id in race$id_race, or NA
# when the id has no entry in the lookup. Rows without a match, and rows
# whose child_race is NA, are left exactly as they were.
race_pos <- match(data$child_race, race$id_race)
has_label <- !is.na(race_pos)
data$child_race[has_label] <- race$race[race_pos[has_label]]
head(data)
source_year state is_male child_race weight_pounds
1: 1998 AK FALSE 10 4.874421
2: 1969 CA TRUE japanese 7.438397
3: 1971 CA TRUE chinese 10.562347
4: 1974 CA FALSE japanese 4.437905
5: 1981 CA TRUE chinese 6.790238
6: 1981 CA TRUE japanese 8.968405
这是您想要的吗?