一点背景:一起案件可能涉及不止一支枪,参与者也可能不止一人。因此,这些列包含每支枪/每名参与者的信息,并以“||”分隔;前缀 0::、1::…… 表示该条信息对应的是第几支枪/第几名参与者。
我的目标是提取每一列中出现过的唯一取值,同时忽略 0::、1::、2::…… 这些序号前缀。
# Load the raw incident export into a data frame.
df <- read.csv("C:/Users/rmahesh/Desktop/gun-violence-data_01-2013_03-2018.csv")

# Discard the columns that are not used further on, in a single step.
# setdiff() keeps the remaining columns in their original order and silently
# ignores any listed name that is not present in the file.
df <- df[
  ,
  setdiff(
    names(df),
    c(
      "incident_id", "incident_url", "source_url", "participant_name",
      "participant_relationship", "sources", "incident_url_fields_missing",
      "participant_status", "participant_age_group", "participant_type",
      "incident_characteristics"
    )
  ),
  drop = FALSE
]

# Subset of columns with formatting issues:
df2 <- df[c("gun_stolen", "gun_type", "participant_age", "participant_gender")]
0::Unknown||1::Unknown, 0::Unknown||1::Unknown, 0::25||1::31||2::33||3::34||4::33, 0::Male||1::Male||2::Male||3::Male||4::Male
0::Unknown||1::Unknown,0::22 LR||1::223 Rem [AR-15],0::51||1::40||2::9||3::5||4::2||5::15,0::Male||1::Female||2::Male||3::Female||4::Female||5::Male
答案 0 :(得分:2)
# Minimal example data. NOTE(review): the opening of this data.frame() call was
# missing from the snippet; only the gun_type column could be recovered, so any
# other columns of the original example are not reproduced here.
df <- data.frame(
  gun_type = c("", "0::Unknown||1::Unknown", "",
               "0::Handgun||1::Handgun", ""),
  stringsAsFactors = FALSE
)

# Split each cell at the literal "||" separator once and reuse the result.
# An empty cell yields character(0).
splitType <- strsplit(df$gun_type, "\\|\\|")

# First entry of every cell; NA where the cell was empty. vapply() guarantees a
# character vector regardless of the input, unlike sapply().
df$first <- vapply(splitType, `[`, character(1), 1)

# One output row per gun entry (rows with an empty cell are dropped because
# they are repeated zero times). drop = FALSE keeps a data frame even when df
# has a single column.
df.2 <- df[rep(seq_len(nrow(df)), lengths(splitType)), , drop = FALSE]

# One output row per *distinct* gun type. The "N::" index prefix has to be
# removed first: every entry in a cell carries a different index, so unique()
# on the raw entries could never collapse anything. lapply() always returns a
# list, whereas sapply() would simplify to a matrix when all cells have the
# same number of entries and break the rep() call below.
splitTypeUnique <- lapply(
  splitType,
  function(entries) unique(sub("^[0-9]+::", "", entries))
)
df.2 <- df[rep(seq_len(nrow(df)), lengths(splitTypeUnique)), , drop = FALSE]
答案 1 :(得分:2)
# The pipeline uses dplyr (verbs, across), tidyr (pivot_longer, unnest) and
# stringr (str_split, str_split_fixed, fixed); tibble is called via its
# namespace.
library(dplyr)
library(tidyr)
library(stringr)

# Columns whose cells hold several "index::value" entries joined by "||".
myvars <- c("gun_stolen", "gun_type", "participant_age", "participant_gender")

res <- as_tibble(df2) %>%
  # Add a 'rowid' column so every value can be traced back to its source row
  # after the reshaping below.
  tibble::rowid_to_column() %>%
  # Split strings in the selected columns at "||". This turns those columns
  # into list-columns of character vectors.
  mutate(
    across(all_of(myvars), function(x) str_split(x, pattern = fixed("||")))
  ) %>%
  # Go from wide to long format: the new 'key' column holds the original
  # column names and 'value' is the single list-column of character vectors.
  pivot_longer(all_of(myvars), names_to = "key", values_to = "value") %>%
  # unnest turns the 'value' list-column into a regular character column,
  # duplicating rows whose 'value' has more than one element.
  unnest(value) %>%
  # Drop the entries produced by empty cells.
  filter(value != "") %>%
  # Remove the "x::" prefixes: keep everything after the first "::".
  mutate(value = str_split_fixed(value, fixed("::"), n = 2)[, 2]) %>%
  # Deduplicate within each (rowid, key) pair.
  distinct() %>%
  # Sorting on all three columns makes the final row order deterministic.
  arrange(rowid, key, value)
# # A tibble: 732,017 x 3
# rowid key value
# <int> <chr> <chr>
# 1 1 participant_age 20
# 2 1 participant_gender Female
# 3 1 participant_gender Male
# 4 2 participant_age 20
# 5 2 participant_gender Male
# 6 3 gun_stolen Unknown
# 7 3 gun_type Unknown
# 8 3 participant_age 25
# 9 3 participant_age 31
# 10 3 participant_age 33
# # ... with 732,007 more rows
还扩展了@Ben G的评论:
# Count how often each (key, value) pair occurs in res, then list the pairs
# key by key with the most frequent values first.
arrange(
  count(res, key, value),
  key,
  desc(n)
)
# # A tibble: 141 x 3
# key value n
# <chr> <chr> <int>
# 1 gun_stolen Unknown 132099
# 2 gun_stolen Stolen 7350
# 3 gun_stolen Not-stolen 1560
# 4 gun_stolen "" 355
# 5 gun_type Unknown 98892
# 6 gun_type Handgun 17609
# 7 gun_type 9mm 6040
# 8 gun_type Shotgun 3560
# 9 gun_type Rifle 3196
# 10 gun_type 22 LR 3093
# 11 gun_type 40 SW 2624
# 12 gun_type 380 Auto 2323
# 13 gun_type 45 Auto 2234
# 14 gun_type 38 Spl 1758
# 15 gun_type 223 Rem [AR-15] 1248
# 16 gun_type 12 gauge 975
# 17 gun_type Other 892
# 18 gun_type 7.62 [AK-47] 854
# 19 gun_type 357 Mag 800
# 20 gun_type 25 Auto 601
# 21 gun_type 32 Auto 481
# 22 gun_type "" 356
# 23 gun_type 20 gauge 194
# 24 gun_type 44 Mag 192
# 25 gun_type 30-30 Win 105
# 26 gun_type 410 gauge 96
# 27 gun_type 308 Win 88
# 28 gun_type 30-06 Spr 71
# 29 gun_type 10mm 50
# 30 gun_type 16 gauge 30
# 31 gun_type 300 Win 23
# 32 gun_type 28 gauge 6
# 33 participant_age 19 10541
# 34 participant_age 20 9919
# 35 participant_age 18 9826
# 36 participant_age 21 9795
# 37 participant_age 22 9642
# 38 participant_age 23 9383
# 39 participant_age 24 9204
# 40 participant_age 25 8562
# 41 participant_age 26 7815
# 42 participant_age 17 7416
# 43 participant_age 27 7228
# 44 participant_age 28 6528
# 45 participant_age 29 6055
# 46 participant_age 30 5652
# 47 participant_age 31 5145
# 48 participant_age 32 5039
# 49 participant_age 16 4977
# 50 participant_age 33 4662
# # ... with 91 more rows