我试图在 df 中基于分隔符(“_”)拆分第一列。
我需要:第一个分隔符之前的第一个字符串、第一个分隔符之后的第二个字符串,以及最后一个分隔符之前的最后一个字符串。
根据上述规则:
在 row_1 中:所有字符串均按分隔符拆分,分隔符数量为 5
在 row_2 中:第 2、3、4 个分隔符缺失,因此对应的值为空,分隔符数量为 2
在 row_3 中:第 3 个分隔符缺失,因此对应的值为空,分隔符数量为 4
在 row_4 中:第 2、3 个分隔符缺失,因此对应的值为空,分隔符数量为 3
我尝试使用以下代码。
# Sample input: one "_"-delimited record per element. Some records omit
# fields, so the number of delimiters differs from row to row.
country_city_gender_age_name_state <- c(
  "US_Dallas_Male_23_hanes_TX",
  "US_LosAngeles_CA",
  "US_Atlanta_Female_jenny_GA",
  "US_Orlando_kane_FL"
)

# Single-column data frame; the column is named after the vector above.
df <- data.frame(country_city_gender_age_name_state)
所需的输出数据框将是
# Positional split: token 1 -> country, 2 -> city, 3 -> gender, 4 -> age,
# 5 -> name, and the last token -> state.
# NOTE: extraction is purely by position, so a record that omits a field
# shifts the remaining tokens left (e.g. "US_LosAngeles_CA" yields
# gender == "CA"); positions past the end of a record yield NA.

# Split every record once instead of once per output column.
parts <- strsplit(
  as.character(df$country_city_gender_age_name_state),
  "_",
  fixed = TRUE
)

# vapply() guarantees a character vector; `[` returns NA for a missing position.
fields <- c("country", "city", "gender", "age", "name")
for (i in seq_along(fields)) {
  df[[fields[i]]] <- vapply(parts, `[`, character(1), i)
}

# The trailing `[1L]` turns an empty record into NA instead of character(0),
# so the result always has exactly one value per row.
df$state <- vapply(parts, function(p) tail(p, 1L)[1L], character(1))
提前致谢
答案 0 :(得分:0)
你可以尝试下面的做法,不过它可能很难推广到一般情况。它适用于以下情形:国家、城市和州永远不会缺失,性别总是 "Female" 或 "Male",并且字符串中唯一的数字就是所需的“年龄”变量。其中有些代码行当然可以根据需要进行调整,也许还有一些可以写得更短。无论如何,这对你有帮助吗?
# Token-based split of "country_city[_gender][_age][_name]_state" records.
# Assumptions (same as the accompanying answer text): country, city and state
# are always present; gender is exactly "Female" or "Male"; the only
# all-digit token is the age.
# Columns are created in the order country, city, state, gender, age, name.
df <- data.frame(country_city_gender_age_name_state, stringsAsFactors = FALSE)

# Split each record once and reuse the tokens for every derived column.
tokens <- strsplit(df$country_city_gender_age_name_state, "_", fixed = TRUE)

# First element of `tk` flagged by `keep`, or NA when nothing is flagged.
first_token_where <- function(tk, keep) {
  hit <- tk[keep]
  if (length(hit) > 0L) hit[1L] else NA_character_
}

df$country <- vapply(tokens, `[`, character(1), 1L)
df$city <- vapply(tokens, `[`, character(1), 2L)
df$state <- vapply(tokens, function(tk) tail(tk, 1L)[1L], character(1))

# Match whole tokens, not substrings, so e.g. a city called "Maleville"
# is not mistaken for a gender.
df$gender <- vapply(
  tokens,
  function(tk) first_token_where(tk, tk %in% c("Female", "Male")),
  character(1)
)

# Only an all-digit token counts as age; digits embedded in other tokens
# are no longer concatenated into the number.
df$age <- as.numeric(vapply(
  tokens,
  function(tk) first_token_where(tk, grepl("^[0-9]+$", tk)),
  character(1)
))

# Name candidate = second-to-last token (the one just before the state).
candidate <- vapply(
  tokens,
  function(tk) {
    n <- length(tk)
    if (n >= 2L) tk[n - 1L] else NA_character_
  },
  character(1)
)

# Discard the candidate when it is already used by another field of the
# same row (e.g. it is really the city because the name is missing).
# `known` is an n x 5 matrix, so comparing against the length-n `candidate`
# recycles column by column, i.e. row r is always compared to candidate[r].
known <- cbind(
  df$country, df$city, df$state, df$gender, as.character(df$age)
)
is_taken <- rowSums(known == candidate, na.rm = TRUE) > 0
candidate[is_taken] <- NA_character_
df$name <- candidate
> df[,c(1:3,5:7,4)]
country_city_gender_age_name_state country city gender age name state
1 US_Dallas_Male_23_hanes_TX US Dallas Male 23 hanes TX
2 US_LosAngeles_CA US LosAngeles <NA> NA <NA> CA
3 US_Atlanta_Female_jenny_GA US Atlanta Female NA jenny GA
4 US_Orlando_kane_FL US Orlando <NA> NA kane FL
答案 1 :(得分:0)
我无法提供一个漂亮的解决方案,但这个方法应该可行。您总能找到州、国家和城市的列表,因此只需编写 ifelse 条件,判断某个值是否在其中一个列表中。对于州名,您可以使用匹配两个字母的正则表达式,也可以使用列表。既不在任何列表中、也不被正则表达式匹配的值,就一定是姓名。
library(dplyr)
# Classify every "_"-separated token of each record into one of the fields
# country / city / gender / age / name / state, using lookup lists.
#
# Expected in scope (not defined here):
#   df                 data frame with a column
#                      `country_city_gender_age_name_state`
#   list_of_countries  character vector of known country codes/names
#   list_of_cities     character vector of known city names
#   list_of_states     character vector of known state codes/names
#
# Produces:
#   dt      copy of `df` plus `number_of_entry` (row id)
#   new_dt  long format: one row per token, tagged with `number_of_entry`
#   output  one row per record with columns
#           id, country, city, gender, age, name, state
stopifnot(
  is.data.frame(df),
  "country_city_gender_age_name_state" %in% names(df),
  exists("list_of_countries"),
  exists("list_of_cities"),
  exists("list_of_states")
)

dt <- df
dt$number_of_entry <- seq_len(nrow(dt))

# Split each record into its tokens and unnest into long format.
tokens_by_row <- strsplit(
  as.character(dt$country_city_gender_age_name_state),
  "_",
  fixed = TRUE
)
new_dt <- data.frame(
  country_city_gender_age_name_state = as.character(unlist(tokens_by_row)),
  number_of_entry = rep(dt$number_of_entry, lengths(tokens_by_row)),
  stringsAsFactors = FALSE
)

# Decide the field of every token. Anything not recognised is a name.
# Assignments run from lowest to highest priority so that a later line
# overrides an earlier one; the effective priority is
# country > gender > city > age > state > name.
token <- new_dt$country_city_gender_age_name_state
field <- rep("name", length(token))
field[token %in% list_of_states] <- "state"
field[grepl("^[0-9]+$", token)] <- "age" # whole token must be digits
field[token %in% list_of_cities] <- "city"
field[token %in% c("Female", "Male")] <- "gender"
field[token %in% list_of_countries] <- "country"

# Fill a (record x field) matrix via (row, column) index pairs; cells that
# receive no token stay NA. If two tokens of one record map to the same
# field, the later token wins.
fields <- c("country", "city", "gender", "age", "name", "state")
filled <- matrix(
  NA_character_,
  nrow = nrow(dt),
  ncol = length(fields),
  dimnames = list(NULL, fields)
)
filled[cbind(new_dt$number_of_entry, match(field, fields))] <- token

output <- data.frame(
  id = seq_len(nrow(dt)),
  filled,
  stringsAsFactors = FALSE
)
output$age <- as.numeric(output$age)