我有一个表,其中有一个字符字段,其取值可能符合以下任一模式:
input
97 # a single number
210 foo # a number and a word
87 bar 89 # a number, a word, a number
21 23 # two numbers
123 2 fizzbuzz # two number, a word
12 fizz 34 buzz # a number, a word, a number, a word
我希望将每行最多拆分为4个部分,分别包含 the first number、the first word(如果存在)、the second number(如果存在)和 the second word(如果存在)。因此我的例子应该得到:
input nb_1 word_1 nb_2 word_2
97 97
210 foo 210 foo
87 bar 89 87 bar 89
21 23 21 23
123 2 fizzbuzz 123 2 fizzbuzz
12 fizz 34 buzz 12 fizz 34 buzz
请注意 two number, a word(倒数第二个例子)的情况:word_1 中没有任何内容,因为这两个数字之间没有单词。
有没有办法不使用繁琐的 if / else if / else 结构就做到这一点?
补充一些可能有帮助的信息:所有单词都来自一个由10个特定单词组成的列表。此外,如果有两个单词,它们可以相同也可以不同。另外,数字可以是一位、两位或三位数。
谢谢
答案 0 :(得分:1)
以下是使用 gsub 和 splitstackshape 包中的 cSplit 的一个思路:
library(splitstackshape)
# Keep only the numbers: every non-digit character is turned into a space.
df$num <- gsub(pattern = "\\D", replacement = " ", x = df$V1)
# Keep only the words: every digit is turned into a space.
df$wrds <- gsub(pattern = "\\d", replacement = " ", x = df$V1)
# Split the two helper columns (positions 2 and 3) on spaces, one output
# column per piece, in wide layout.
newdf <- cSplit(indt = df, splitCols = 2:3, sep = " ", direction = "wide")
newdf
# V1 num_1 num_2 wrds_1 wrds_2
#1: 97 97 NA NA NA
#2: 210 foo 210 NA foo NA
#3: 87 bar 89 87 89 bar NA
#4: 21 23 21 23 NA NA
#5: 123 2 fizzbuzz 123 2 fizzbuzz NA
#6: 12 fizz 34 buzz 12 34 fizz buzz
唯一的问题是第5行,可以修复如下,
# Rows shaped "number number word": the single word sits after the second
# number, so it belongs in wrds_2 rather than wrds_1.
two_numbers_first <- grep('[0-9]+\\s+[0-9]+\\s+[A-Za-z]', newdf$V1)
# Work on plain character columns so values can be moved between them.
newdf$wrds_1 <- as.character(newdf$wrds_1)
newdf$wrds_2 <- as.character(newdf$wrds_2)
# Move the word to the second slot, then blank the first slot.
newdf$wrds_2[two_numbers_first] <- newdf$wrds_1[two_numbers_first]
newdf$wrds_1[two_numbers_first] <- NA
最终给出了,
newdf
# V1 num_1 num_2 wrds_1 wrds_2
#1: 97 97 NA NA NA
#2: 210 foo 210 NA foo NA
#3: 87 bar 89 87 89 bar NA
#4: 21 23 21 23 NA NA
#5: 123 2 fizzbuzz 123 2 NA fizzbuzz
#6: 12 fizz 34 buzz 12 34 fizz buzz
数据
# Sample data used by the snippets above, shown as the dput() text
# representation of `df`: a data frame with a single character column V1 and
# six rows. Note that every entry except the first starts with a space.
dput(df)
structure(list(V1 = c("97", " 210 foo", " 87 bar 89",
" 21 23", " 123 2 fizzbuzz",
" 12 fizz 34 buzz")), .Names = "V1", row.names = c(NA,
-6L), class = "data.frame")
答案 1 :(得分:1)
Tried in a different way...
library(splitstackshape)
# Sample input: one free-text column whose rows mix numbers and words.
abc <- data.frame(a = c("97", "210 foo", "87 bar 89", "21 23",
                        "123 2 fizzbuzz", "12 fizz 34 buzz"))
# Split each row on single spaces into positional columns.
abc1 <- data.frame(cSplit(abc, "a", " ", stripWhite = FALSE))
abc <- cbind(abc, abc1)
names(abc) <- c("input", "nb_1", "word_1", "nb_2", "word_2")
# Convert column by column. apply() would first coerce the data frame to a
# character matrix, which runs format() on numeric columns and pads their
# values to a common width (" 97" instead of "97").
abc[] <- lapply(abc, as.character)

# The split is purely positional, so a token can land in the wrong slot:
# a word in nb_2 ("123 2 fizzbuzz") or a number in word_1 ("21 23").
# Both masks are taken before any column is modified.
nb2_holds_word <- grepl("[a-z]", abc$nb_2)
word1_holds_number <- grepl("[0-9]", abc$word_1)

# An empty word_2 receives the word that landed in nb_2, if there is one.
word2_empty <- is.na(abc$word_2)
abc$word_2[word2_empty] <- ifelse(nb2_holds_word, abc$nb_2, NA)[word2_empty]

# An nb_2 that is empty or holds a word receives the number from word_1,
# if there is one (otherwise it becomes NA).
nb2_wrong <- is.na(abc$nb_2) | nb2_holds_word
abc$nb_2[nb2_wrong] <- ifelse(word1_holds_number, abc$word_1, NA)[nb2_wrong]

# A number left in word_1 is not a word, so clear it.
abc$word_1[word1_holds_number] <- NA

# Show missing pieces as empty strings.
abc[is.na(abc)] <- ""
print(abc)
input nb_1 word_1 nb_2 word_2
1 97 97
2 210 foo 210 foo
3 87 bar 89 87 bar 89
4 21 23 21 23
5 123 2 fizzbuzz 123 2 fizzbuzz
6 12 fizz 34 buzz 12 fizz 34 buzz
答案 2 :(得分:1)
这是一个有点取巧(hacky)的函数……不过你可能会遇到其他让它失效的情况。
# Split one input string into c(nb_1, word_1, nb_2, word_2).
#
# x: a single string of whitespace-separated tokens, e.g. "87 bar 89".
#    Leading, trailing and repeated whitespace is ignored.
# Returns a character vector that always has length 4; slots with no
# matching token are NA. A token counts as a number when it contains a digit.
f <- function(x) {
  # Trim and split on runs of whitespace so that inputs such as " 210 foo"
  # or "87  bar" do not yield empty tokens that would be taken for words.
  tokens <- strsplit(trimws(x), "\\s+")[[1]]
  tokens <- tokens[nzchar(tokens)]
  if (length(tokens) < 2) {
    # Zero or one token: pad to exactly four slots (also for empty input).
    return(c(tokens, rep(NA_character_, 4))[1:4])
  }
  is_number <- grepl("\\d", tokens)
  numbers <- tokens[is_number]
  words <- tokens[!is_number]
  if (is_number[2]) {
    # Two numbers come first, so no word sits between them: word_1 stays
    # empty and the first word (if any) goes to word_2.
    c(numbers[1], NA, numbers[2], words[1])
  } else {
    # Indexing past the end of a vector gives NA, which fills absent slots.
    c(numbers[1], words[1], numbers[2], words[2])
  }
}
> f("97")
[1] "97" NA NA NA
> f("210 foo")
[1] "210" "foo" NA NA
> f("87 bar 89")
[1] "87" "bar" "89" NA
> f("21 23")
[1] "21" NA "23" NA
> f("123 2 fizzbuzz")
[1] "123" NA "2" "fizzbuzz"
> f("12 fizz 34 buzz")
[1] "12" "fizz" "34" "buzz"