我有一个数据框,其中的字符型变量包含很长的段落,我需要在由某些特定短语确定的位置把这些段落拆分开。问题是,在很多情况下,这些短语紧跟在前一个单词之后,中间没有任何分隔。
这是我正在做的事情:
# No connection is given, so readLines() reads from its default, stdin();
# the two sample records on the following two lines are that input.
data <- readLines(n=2)
= DAY 1 CHALLENGES = syndicated.= DAY 2 CHALLENGES = Red Sea.= DAY 3 CHALLENGES = framework.= DAY 4 CHALLENGES = Did ;-)= DAY 5 CHALLENGES = Paste ...= DAY 6 CHALLENGES = Name
= DAY 1 CHALLENGES = very high.= DAY 2 CHALLENGES = Rank understand.= DAY 3 CHALLENGES = buy....= DAY 4 CHALLENGES = result.= DAY 5 CHALLENGES = coffee.= DAY 6 CHALLENGES = Bla.
# Wrap the character vector in a data frame; the column is accessed below as
# df$data.
df <- as.data.frame(data)
# One delimiter string per expected "= DAY n CHALLENGES = " header.
delim <- c("= DAY 1 CHALLENGES = ",
"= DAY 2 CHALLENGES = ",
"= DAY 3 CHALLENGES = ",
"= DAY 4 CHALLENGES = ",
"= DAY 5 CHALLENGES = ",
"= DAY 6 CHALLENGES = ")
# Split every row and row-bind the pieces into a data frame.
# NOTE(review): strsplit() recycles `split` along its input instead of trying
# every delimiter on every string, so row 1 is split only on delim[1] and
# row 2 only on delim[2] -- which is what the printed result below shows.
# fixed = FALSE means each delimiter is interpreted as a regular expression.
y <- data.frame(do.call('rbind',
strsplit(as.character(df$data), delim, fixed = FALSE)))
# Print the (unsatisfactory) result.
y
X1
1
2 = DAY 1 CHALLENGES = very high.
X2
1 syndicated.= DAY 2 CHALLENGES = Red Sea.= DAY 3 CHALLENGES = framework.= DAY 4 CHALLENGES = Did ;-)= DAY 5 CHALLENGES = Paste ...= DAY 6 CHALLENGES = Name
2 Rank understand.= DAY 3 CHALLENGES = buy....= DAY 4 CHALLENGES = result.= DAY 5 CHALLENGES = coffee.= DAY 6 CHALLENGES = Bla.
我想把每个 = DAY x CHALLENGES = 标记及其后面的文本(直到下一个这样的标记为止)拆分出来,作为一个单独的变量。
谢谢!
使用提议的方法进行更新:
> a <- scan(file ="~/Desktop/alm/a.txt", what="")
Read 1 item
> a
[1] "= DAY 1 CHALLENGES = very high.= DAY 2 CHALLENGES = Rank understand.= DAY 3 CHALLENGES = buy....= DAY 4 CHALLENGES = result. = DAY 5 CHALLENGES = Paste the link(s) that you think is Paid Media.http://lebron11.nikeinc.com/ DAY 5 CHALLENGE: Paste the link(s) that you think is Owned Media.http://www.nike.com/ ; https://www.pinterest.com/nikewomen DAY 5 CHALLENGE: Paste the link(s) that you think is BONUS QUESTION DAY 5 = DAY 6 CHALLENGES = Bla."
> b <- scan(file ="~/Desktop/alm/b.txt", what="")
Read 1 item
> b
[1] "= DAY 1 CHALLENGES = very high.= DAY 2 CHALLENGES = Rank understand.= DAY 3 CHALLENGES = buy....= DAY 4 CHALLENGES = result. Paste the link(s) that you think is Paid Media.http://lebron11.nikeinc.com/ DAY 5 CHALLENGE: Paste the link(s) that you think is Owned Media.http://www.nike.com/ ; https://www.pinterest.com/nikewomen DAY 5 CHALLENGE: Paste the link(s) that you think is BONUS QUESTION DAY 5 ?= DAY 6 CHALLENGES = Bla."
> c <- c(a,b)
> df <- as.data.frame(c)
> lst <- strsplit(gsub(" (?=\\= DAY)", ".", c, perl=TRUE),
+ '(?<=[.)])(?=\\=)', perl=TRUE)
> out <- do.call(cbind, lapply(lst, function(x) sub('^=.*= ', '', x)))
Warning message:
In (function (..., deparse.level = 1) :
number of rows of result is not a multiple of vector length (arg 2)
> out
[,1]
[1,] "very high."
[2,] "Rank understand."
[3,] "buy...."
[4,] "result.."
[5,] "Paste the link(s) that you think is Paid Media.http://lebron11.nikeinc.com/ DAY 5 CHALLENGE: Paste the link(s) that you think is Owned Media.http://www.nike.com/ ; https://www.pinterest.com/nikewomen DAY 5 CHALLENGE: Paste the link(s) that you think is BONUS QUESTION DAY 5."
[6,] "Bla."
[,2]
[1,] "very high."
[2,] "Rank understand."
[3,] "buy...."
[4,] "Bla." #this is not the value from the input file
[5,] "very high." #this is missing in the input file, yet a value is getting output
[6,] "Rank understand." #incorrect recognition of ?= DAY 6 CHALLENGES =; the same happens with := and != or similar
问题已在上面输出的注释中标出。希望缺失的值能被明确标示出来(例如 NA),而不是被填入不相关的值。
答案 0 :(得分:0)
下面的方法可能有帮助:
library(stringr)
# Pattern for one segment: a "= WORD number WORD = " header, then letters and
# spaces, closed by one of: dots, " ;-)", a space followed by dots, or spaces.
segment_pattern <- "= [A-Za-z]+ \\d+ [A-Za-z]+ = [A-Za-z ]+(\\.+| ;-\\)| \\.+| +)"
# For every element of df$data, return all substrings matching the pattern.
str_extract_all(string = df$data, pattern = segment_pattern)
#[[1]]
#[1] "= DAY 1 CHALLENGES = syndicated." "= DAY 2 CHALLENGES = Red Sea."
#[3] "= DAY 3 CHALLENGES = framework." "= DAY 4 CHALLENGES = Did ;-)"
#[5] "= DAY 5 CHALLENGES = Paste ..." "= DAY 6 CHALLENGES = Name "
#[[2]]
#[1] "= DAY 1 CHALLENGES = very high."
#[2] "= DAY 2 CHALLENGES = Rank understand."
#[3] "= DAY 3 CHALLENGES = buy...."
#[4] "= DAY 4 CHALLENGES = result."
#[5] "= DAY 5 CHALLENGES = coffee."
#[6] "= DAY 6 CHALLENGES = Bla."
或使用strsplit
# Zero-width split point: directly after a "." or ")" and directly before "=".
segment_boundary <- "(?<=[.)])(?=\\=)"
# perl = TRUE enables the look-behind / look-ahead syntax used above.
lst <- strsplit(as.character(df$data), split = segment_boundary, perl = TRUE)
print(lst)
#[[1]]
#[1] "= DAY 1 CHALLENGES = syndicated." "= DAY 2 CHALLENGES = Red Sea."
#[3] "= DAY 3 CHALLENGES = framework." "= DAY 4 CHALLENGES = Did ;-)"
#[5] "= DAY 5 CHALLENGES = Paste ..." "= DAY 6 CHALLENGES = Name "
#[[2]]
#[1] "= DAY 1 CHALLENGES = very high."
#[2] "= DAY 2 CHALLENGES = Rank understand."
#[3] "= DAY 3 CHALLENGES = buy...."
#[4] "= DAY 4 CHALLENGES = result."
#[5] "= DAY 5 CHALLENGES = coffee."
#[6] "= DAY 6 CHALLENGES = Bla."
如果要提取字符串 syndicated.、very high. 等等:
# Drop the leading "= DAY n CHALLENGES = " header from every segment and put
# each input record in its own column.
# The header pattern stops at the second "=" ("[^=]*" instead of a greedy
# ".*"), so answer text that itself contains "= " is no longer swallowed.
answers <- lapply(lst, function(x) sub('^=[^=]*= ', '', x))
# cbind() recycles shorter columns (emitting only a warning), which makes a
# record with fewer segments repeat its own first values. Indexing past the
# end of a vector yields NA, so pad every column to the longest one instead.
# NOTE: the NA padding is appended at the end; it does not tell which segment
# is the missing one.
n_rows <- max(lengths(answers), 0L)
do.call(cbind, lapply(answers, function(x) x[seq_len(n_rows)]))
# [,1] [,2]
#[1,] "syndicated." "very high."
#[2,] "Red Sea." "Rank understand."
#[3,] "framework." "buy...."
#[4,] "Did ;-)" "result."
#[5,] "Paste ..." "coffee."
#[6,] "Name " "Bla."
基于更新后的字符串“a”
# Turn the space in front of a "= DAY" header into "." so that every header is
# preceded by "." or ")", then split at the zero-width boundary between a
# "." / ")" and the "=" that follows it.
marked <- gsub(" (?=\\= DAY)", ".", a, perl = TRUE)
lst <- strsplit(marked, '(?<=[.)])(?=\\=)', perl = TRUE)
# Strip the "= DAY n CHALLENGES = " header. The match stops at the second "="
# ("[^=]*" instead of a greedy ".*") so answer text containing "= " survives.
answers <- lapply(lst, function(x) sub('^=[^=]*= ', '', x))
# Pad every column with NA up to the longest one so cbind() never recycles
# values when `a` holds several records with different numbers of segments.
n_rows <- max(lengths(answers), 0L)
out <- do.call(cbind, lapply(answers, function(x) x[seq_len(n_rows)]))
out[,1]
#[1] "very high."
#[2] "Rank understand."
#[3] "buy...."
#[4] "result.."
#[5] "Paste the link(s) that you think is Paid Media.http://lebron11.nikeinc.com/ DAY 5 CHALLENGE: Paste the link(s) that you think is Owned Media.http://www.nike.com/ ; https://www.pinterest.com/nikewomen DAY 5 CHALLENGE: Paste the link(s) that you think is BONUS QUESTION DAY 5."
#[6] "Bla."
我又用 c 试了一次(将对象名称改为 c1,因为 c 是 R 中的函数):
# `c` is a base R function, so the combined input is stored as `c1`.
c1 <- c(a, b)
# Mark segment boundaries: a " " or "?" directly before "= DAY", and ".com"
# plus one character plus a space directly before "DAY", all become " .".
c2 <- gsub("( |\\?)(?=\\= DAY)|\\.com. (?=DAY)", " .", c1, perl = TRUE)
# Split after "." or ")" wherever "=" or "DAY" follows.
lst <- strsplit(c2, '(?<=[.)])(?=(\\=|DAY))', perl = TRUE)
# Group the pieces of each record by the first day number they mention and
# paste the pieces belonging to one day together. The day key is kept as the
# element name so that records can be aligned by day below.
by_day <- lapply(lst, function(x) {
  day_key <- gsub('.*?DAY (\\d+).*', '\\1', x)
  grouped <- tapply(x, day_key, FUN = paste, collapse = ' ')
  keys <- names(grouped)
  grouped <- as.vector(grouped)
  names(grouped) <- keys
  grouped
})
# Same content as before (unnamed pieces per record), kept for compatibility.
lst2 <- lapply(by_day, unname)
# Every day key seen in any record. Purely numeric keys are sorted numerically
# (so "10" comes after "2"); any non-numeric key is placed last.
all_days <- as.character(unique(unlist(lapply(by_day, names))))
is_num <- grepl('^[0-9]+$', all_days)
day_num <- rep(NA_real_, length(all_days))
day_num[is_num] <- as.numeric(all_days[is_num])
all_days <- all_days[order(day_num, all_days)]
# One column per record, one row per day key (row i belongs to all_days[i]).
# A day that a record does not contain stays NA instead of being filled with
# recycled values, which is what a plain do.call(cbind, ...) on columns of
# different length would do.
out <- do.call(cbind, lapply(by_day, function(x) {
  # Give pieces that start with a bare "DAY" the same "= " prefix as headers,
  # then strip the "= ... =" / "= ... :" header itself.
  with_prefix <- sub('^(?=DAY)', '= ', x, perl = TRUE)
  cleaned <- sub('^=[^=:]+(\\=|:) ', '', with_prefix)
  unname(cleaned[match(all_days, names(x))])
}))
out[,1]
#[1] "very high."
#[2] "Rank understand."
#[3] "buy...."
#[4] "result. ."
#[5] "Paste the link(s) that you think is Paid Media.http://lebron11.nikeinc . DAY 5 CHALLENGE: Paste the link(s) that you think is Owned Media.http://www.nike.com/ ; https://www.pinterest.com/nikewomen DAY 5 CHALLENGE: Paste the link(s) that you think is BONUS QUESTION DAY 5 ."
#[6] "Bla."
out[,2]
#[1] "very high."
#[2] "Rank understand."
#[3] "buy...."
#[4] "result. Paste the link(s) that you think is Paid Media.http://lebron11.nikeinc ."
#[5] "Paste the link(s) that you think is Owned Media.http://www.nike.com/ ; https://www.pinterest.com/nikewomen DAY 5 CHALLENGE: Paste the link(s) that you think is BONUS QUESTION DAY 5 ."
#[6] "Bla."