我的df看起来像这样:
# Sample input: raw course-catalogue lines, one per element, interleaved with
# empty and whitespace-only filler entries.
. <- c(
  "AXX 101",
  "",
  "Introduction to AXX",
  " ",
  "Prereq: BXX102, BXX101, not open to CXX program",
  "Antireq: BXX103",
  "",
  "AXX 102",
  "AXX Part II",
  "",
  "Antireq: BXX101",
  "",
  " "
)
# The single column ends up named "." because data.frame() derives the column
# name from the variable that is passed in.
df <- data.frame(.)
df
.
1 AXX 101
2
3 Introduction to AXX
4
5 Prereq: BXX102, BXX101, not open to CXX program
6 Antireq: BXX103
7
8 AXX 102
9 AXX Part II
10
11 Antireq: BXX101
12
13
我想把这个糟糕透顶的数据框解析成类似下面这样的结果:
title prereq antireq
AXX101 BXX102, BXX101, not open to CXX program BXX103
AXX102 BXX101
答案 0 :(得分:2)
在每个 `AXX <number>` 行前加上 `\nTitle:`,只保留带冒号的行,然后用 `read.dcf` 读取结果。如果可以接受列名的首字母大写,则可以省略标记为 `##` 的那一行。不需要任何额外的包:
# Parse the raw lines as DCF ("Field: value") records: each course title is
# turned into a "Title:" field that opens a new record, while the existing
# "Prereq:" / "Antireq:" lines are already in "Field: value" form.
s <- as.character(df[[1]])
# Anchor at the start of the line so that a course code mentioned inside a
# "Prereq:" / "Antireq:" value is not mistaken for a title line.
ix <- grep("^AXX \\d", s)
# The leading "\n" puts an empty line in front of each title; DCF uses empty
# lines to separate records.
s[ix] <- paste("\nTitle:", s[ix])
# Keep only the lines containing a colon; description and blank rows are
# dropped.
s <- grep(":", s, value = TRUE)
# read.dcf() does not close a connection it is handed, so close it explicitly
# instead of leaking it.
con <- textConnection(s)
out <- read.dcf(con)
close(con)
colnames(out) <- tolower(colnames(out)) ##
结果如下:
> out
title prereq antireq
[1,] "AXX 101" "BXX102, BXX101, not open to CXX program" "BXX103"
[2,] "AXX 102" NA "BXX101"
更新:一些简化。
答案 1 :(得分:0)
一个选项是(此处我将列名 `.` 替换为 `V1`):
# Build one row per course. Rows are grouped by a running count of title
# lines, so each title line opens a new group. The title pattern is anchored
# so that a course code mentioned inside a Prereq/Antireq value does not start
# a spurious group.
res <- do.call(rbind,
  lapply(split(df, cumsum(grepl("^AXX \\d+", df$V1))), function(x) {
    # Keep only the title line and the Prereq/Antireq lines of this group.
    x1 <- as.character(x$V1[grep("^(AXX \\d+|Prereq|Antireq)", x$V1)])
    # Label of each kept line: the text before the first space, without a
    # trailing colon ("AXX", "Prereq" or "Antireq").
    x2 <- sub(":? .*", "", x1)
    # Pick title, prereq, antireq in that order (NA where a label is absent),
    # then strip only the leading "Label: " prefix so that a value which
    # itself contains ": " is left intact.
    x3 <- sub("^[^:]*: ", "", x1[match(c("AXX", "Prereq", "Antireq"), x2)])
    data.frame(title = x3[1], prereq = x3[2], antireq = x3[3])
  }))
res
# title prereq antireq
#1 AXX 101 BXX102, BXX101, not open to CXX program BXX103
#2 AXX 102 <NA> BXX101
答案 2 :(得分:0)
也许您可以尝试以下内容:
library(splitstackshape)
library(dplyr)
library(zoo)
library(tidyr)
# Split each line at ":" (the code below reads the pieces as V1_1 = label and
# V1_2 = value), carry the most recent course title down onto the rows that
# follow it, keep only the Prereq/Antireq rows, and spread them into one
# column per label.
cSplit(df, "V1", ":") %>%
  # Title rows are recognised as those whose label ends in a digit.
  # NA_character_ keeps V2 a character column even if no row matches.
  .[, V2 := ifelse(grepl("[0-9]$", V1_1),
                   as.character(V1_1), NA_character_)] %>%
  # na.rm = FALSE keeps the result the same length as the table; the default
  # drops leading NAs, which breaks the assignment when the data does not
  # start with a title row.
  .[, V2 := na.locf(V2, na.rm = FALSE)] %>%
  .[V1_1 %in% c("Prereq", "Antireq")] %>%
  spread(V1_1, V1_2)
# V2 Antireq Prereq
# 1: AXX 101 BXX103 BXX102, BXX101, not open to CXX program
# 2: AXX 102 BXX101 NA