Question

我的df看起来像这样：

# Sample input: a flat character vector in which each course record is spread
# over several elements (course code, description, optional "Prereq:" and
# "Antireq:" lines), interleaved with empty or whitespace-only strings.
. <- c("AXX 101", "", "Introduction to AXX", " ", "Prereq: BXX102, BXX101, not open to CXX program",
       "Antireq: BXX103", "", "AXX 102","AXX Part II", "", "Antireq: BXX101", "", " ")
# The single column takes its name from the variable passed in, so it is
# called "." (see the printed header below).
df <- data.frame(.)
df
                                                 .
1                                          AXX 101
2                                                 
3                              Introduction to AXX
4                                                 
5  Prereq: BXX102, BXX101, not open to CXX program
6                                  Antireq: BXX103
7                                                 
8                                          AXX 102
9                                      AXX Part II
10                                                
11                                 Antireq: BXX101
12                                                
13

我想把这个乱七八糟的数据框解析成类似下面的形式：

title    prereq                                   antireq 
AXX101   BXX102, BXX101, not open to CXX program  BXX103
AXX102                                            BXX101

Answer 1

在每个 AXX <number> 行前加上 \nTitle:，只保留带冒号的行，然后用 read.dcf 读取结果。如果可以接受列名首字母大写，则可以省略标记为 ## 的行。不需要任何额外的包：

# Work on the single column as a plain character vector.
s <- as.character(df[[1]])

# Turn every course-code line into a "Title:" field preceded by a blank line,
# which separates records in DCF. The pattern is anchored so that lines that
# merely mention a course (e.g. "Prereq: AXX 101") are not mistaken for the
# start of a new record.
ix <- grep("^AXX \\d", s)
s[ix] <- paste("\nTitle:", s[ix])
# Keep only "Field: value" lines; descriptions and blank lines are dropped.
s <- grep(":", s, value = TRUE)

# A text connection stays open until it is closed explicitly, so make sure it
# is released even if parsing fails.
con <- textConnection(s)
out <- tryCatch(read.dcf(con), finally = close(con))
colnames(out) <- tolower(colnames(out)) ##

结果为：

> out
     title     prereq                                    antireq 
[1,] "AXX 101" "BXX102, BXX101, not open to CXX program" "BXX103"
[2,] "AXX 102" NA                                        "BXX101"

更新：一些简化。

Answer 2

一个选项是（此处我将列名 . 替换为了 V1）：

# Build one output row per course record.
# Records are delimited by lines that *begin* with a course code; anchoring the
# pattern keeps lines such as "Prereq: AXX 101" from opening a spurious record.
res <- do.call(
  rbind,
  lapply(split(df, cumsum(grepl("^AXX \\d+", df$V1))), function(x) {
    # Keep only the title line and the "Prereq:" / "Antireq:" lines.
    x1 <- x$V1[grep("^(AXX \\d+|Prereq|Antireq)", x$V1)]
    # Leading keyword of each kept line: "AXX", "Prereq" or "Antireq".
    x2 <- sub(":? .*", "", x1)
    # Order the lines as title / prereq / antireq (NA when a field is absent)
    # and strip only the leading "Key: " prefix, so values that themselves
    # contain ": " are not truncated.
    x3 <- sub("^[^:]*: ", "", x1[match(c("AXX", "Prereq", "Antireq"), x2)],
              perl = TRUE)
    data.frame(title = x3[1], prereq = x3[2], antireq = x3[3])
  })
)
res
#    title                                  prereq antireq
#1 AXX 101 BXX102, BXX101, not open to CXX program  BXX103
#2 AXX 102                                    <NA>  BXX101

Answer 3

也许您可以尝试以下内容：

library(splitstackshape)
library(dplyr)
library(zoo)
library(tidyr)

# Reshape the one-column listing into one row per course.
# NOTE(review): assumes df's single column has been renamed to V1 -- confirm.
# Split each row at ":"; the steps below use the resulting V1_1 (text before
# the colon) and V1_2 (text after it).
cSplit(df, "V1", ":") %>%
  # Rows whose first piece ends in a digit are treated as course titles and
  # copied into V2; every other row gets NA.
  .[, V2 := ifelse(grepl("[0-9]$", V1_1), as.character(V1_1), NA)] %>%
  # Carry the most recent title forward so each row is tagged with its course.
  .[, V2 := na.locf(V2)] %>%
  # Keep only the prerequisite / antirequisite rows.
  .[V1_1 %in% c("Prereq", "Antireq")] %>%
  # Go from long to wide: one column per key in V1_1, filled from V1_2.
  spread(V1_1, V1_2)
#         V2 Antireq                                  Prereq
# 1: AXX 101  BXX103 BXX102, BXX101, not open to CXX program
# 2: AXX 102  BXX101                                      NA

解析混乱的数据框以重塑数据

3 个答案: