我在一个文本文件中给出了数据,如下所示:
Measurement: mc
Loop:
var1=0, var2=-5, var3=1.8
values:
iteration data
0 1.203
1 1.206
2 2.206
3 1.201
4 1.204
5 1.204
6 1.204
statistics:
max 1.206
min 1.201
mean 1.204
stddev 0.001
avgdev 0.001
failedtimes 0
Measurement: mc
Loop:
var1=10, var2=-5, var3=1.8
values:
iteration data
0 1.203
1 1.206
2 2.206
3 1.201
statistics:
max 1.206
min 1.201
mean 1.204
stddev 0.001
avgdev 0.001
failedtimes 0
我希望以更正常的格式获取数据,例如:
var1, var2, var3, iteration, data,
0, -5, 1.8, 0, 1.203,
0, -5, 1.8, 1, 1.206,
...
10, -5, 1.8, 0, 1.203,
我在尝试解析这样的数据时遇到了问题,请帮忙。
答案 0 :(得分:6)
一种方法是先用 readLines 读入文本,再用少量简单的正则表达式找出相关的行。
您的数据:
# Sample input: two "Measurement" sections, each with one "var1=..., var2=...,
# var3=..." line, an "iteration data" table and a "statistics:" summary.
txt <-
"Measurement: mc
Loop:
var1=0, var2=-5, var3=1.8
values:
iteration data
0 1.203
1 1.206
2 2.206
3 1.201
4 1.204
5 1.204
6 1.204
statistics:
max 1.206
min 1.201
mean 1.204
stddev 0.001
avgdev 0.001
failedtimes 0
Measurement: mc
Loop:
var1=10, var2=-5, var3=1.8
values:
iteration data
0 1.203
1 1.206
2 2.206
3 1.201
statistics:
max 1.206
min 1.201
mean 1.204
stddev 0.001
avgdev 0.001"
# Read in: for a real file use r <- readLines("path/to/file") instead.
# The text connection is closed explicitly so it is not left open.
con <- textConnection(txt)
r <- readLines(con)
close(con)
# Find indices of relevant parts of string that you want to keep
id1 <- grep("var", r)        # "var1=..., var2=..., var3=..." lines
id2 <- grep("iteration", r)  # header line of each iteration table
id3 <- grep("statistics", r) # first line after each iteration table
# Every section must contribute exactly one of each marker line; otherwise the
# index vectors below would be paired up incorrectly.
stopifnot(length(id1) == length(id2), length(id2) == length(id3))
# Indices for iteration data (header line through the last data row).
# Map() always returns a list; mapply() would simplify to a matrix whenever
# all sections have the same number of rows, which breaks m[[x]] below.
m <- Map(seq, id2, id3 - 1)
# Use read.table to parse the relevant rows
lst <- lapply(seq_along(m), function(x) {
  cbind(
    read.table(text = r[id1[x]], sep = ","),     # var data (one row, recycled)
    read.table(text = r[m[[x]]], header = TRUE)  # iteration data
  )
})
dat <- do.call(rbind, lst)
# Remove the var= text and convert to numeric
dat[] <- lapply(dat, function(x) as.numeric(gsub("var\\d+=", "", x)))
dat
# V1 V2 V3 iteration data
# 1 0 -5 1.8 0 1.203
# 2 0 -5 1.8 1 1.206
# 3 0 -5 1.8 2 2.206
# 4 0 -5 1.8 3 1.201
# 5 0 -5 1.8 4 1.204
# 6 0 -5 1.8 5 1.204
# 7 0 -5 1.8 6 1.204
# 8 10 -5 1.8 0 1.203
# 9 10 -5 1.8 1 1.206
# 10 10 -5 1.8 2 2.206
# 11 10 -5 1.8 3 1.201
先把数据按测量段拆分为多个部分,再对每个部分应用一个函数,实际上可能会更清楚一些,即:
sp <- split(r, cumsum(grepl("measure", r, TRUE)))
# Parse a single measurement chunk.
#   x: character vector holding the lines of one chunk.
# Returns the chunk's comma-separated "var" line column-bound to its
# iteration table (the lines from the "iteration" header up to, but not
# including, the "statistics" line).
fun <- function(x) {
  var_line <- grep("var", x)
  first_row <- grep("iteration", x)
  last_row <- grep("statistics", x) - 1
  var_part <- read.table(text = x[var_line], sep = ",")
  iter_part <- read.table(text = x[seq(first_row, last_row)], header = TRUE)
  cbind(var_part, iter_part)
}
# Apply the parser to every chunk in sp; lst holds one parsed data frame per chunk.
lst <- lapply(sp, fun)
然后像以前一样继续:用 do.call(rbind, lst) 合并各部分,再去掉 var= 文本并转换为数值。
答案 1 :(得分:3)
下面是一条读取并处理这些数据的管道。假设数据已按本答案末尾的注释存放在字符向量 L 中;对于实际文件,可以用 L <- readLines("myfile.dat") 之类的语句来创建它。
先用 trimws 去掉每行开头和结尾的空白——这一步可能并非必需,但如果数据行首带有空白,加上它也没有坏处。然后用 grep 取出以数字开头或包含 var 的行,把字符 v、a、r、= 替换为空格,并把逗号替换为换行符。这样 read.table 就能把它读成一个两列的数据框:第一列是 1、2、3,后面跟着各次迭代的编号;第二列是 var1、var2、var3 的取值,后面跟着 data 的取值;每个组都重复这一结构。接着用表达式 cumsum(...) %/% 2 识别连续递增的序列,从而构造分组变量。这里假设每组至少有 2 次迭代(0 和 1)。(从给出的数据看确实如此;如果不是,可以用后面给出的变体代码来处理。)最后按分组变量拆分,并把每个分组整理成所需的数据框。
library(purrr)

# Turn the raw lines in L into one row per (var1, var2, var3, iteration).
L %>%
  trimws %>%
  # Keep only the "var1=..., var2=..., var3=..." lines and the iteration rows.
  grep(pattern = "^\\d|var", value = TRUE) %>%
  # chartr() translates character-by-character, so `new` must supply one
  # replacement for each of the five characters in `old`: v, a, r and = become
  # spaces and the comma becomes a newline. "var1=0, var2=-5" thus turns into
  # the two records "1 0" and "2 -5".
  chartr(old = "var=,", new = "    \n") %>%
  # Two columns: V1 is 1, 2, 3 followed by the iteration numbers; V2 holds the
  # matching var values followed by the data values.
  read.table(text = .) %>%
  # V1 stops increasing by 1 twice per group (3 -> 0 and last iteration -> 1),
  # so halving the running count of breaks yields a group id. This relies on
  # every group having at least two iterations (0 and 1).
  split(cumsum(c(FALSE, diff(.$V1) != 1)) %/% 2) %>%
  # Rows 1-3 of each group carry var1..var3; the remaining rows are the
  # iteration/data pairs.
  map_df(function(x) {
    data.frame(
      var1 = x[1, 2], var2 = x[2, 2], var3 = x[3, 2],
      iteration = x[-(1:3), 1], data = x[-(1:3), 2]
    )
  })
结果为:
var1 var2 var3 iteration data
1 0 -5 1.8 0 1.203
2 0 -5 1.8 1 1.206
3 0 -5 1.8 2 2.206
4 0 -5 1.8 3 1.201
5 0 -5 1.8 4 1.204
6 0 -5 1.8 5 1.204
7 0 -5 1.8 6 1.204
8 10 -5 1.8 0 1.203
9 10 -5 1.8 1 1.206
10 10 -5 1.8 2 2.206
11 10 -5 1.8 3 1.201
变体
这种代码变体还能处理只有一次迭代(即仅有迭代 0)的情况,并以多写几行代码为代价简化了分组计算。代码中出现两次的 -9999 可以换成数据中不会出现的任何数字。
# Variant that marks the start of each group with a sentinel value instead of
# inferring groups from runs, so a group with a single iteration also works.
# -9999 is the sentinel; it must not occur anywhere in the data itself.
L %>%
  # Strip leading/trailing blanks first: strsplit("\\s+") below would otherwise
  # emit an empty leading token for an indented line, which as.numeric() turns
  # into NA and which then corrupts the cumsum() grouping.
  trimws %>%
  grep(pattern = "^\\s*\\d|var", value = TRUE) %>%
  # Prefix each var line with the sentinel.
  sub(pattern = "var", replacement = "-9999 var") %>%
  # Blank out everything except digits, dots, commas and minus signs.
  gsub(pattern = "[^0-9.,-]", replacement = " ") %>%
  gsub(pattern = ",", replacement = "\n") %>%
  strsplit("\\s+") %>%
  unlist %>%
  as.numeric %>%
  # Each sentinel opens a new group.
  split(cumsum(. == -9999)) %>%
  map_df(function(x) {
    # Drop the sentinel and fold the remaining numbers into (index, value)
    # rows: rows 1-3 are var1..var3, the rest are iteration/data pairs.
    x <- t(matrix(x[-1], 2))
    data.frame(
      var1 = x[1, 2], var2 = x[2, 2], var3 = x[3, 2],
      iteration = x[-(1:3), 1], data = x[-(1:3), 2]
    )
  })
dplyr / tidyr
我们也可以改用 dplyr 和 tidyr 包。vars 有 3 列(var1、var2 和 var3),每组一行。values 只有一列,其中每个元素是一个由 iteration 和 data 两列组成的嵌套数据框;它同样每组一行,但每一行中嵌套的数据框包含多行。
library(tidyr)
library(dplyr)

# One row per group with the var1/var2/var3 settings. "=" and "," are turned
# into blanks so each var line reads as six whitespace-separated fields, of
# which only the three value fields are kept.
vars <- read.table(
  text = gsub("[=,]", " ", grep("var", L, value = TRUE)),
  col.names = c(NA, "var1", NA, "var2", NA, "var3")
) %>%
  select(var1, var2, var3)

# One row per group holding a nested (iteration, data) data frame. A new group
# starts at every row whose iteration number is 0.
values <- read.table(
  text = grep("^\\d", trimws(L), value = TRUE),
  col.names = c("iteration", "data")
) %>%
  mutate(g = cumsum(iteration == 0)) %>%
  nest(-g) %>%
  select(-g)

# Pair each group's settings with its nested table, then expand the nesting.
unnest(cbind(vars, values))
注意:本答案中使用的输入 L 按如下方式创建:
# Sample input used by the pipelines in this answer: two "Measurement"
# sections, each with a var line, an iteration table and a statistics summary.
Lines <- "Measurement: mc
Loop:
var1=0, var2=-5, var3=1.8
values:
iteration data
0 1.203
1 1.206
2 2.206
3 1.201
4 1.204
5 1.204
6 1.204
statistics:
max 1.206
min 1.201
mean 1.204
stddev 0.001
avgdev 0.001
failedtimes 0
Measurement: mc
Loop:
var1=10, var2=-5, var3=1.8
values:
iteration data
0 1.203
1 1.206
2 2.206
3 1.201
statistics:
max 1.206
min 1.201
mean 1.204
stddev 0.001
avgdev 0.001
failedtimes 0"
# Split the sample into a character vector with one element per line. The text
# connection is closed explicitly so it is not left open after reading.
con <- textConnection(Lines)
L <- readLines(con)
close(con)