Question

我对 R 中的正则表达式有几个疑问。这些问题与我先前在 Stack Overflow 上提出的一个问题相关，但范围更大。

我在文本文件中有代表统计信息的行。我将这些文件加载到R中，并希望获取其中一些统计信息的值。

第一种情况，此类型的统计信息：

system.cpu.dcache.overall_accesses::.cpu.data     42519477                       # number of overall (read+write) accesses
system.l2.overall_accesses::.cpu.data         1335898                       # number of overall (read+write) accesses
system.l3.overall_accesses::.cpu.data         1331502                       # number of overall (read+write) accesses

在这种情况下，我想将高速缓存级别捕获为字符串（即位于 "system." 和 ".overall_accesses" 之间的部分），并将值（两段空白之间的数字）捕获为整数。

cache_level = "cpu.dcache"、"l2" 或 "l3"；value = "42519477" 等。

第二种情况：

system.l3.compressor.compression_size::256 58740 # Number of blocks that compressed to fit in 256 bits"
system.l3.compressor.compression_size::256 65742 # Number of blocks that compressed to fit in 512 bits"

在这种情况下，我想以字符串形式捕获缓存级别，以整数形式捕获值，以及压缩大小（即256或512）。压缩大小始终为数字。

compression_size =“ 256”或“ 512”

第三种情况：

system.l2.compressor.encoding::Base4_1 87521 # Number of data entries that match encoding Base4_1
system.l2.compressor.encoding::Base8_1 58731 # Number of data entries that match encoding Base8_1
system.l2.compressor.encoding::Uncompressed 24125 # Number of data entries that match encoding Uncompressed

这种情况类似于第二种情况，因为我想得到相同的东西，但是编码是字符串。

compression_encoding = "Base4_1"、"Base8_1" 或 "Uncompressed"

为了筛选出对应的行，我想到了类似这样的写法：

For first: if (grepl("system.+\\.*.overall_accesses::.cpu.data", line))
For second: if (grepl("system.\\.*.compressor.compression_size::\\d+", line))
For third: if (grepl("system.\\.*.compressor.encoding::\\.*", line))

我不确定这些是否会起作用。然后我需要获取不同的数据。

谢谢。

Answer 1

我不确定您要输出哪种格式，但这是使用stringr的建议。

library(stringr)

# Read the statistics file, one element per line.
text <- readLines("/path/to/your/file")

# Cache level: for lines that start with "system." and contain
# ".overall_accesses", keep only the text captured between the two.
cache_level <- str_replace_all(
  str_subset(text, "^system\\..*\\.overall_accesses"),
  "^system\\.(.*)\\.overall_accesses.*",
  "\\1"
)

# Value: on the same kind of lines, keep the run of digits that is surrounded
# by whitespace and convert it to a number.
value <- as.numeric(
  str_replace_all(
    str_subset(text, "^system\\.(.*)\\.overall_accesses"),
    ".*\\s+(\\d+)\\s+.*",
    "\\1"
  )
)

# Compression size: the digits that directly follow "compression_size::".
compression_size <- as.numeric(
  str_replace_all(
    str_subset(text, ".*compression_size.*"),
    ".*compression_size::(\\d+)\\s+.*",
    "\\1"
  )
)

# Compression encoding: the word characters that directly follow "encoding::".
compression_encoding <- str_replace_all(
  str_subset(text, ".*encoding.*"),
  ".*encoding::(\\w+)\\s+.*",
  "\\1"
)

输出：

> cache_level
[1] "cpu.dcache" "l2"         "l3"        
> value
[1] 42519477  1335898  1331502
> compression_size
[1] 256 256
> compression_encoding
[1] "Base4_1"      "Base8_1"      "Uncompressed"

Answer 2

我开发了 unglue 来解决类似的用例，请参见：

# Install the package once if it is not available yet:
# install.packages("unglue")
library(unglue)
# Sample statistic lines: three overall_accesses lines, two compression_size
# lines and three encoding lines.
x <- c(
"system.cpu.dcache.overall_accesses::.cpu.data     42519477",
"system.l2.overall_accesses::.cpu.data         1335898     ",
"system.l3.overall_accesses::.cpu.data         1331502     ",
"system.l3.compressor.compression_size::256 58740",
"system.l3.compressor.compression_size::256 65742",
"system.l2.compressor.encoding::Base4_1 87521",
"system.l2.compressor.encoding::Base8_1 58731",
"system.l2.compressor.encoding::Uncompressed 24125")

# One pattern per kind of line.
#   {name}        captures anything (like .*?) into column `name`
#   {name=regex}  captures what `regex` matches into column `name`
#   {=regex}      matches `regex` without extracting it
patterns <- c(
  "system.{cache_level}.overall_accesses{=[^0-9]*}{value=\\d+}{=.*?}",
  "system.{cache_level}.compressor.compression_size::{compression_size} {value=\\d+}{=.*?}",
  "system.{cache_level}.compressor.encoding::{encoding} {value=\\d+}{=.*?}")

# Patterns are tried in order for each element of x; the first one that
# matches is used. The result has one row per input line.
unglue_data(x, patterns)
#>   cache_level    value compression_size     encoding
#> 1  cpu.dcache 42519477             <NA>         <NA>
#> 2          l2  1335898             <NA>         <NA>
#> 3          l3  1331502             <NA>         <NA>
#> 4          l3    58740              256         <NA>
#> 5          l3    65742              256         <NA>
#> 6          l2    87521             <NA>      Base4_1
#> 7          l2    58731             <NA>      Base8_1
#> 8          l2    24125             <NA> Uncompressed

由 reprex 包（v0.3.0）于 2019-11-06 创建

一个接一个地尝试模式，使用第一个匹配的模式
当{subpattern}不包含=时，它匹配任何内容（等同于.*?）
当花括号内以 = 开头、= 左侧没有名称时（例如 {=.*?}），子模式只参与匹配，匹配到的值不会被提取

在此处详细了解：https://github.com/moodymudskipper/unglue

Answer 3

我基本上已经弄清楚了。不过，这并不是最优雅的方式，这就是我在这里希望得到的答案。

第一种情况：

# Case 1: "system.<cache>.overall_accesses::.cpu.data   <value>   # ..." lines.
# Expects `line` to be a single character string (one line of the stats file).
# Literal dots are escaped so that "." no longer matches arbitrary characters.
accesses_pattern <- "system\\..+\\.overall_accesses::\\.cpu\\.data"
if (grepl(accesses_pattern, line)) {
  # regexpr() (first match only) keeps stat_line a plain character vector;
  # gregexpr() returned a list that gsub() then had to coerce implicitly.
  stat_line <- regmatches(line, regexpr(accesses_pattern, line))
  # Capture the text between "system." and ".overall_accesses".
  cache_name <- sub("^system\\.(.+)\\.overall_accesses::.*$", "\\1", stat_line)
  if (cache_name == "cpu.dcache") cache_name <- "dcache"
  stat <- "overall_accesses_data"
  # The value is the run of digits, surrounded by whitespace, that precedes
  # the trailing "#" comment. It is kept as a character string.
  value <- gsub(".*\\s+(\\d+)\\s+#.*", "\\1", line)
}

第二种情况：

# Case 2: "system.<cache>.compressor.compression_size::<size> <value> # ..."
# Expects `line` to be a single character string (one line of the stats file).
# Literal dots are escaped so that "." no longer matches arbitrary characters.
size_pattern <- "system\\..+\\.compressor\\.compression_size::\\d+"
if (grepl(size_pattern, line)) {
  # regexpr() (first match only) keeps stat_line a plain character vector;
  # gregexpr() returned a list that gsub() then had to coerce implicitly.
  stat_line <- regmatches(line, regexpr(size_pattern, line))
  # Capture the text between "system." and ".compressor.".
  cache_name <- sub("^system\\.(.+)\\.compressor\\..*$", "\\1", stat_line)
  # Everything after "::" in stat_line is the numeric compression size.
  stat <- paste("compression_size", sub("^.*::", "", stat_line), sep = "_")
  # The value is the run of digits, surrounded by whitespace, that precedes
  # the trailing "#" comment. It is kept as a character string.
  value <- gsub(".*\\s+(\\d+)\\s+#.*", "\\1", line)
}

第三种情况：

# Case 3: "system.l<N>.compressor.encoding::<encoding> <value> # ..." lines.
# Expects `line` to be a single character string, and `cache_level`, `stat`
# and `value` to already exist as vectors that results are appended to.
# Literal dots are escaped, and the same pattern is used for the test and the
# extraction so that a line which passes the test always yields a match.
encoding_pattern <- "system\\.l\\d\\.compressor\\.encoding::\\w+"
if (grepl(encoding_pattern, line)) {
  # regexpr() (first match only) keeps stat_line a plain character vector that
  # ends right after the encoding name, instead of running on into the comment.
  stat_line <- regmatches(line, regexpr(encoding_pattern, line))
  # Capture the cache level between "system." and ".compressor.".
  cache_name <- sub("^system\\.(l\\d)\\.compressor\\..*$", "\\1", stat_line)
  cache_level[length(cache_level) + 1] <- cache_name
  # Everything after "encoding::" in stat_line is the encoding name.
  stat[length(stat) + 1] <- paste(
    "encoding",
    sub("^.*encoding::", "", stat_line),
    sep = "_"
  )
  # The value is the run of digits, surrounded by whitespace, that precedes
  # the trailing "#" comment. It is kept as a character string.
  value[length(value) + 1] <- gsub(".*\\s+(\\d+)\\s+#.*", "\\1", line)
}

另外，为了让答案更贴合标题、更具通用性：

在字符串之间捕获文本：（缓存名称的情况）

gsub('system.\\s*|\\s*.overall.*$', '', stat_line)

在字符串之间获取数字：（统计值的情况）

gsub(".*\\s+(\\d+)\\s+#.*", "\\1", line)

在字符串后抓取数字：（压缩大小的情况下）

gsub('.*::\\s*|\\s*', '', stat_line)

在字符串后捕获文本或文本和数字的混合：（编码情况）

gsub('.*encoding::(\\w+)\\s+.*', '\\1', stat_line)

感谢prosoitos帮助处理最后一个案例。

R中的正则表达式-字符串之间的文本，字符串和数字之间的数字以及字符串后的文本

3 个答案: