我有一个从 SAS 导出的宏文件,我想解析它,以便使用 R 和 markdown 构建文档(由于安全限制,我无法使用现有的外部软件来完成这项工作)。
具体来说,我想为每个宏提取:宏名称、宏描述、参数(及其说明)、USES 部分、EXAMPLE(S) 部分以及宏的代码主体。
不幸的是,我的正则表达式功底不足,又一次让我吃了亏,尽管我并不认为这些规则很复杂。
请参阅下面的示例和预期输出:
# Sample SAS source used as parser input. It holds three `%macro ... %mend;`
# definitions separated by text that should be ignored; the third macro has
# no description, parameters, USES or EXAMPLE(S) section.
my_text <- "
%macro macro_name_1
/*----------------------------------------------------------------------------------
optional macro description on one or several lines.
this section always starts with slash star dashes and ends with dashes star slash
and it never contains these combinations of characters in the text
----------------------------------------------------------------------------------*/
(param1 /* optional description of param1 */
,param2
,param3 /* optional description of param3 */
);
/* USES:
some info on one or several lines,
always starts with 'slash star USES:'
and ends with 'star slash'
but doesn't contain these combinations of characters
*/
/* EXAMPLES:
some examples on one or several lines,
always starts with 'slash star EXAMPLES:' OR 'slash star EXAMPLE:'
and ends with 'star slash'
but doesn't contain these combinations of characters
*/
some code on one or several lines,
always after USES and EXAMPLE(S) sections
that may or not contain combinations of /* and */
%mend;
some text outside of a macro-mend pattern, which I wish to ignore
%macro macro_name_2
/*---------------------
desc of macro_name_2
---------------------*/
(x
,y /* desc of y*/
);
/* USES: something */
/* EXAMPLE:
example for macro_name2
*/
code2
%mend;
some more irrelevant text
%macro macro_name_3;
code3
%mend;
"
输出不必与我在此给出的完全相同,但至少应具有相似的结构(为便于阅读,文本已缩写):
# Target shape of the parsed result: one row per macro, with the parameters
# kept as a named list (parameter name -> description, "" when undocumented).
expected_output <- tibble::tribble(
  ~"macro_name", ~"description", ~"parameters", ~"uses", ~"examples", ~"code",
  # Row 1: macro with every optional section present.
  "macro_name_1",
  "optional macro...",
  list(
    param1 = "optional desc...",
    param2 = "",
    param3 = "optional desc..."
  ),
  "Some info...",
  "some examples...",
  "some code...",
  # Row 2: compact one-line USES, singular EXAMPLE header.
  "macro_name_2",
  "desc of macro_name_2",
  list(x = "", y = "desc of y"),
  "something",
  "example for...",
  "code2",
  # Row 3: bare macro with only a body.
  "macro_name_3",
  "",
  list(),
  "",
  "",
  "code3"
)
# Printed form of the tibble above:
# # A tibble: 3 x 6
# macro_name description parameters uses examples code
# <chr> <chr> <list> <chr> <chr> <chr>
# 1 macro_name_1 optional macro... <list [3]> Some info... some examples... some code...
# 2 macro_name_2 desc of macro_name_2 <list [2]> something example for... code2
# 3 macro_name_3 <list [0]> code3
答案 0 :(得分:2)
我相信这会让你非常接近你想要的结果。我没有花时间把它放入 tibble,但我想你可以根据自己的喜好安排这些组件。
它在很大程度上借鉴了另一个回答(this answer)。它只使用了 stringr 中的一个函数——str_split_fixed,该函数对于获取参数及其描述非常方便。
library(stringr)

# Script: split `my_text` into one string per SAS macro, then peel off the
# name, description, parameters, USES, EXAMPLE(S) and body in that order.
# After the parameters, each extraction step also strips what it consumed
# from `macro`, so the body is whatever remains at the end.

# Return the first match of each element of a `regmatches()` result, or ""
# when an element has no match. Keeping exactly one string per macro keeps
# every result vector aligned with `macro`, even if a pattern happens to
# match more than once inside a macro.
first_match <- function(matches) {
  vapply(matches,
         function(x) if (length(x) > 0) x[1] else "",
         character(1))
}

# Make one character string per macro.
# Newlines are replaced by spaces first because, with perl = TRUE, `.` does
# not match a newline.
macro <- gsub("\\n", " ", my_text)
macro <- regmatches(macro, gregexpr("(?=%macro).*?(?<=%mend)", macro, perl = TRUE))[[1]]

# MACRO NAME --------------------------------------------------------
# The name runs from `%macro ` up to the first space, '(', '/' or ';', so a
# name written directly against its parameter list or terminator is still
# extracted cleanly.
macro_name <- first_match(
  regmatches(macro, gregexpr("(?<=%macro )[^ (/;]+", macro, perl = TRUE))
)

# MACRO DESCRIPTION -------------------------------------------------
# Only a dashed banner comment (`/*---- ... ----*/`) counts as a description.
# Matching any comment here would pick up a parameter comment or the USES
# block for macros that have no description.
macro_desc <- first_match(
  regmatches(macro, gregexpr("/[*]-+.*?-+[*]/", macro, perl = TRUE))
)
macro_desc <- gsub("(/[*][-]+|[-]+[*]/)", "", macro_desc)
macro_desc <- trimws(macro_desc)

# MACRO PARAMETERS --------------------------------------------------
# First "( ... );" group of each macro, with the delimiters removed.
param <- first_match(
  regmatches(macro, gregexpr("(?=\\().*?(?<=\\);)", macro, perl = TRUE))
)
param <- gsub("(\\(|\\)|;)", "", param)
# One element per parameter; a description containing ',' would be split too.
param <- strsplit(param, ",")
# Column 1 = parameter name, column 2 = the rest (its optional comment).
param <-
  lapply(param,
         str_split_fixed,
         " ",
         n = 2)
# Drop the comment markers and surrounding blanks from both columns.
param <-
  lapply(param,
         function(x) trimws(gsub("(/[*]|[*]/)", "", x)))

# Clean out everything up to the end of the parameters.
# The pattern stops at the FIRST ';' after `%macro`, so this is problematic
# if a ';' appears before the end of the parameters definition (for example
# inside a description comment).
macro <- trimws(sub("^\\%macro.*?;", "", macro))

# MACRO USES --------------------------------------------------------
# Just in case there are multiple spaces between /* and USES, coerce it to
# only one space.
macro <- sub("/[*] +USES", "/* USES", macro)
uses <- trimws(first_match(
  regmatches(macro, gregexpr("(?<=/[*] USES[:]).*?(?=[*]/)", macro, perl = TRUE))
))

# Clean out everything up to the end of the USES
macro <- trimws(sub(".+(?<=/[*] USES[:]).*?(?<=[*]/)", "", macro, perl = TRUE))

# MACRO EXAMPLES ----------------------------------------------------
# Just in case there are multiple spaces between /* and EXAMPLES, coerce it to
# only one space.
macro <- sub("/[*] +EXAMPLE", "/* EXAMPLE", macro)
examples <- first_match(
  regmatches(macro, gregexpr("(?<=/[*] EXAMPLE).*?(?=[*]/)", macro, perl = TRUE))
)
# The match starts right after "EXAMPLE", so strip the leftover "S:" or ":".
examples <- trimws(sub("^(S[:]|[:]) +", "", examples))

# Clean out everything up to the end of the EXAMPLES
macro <- trimws(sub(".+(?<=/[*] EXAMPLE).*?(?<=[*]/)", "", macro, perl = TRUE))

# MACRO BODY --------------------------------------------------------
# At this point, the body should be the only thing left in `macro`, except
# for the `%mend` call
body <- trimws(sub("%mend(|.+)$", "", macro))

# RESULTS
macro_name # character vector
macro_desc # character vector
param # list of two column matrices
uses # character vector
examples # character vector
body # character vector
答案 1 :(得分:2)
这是一个受 @Benjamin 的回答启发的解决方案(他的方案在真实数据上出现了一些问题,所以我重新设计了它):
# Parse SAS macro definitions out of source text.
#
# Arguments:
#   txt  Character vector holding the SAS source. A vector of lines (e.g. the
#        result of readLines()) is joined with newlines before parsing.
#
# Returns a tibble with one row per `%macro ... %mend;` block and the columns
#   full_code   the complete macro text
#   name        macro name
#   desc        text of the `/*--- ... ---*/` banner, "" if absent
#   params      list column: character vector of parameter names
#   param_defs  list column: parameter descriptions ("" when undocumented),
#               parallel to `params`
#   uses        text of the `/* USES: ... */` comment, "" if absent
#   examples    text of the `/* EXAMPLE(S): ... */` comment, "" if absent
#   body        what follows the first ';' once the USES and EXAMPLE(S)
#               comments are removed
#
# The patterns are run with R's default regex engine and rely on `.`
# spanning line breaks; do not switch them to perl = TRUE without adapting
# them.
sas2tibble <- function(txt) {
  # For each element of `x`, return the requested capture group of the first
  # match of `pattern`, trimmed, or "" when there is no match. The result
  # always has the same length as `x`.
  capture_first <- function(pattern, x, group = 1L) {
    hit <- regexpr(pattern, x)
    out <- character(length(x))
    found <- hit != -1L
    out[found] <- sub(pattern, paste0("\\", group), regmatches(x, hit))
    trimws(out)
  }

  # Accept a vector of lines as well as a single string.
  txt <- paste(txt, collapse = "\n")

  # FULL MACROS
  # macros all start by '%macro' and finish by '%mend;'
  # These keywords are never used elsewhere
  full_macros <- regmatches(txt, gregexpr('%macro (.+?)%mend;', txt))[[1]]

  # NAMES
  # names are always after %macro and stop before either:
  # - description : starting with '/'
  # - parameters : starting with '('
  # - end of statement : ';'
  names <- capture_first('%macro (.+?)[/\\(;]', full_macros)

  # DESCRIPTIONS
  # They are always between %macro and the first ';'
  # They are between /*-- and --*/
  descriptions <- capture_first(
    '%macro (.+?)/\\*(-+)(.+?)(-+)\\*/(.+?);',
    full_macros,
    group = 3L
  )

  # PARAMETERS
  # They are always between %macro and the first ';'
  # They are after description if it exists
  # They are between '(' and ')'
  params_raw <- capture_first(
    '%macro (.+?)(/\\*(-+)(.+?)(-+)\\*/)*[\n ]*(\\((.+?)\\))(.+?);',
    full_macros,
    group = 7L
  )
  # One chunk per parameter: "name /* optional description */".
  # A description that itself contains ',' would be split as well.
  param_chunks <- strsplit(params_raw, ",")
  comment_pattern <- '/\\*(.+?)\\*/'
  # lapply (not sapply) so both columns stay lists even when every macro has
  # the same number of parameters.
  param_defs <- lapply(
    param_chunks,
    function(chunk) capture_first(comment_pattern, chunk)
  )
  params <- lapply(
    param_chunks,
    function(chunk) trimws(sub(comment_pattern, "", chunk))
  )

  # USES
  # always between '/* USES:' and next `*/`
  uses_pattern <- '/\\* USES:(.+?)\\*/'
  uses <- capture_first(uses_pattern, full_macros)

  # EXAMPLES
  # always between '/* EXAMPLE(S):' and next `*/`
  examples_pattern <- '/\\* EXAMPLES*:(.+?)\\*/'
  examples <- capture_first(examples_pattern, full_macros)

  # BODY
  # after first ';' but not uses or examples
  body <- sub(uses_pattern, "", full_macros)
  body <- sub(examples_pattern, "", body)
  body <- capture_first(";(.*?)%mend;", body)

  tibble::tibble(
    full_code = full_macros,
    name = names,
    desc = descriptions,
    params = params,
    param_defs = param_defs,
    uses = uses,
    examples = examples,
    body = body
  )
}
# Interactive check: parse the sample text and open the result in the data
# viewer.
View(sas2tibble(my_text))
我认为最干净的解决方案是找到一个与整体结构匹配的正则表达式,然后使用 \\1、\\2 等提取相关信息……很遗憾我没能写出这样的表达式,不过目前的方案已经够用了。