答案

Question

我有一个文件，其中包含不同类型的日期格式及其ID。我需要使用正则表达式提取所有字符串。

# Sample input: eight free-text notes (ids 1-8) containing dates written in
# many different formats; row 2 contains amounts only, no dates.
df <- data.frame(
  id = seq_len(8),
  text = c(
    "deficit based on wage statement 7/14/ to 7/17/2015.",
    "Deficit Due: $1205.73 -$879.63= $326.10 x 70%=$228.2.",
    "Deficit Due for 12 wks pd - 7/14/15 thru 10/5/15;Deficit due to wage,",
    "statement: 4/22/15 thru 5/12/15,depos transcript 7/10/15 for 7/8/15 depos,",
    "difference owed for 4/25/15-5/22/15 10-29-99 Feb. 25, 2009,",
    "tpd 4:30:2015 - 5:22:2015--09/26/99, 7-14 1.3.99, 1.3.1999,",
    "Medical TREATMENT DATES:  6/30/2015 -  30/06/2015 09/26/1999,",
    "4/25/15-5/22/15,Medical 2010-01-29 **2010-01-30 February25,2009, February 25, 2009"
  )
)

到目前为止，我已经使用多个OR语句创建了正则表达式。

    #Different string patterns
# Day alternatives: 31 down to 1, followed by the zero-padded forms 01-09.
# Longer numbers come first so that e.g. "17" is tried before "1".
day <- c(as.character(31:1), sprintf("%02d", 1:9))
day_p <- paste0("(", paste(day, collapse = "|"), ")")

# Month alternatives: 12 down to 1, followed by the zero-padded forms 01-09.
month <- c(as.character(12:1), sprintf("%02d", 1:9))
month_p <- paste0("(", paste(month, collapse = "|"), ")")

# Four-digit year.
year <- "\\d{4}"
year_p <- paste0("(", year, ")")

# Two-digit year.
year_i <- "\\d{2}"
year_i_p <- paste0("(", year_i, ")")

# Separator symbols: . | / - : ,
symbol_p <- paste0(
  "(",
  paste(c("\\.", "\\|", "\\/", "\\-", "\\:", "\\,"), collapse = "|"),
  ")"
)

# One parenthesised alternative per date layout, joined with "|".
# Order matters: earlier alternatives win when several could match.
patterns <- paste0(
  "(",
  c(
    paste0(month_p, symbol_p, day_p, symbol_p, year_p),   # M-D-YYYY
    paste0(day_p, symbol_p, month_p, symbol_p, year_p),   # D-M-YYYY
    paste0(year_p, symbol_p, month_p, symbol_p, day_p),   # YYYY-M-D
    paste0(month_p, symbol_p, day_p, symbol_p, year_i_p), # M-D-YY
    paste0(day_p, symbol_p, month_p, symbol_p, year_i_p), # D-M-YY
    paste0(year_i_p, symbol_p, month_p, symbol_p, day_p), # YY-M-D
    paste0(month_p, "\\-", day_p),                        # M-D
    paste0(day_p, "\\-", month_p),                        # D-M
    paste0(month_p, "\\/", day_p),                        # M/D
    paste0(day_p, "\\/", month_p)                         # D/M
  ),
  ")",
  collapse = "|"
)

# String extraction: for each element of df$text, collect every substring
# matched by `patterns` (the result is a list with one character vector per row).
# str_extract_all() comes from stringr, so the package must be attached first.
library(stringr)
extract <- str_extract_all(df$text, patterns)

是否有方法将所有正则表达式规则放在数据框中，命名每个规则并进行字符串提取？

# Regex rules held in a data frame: one row per rule, with the pattern text
# and a numeric rule identifier (1 = month-day-year, 2 = day-month-year).
df_patterns <- data.frame(
  pattern = paste0(
    "(",
    c(
      paste0(month_p, symbol_p, day_p, symbol_p, year_p),
      paste0(day_p, symbol_p, month_p, symbol_p, year_p)
    ),
    ")"
  ),
  rule = c(1, 2)
)

输出数据框应包括提取值和触发其提取的规则。

# Desired output shape: one row per extracted string together with the rule
# that produced it; an id without any match keeps a single row of NAs.
output <- data.frame(
  id = c(1, 1, 2, 3, 3),
  string = c("7/14", "7/17/2015", NA, "7/14/15", "10/5/15"),
  rule = c(9, 1, NA, 2, 3)
)

Answer 1

stringr有一个名为str_match_all的函数，它可以提取所有匹配项，并返回在不同列中匹配的捕获组。这对于这个问题很方便，因为您可以命名捕获组并将它们与str_match_all的每一列输出相关联：

# Building blocks for the date patterns.
# Day: one or two digits, the first (optional) digit limited to 0-3.
day_p <- "[0-3]?[0-9]"

# Month: one or two digits, the first (optional) digit limited to 0-1.
month_p <- "[0-1]?[0-9]"

# Four-digit year.
year_p <- "\\d{4}"

# Two-digit year.
year_i_p <- "\\d{2}"

# Separator symbols: - / : .
symbol_p <- "[-/:.]"

# Each date layout is wrapped in its own capture group and the groups are
# joined with "|". The position of a group in this vector is what later
# identifies the rule that matched, so the order must stay in sync with the
# rule names.
patterns <- paste(
  sprintf(
    "(%s)",
    c(
      paste0(month_p, symbol_p, day_p, symbol_p, year_p),
      paste0(day_p, symbol_p, month_p, symbol_p, year_p),
      paste0(year_p, symbol_p, month_p, symbol_p, day_p),
      paste0(month_p, symbol_p, day_p, symbol_p, year_i_p),
      paste0(day_p, symbol_p, month_p, symbol_p, year_i_p),
      paste0(year_i_p, symbol_p, month_p, symbol_p, day_p),
      paste0(month_p, "[-]", day_p),
      paste0(day_p, "[-]", month_p),
      paste0(month_p, "[/]", day_p),
      paste0(day_p, "[/]", month_p),
      # Month written as a word, e.g. "Feb. 25, 2009" or "February25,2009".
      paste0("\\w+[.]?[\\s]?\\d+[,]\\s?", year_p)
    )
  ),
  collapse = "|"
)

# Name the capture groups
# Human-readable names for the capture groups, listed in the same order as
# the alternatives inside `patterns`.
rule_names <- c(
  "MDYYYY",
  "DMYYYY",
  "YYYYMD",
  "MDYY",
  "DMYY",
  "YYMD",
  "MD_dash",
  "DM_dash",
  "MD_slash",
  "DM_slash",
  "MDYYYY_word"
)

library(stringr) # provides str_match_all(), used below
library(dplyr)
library(tidyr)
library(purrr)

# Build a long table with one row per (id, matched string, rule name).
df$text %>%
  # One character matrix per text: column 1 is the full match, the remaining
  # columns are the capture groups (one per rule).
  str_match_all(patterns) %>%
  map2(df$id, function(x, y) {
    # A text without matches yields a 0-row matrix; add a row of NAs so the
    # id can still be bound to a row at this stage.
    if (nrow(x) == 0) {
      x <- rbind(x, NA)
    }
    data.frame(id = y, x)
  }) %>%
  do.call(rbind, .) %>%
  # Turn every capture-group column (everything except id and the full match
  # X1) into a 1/NA indicator. Selecting by exclusion covers all rule columns,
  # however many alternatives `patterns` contains.
  mutate_at(vars(-id, -X1), ~ ifelse(!is.na(.), 1, NA)) %>%
  setNames(c("id", "string", rule_names)) %>%
  # gather() stacks the rule columns one after another, so within an id the
  # rows end up ordered by rule position once arrange(id) is applied.
  gather(rule, value, -id, -string) %>%
  # Keeps only the (string, rule) pairs that matched; this also removes the
  # placeholder NA rows, so ids without any match do not appear in the result.
  na.omit() %>%
  select(-value) %>%
  arrange(id)

备注：

最后这一部分完成了所有工作。str_match_all 返回一个列表，df$text 的每个值对应其中一个元素；每个元素是一个字符矩阵，包含完整匹配以及各个捕获组。

map2 将 id 与字符矩阵绑定，使每一行都对应一个 id + 匹配项的组合。if 语句检查该元素是否没有任何匹配；如果没有，就用 rbind 追加一行 NA 值，这样每个 id 至少有一行可以绑定。

mutate_at 将每个“capture_group”列转换为虚拟变量，用于指示“此捕获组是否匹配”。

使用 rule_names 重命名捕获组列，并将所有虚拟变量合并为单个分类变量。

重要提示：无法判断 "5/6/2015" 是 MDYYYY 还是 DMYYYY 格式，因此在这种情况下，您必须调整 patterns 中各模式的顺序，使其中一种优先（例如，如果 MDYYYY 在 patterns 中位于 DMYYYY 之前，则 "5/6/2015" 会首先被匹配为 MDYYYY）。

结果：

   id            string        rule
1   1         7/17/2015      MDYYYY
2   1              7/14    MD_slash
3   3           7/14/15        MDYY
4   3           10/5/15        MDYY
5   4           4/22/15        MDYY
6   4           5/12/15        MDYY
7   4           7/10/15        MDYY
8   4            7/8/15        MDYY
9   5           4/25/15        MDYY
10  5           5/22/15        MDYY
11  5          10-29-99        MDYY
12  5     Feb. 25, 2009 MDYYYY_word
13  6         4:30:2015      MDYYYY
14  6         5:22:2015      MDYYYY
15  6          1.3.1999      MDYYYY
16  6          09/26/99        MDYY
17  6            1.3.99        MDYY
18  6              7-14     MD_dash
19  7         6/30/2015      MDYYYY
20  7        09/26/1999      MDYYYY
21  7        30/06/2015      DMYYYY
22  8        2010-01-29      YYYYMD
23  8        2010-01-30      YYYYMD
24  8           4/25/15        MDYY
25  8           5/22/15        MDYY
26  8   February25,2009 MDYYYY_word
27  8 February 25, 2009 MDYYYY_word

Answer 2

答案

简

如果我错了，请纠正我，但我相信R确实支持PCRE正则表达式。在这种情况下，您可以使用以下正则表达式来捕获您指定格式的任何日期。

代码

See this regex in use here

(?# NOTE: the multi-line, indented layout presumably relies on extended x mode being enabled where this pattern is used - confirm )
(?(DEFINE)
  (?# Definitions )
  (?# day: 1-31, leading zero optional for 1-9 )
  (?<day>[12]\d|3[01]|0?[1-9])
  (?# month: 1-12, leading zero optional for 1-9 )
  (?<month>1[0-2]|0?[1-9])
  (?# year: one or more digits, so any length is accepted )
  (?<year>\d+)
  (?# separator: exactly one of . | / : , - )
  (?<separator>[.|\/:,-])
  (?# Date formats )
  (?# In each format the second separator is a subroutine call to the *_1 group: it re-applies the separator pattern and, unlike a backreference, does not require the same character as the first separator )
  (?<mdy>(?&month)(?<mdy_1>(?&separator))(?&day)(?&mdy_1)(?&year))
  (?<dmy>(?&day)(?<dmy_1>(?&separator))(?&month)(?&dmy_1)(?&year))
  (?<ymd>(?&year)(?<ymd_1>(?&separator))(?&month)(?&ymd_1)(?&day))
  (?# md and dm: two-part dates without a year; a trailing separator is optional )
  (?<md>(?&month)(?<md_1>(?&separator))(?&day)(?&md_1)?)
  (?<dm>(?&day)(?<dm_1>(?&separator))(?&month)(?&dm_1)?)
  (?# Date )
  (?# date: any of the formats above, tried in the listed order )
  (?<date>(?&mdy)|(?&dmy)|(?&ymd)|(?&md)|(?&dm))
)
(?# Actual match: a date that starts at a word boundary or right after whitespace, and ends at a word boundary or right before whitespace )
(?<=\b|\s)(?&date)(?=\b|\s)

说明

define块指定我们对构成天，月，年，分隔符的所有定义。它还定义了我们的日期格式（ mdy ， dmy ， ymd ， md ， dm ）。最后，它定义了我们的 date 组，它是我们所有日期格式之间的简单 OR 。

最终的正则表达式只是规定：匹配内容的前后应当是单词边界 \b 或空白字符 \s（这里之所以加上空白字符，是为了在最后一个字符本身是单词边界字符的情况下，也能把这个最终字符一并捕获——您可以删除最终正则表达式中的 |\s，并用第一个匹配项进行测试来查看差异）。

请注意，这假设一个月的日期可以达到31（更具体的检查会导致一个非常冗长的正则表达式，并且当您可以通过代码验证它时似乎毫无意义。）

结果

输入

deficit based on wage statement 7/14/ to 7/17/2015.
Deficit Due: $1205.73 -$879.63= $326.10 x 70%=$228.2.
Deficit Due for 12 wks pd - 7/14/15 thru 10/5/15;Deficit due to wage,
statement: 4/22/15 thru 5/12/15,depos transcript 7/10/15 for 7/8/15 depos,
difference owed for 4/25/15-5/22/15 10-29-99 Feb. 25, 2009,
tpd 4:30:2015 - 5:22:2015--09/26/99, 7-14 1.3.99, 1.3.1999,
Medical TREATMENT DATES:  6/30/2015 -  30/06/2015 09/26/1999,
4/25/15-5/22/15,Medical 2010-01-29 **2010-01-30 February25,2009, February 25, 2009

输出

7/14/
7/17/2015
7/14/15
10/5/15
4/22/15
5/12/15
7/10/15
7/8/15
4/25/15
5/22/15
10-29-99
4:30:2015
5:22:2015
09/26/99
7-14
1.3.99
1.3.1999
6/30/2015
30/06/2015
09/26/1999
4/25/15
5/22/15
2010-01-29
2010-01-30

编辑

代码

See this code in use here

(?# NOTE: the multi-line, indented layout presumably relies on extended x mode being enabled where this pattern is used - confirm )
(?(DEFINE)
  (?# Definitions )
  (?# day: 1-31, leading zero optional for 1-9 )
  (?<day>[12]\d|3[01]|0?[1-9])
  (?# month: 1-12, leading zero optional for 1-9 )
  (?<month>1[0-2]|0?[1-9])
  (?# year: one or more digits, so any length is accepted )
  (?<year>\d+)
  (?# separator: exactly one of . | / : , - )
  (?<separator>[.|\/:,-])
  (?# Date formats )
  (?# The f_ prefix keeps these definitions apart from the capture groups of the same base name used in the final expression )
  (?# In each numeric format the second separator is a subroutine call to the *_1 group: it re-applies the separator pattern and does not require the same character as the first separator )
  (?<f_mdy>(?&month)(?<mdy_1>(?&separator))(?&day)(?&mdy_1)(?&year))
  (?<f_dmy>(?&day)(?<dmy_1>(?&separator))(?&month)(?&dmy_1)(?&year))
  (?<f_ymd>(?&year)(?<ymd_1>(?&separator))(?&month)(?&ymd_1)(?&day))
  (?# f_md and f_dm: two-part dates without a year; a trailing separator is optional )
  (?<f_md>(?&month)(?<md_1>(?&separator))(?&day)(?&md_1)?)
  (?<f_dm>(?&day)(?<dm_1>(?&separator))(?&month)(?&dm_1)?)
  (?# f_Mdy: month as a word, either the full name or the three-letter abbreviation with an optional dot, then the day, then a separator and/or whitespace, then the year )
  (?# NOTE: the month names are lowercase literals; matching Feb. or February presumably relies on a case-insensitive flag set outside this pattern - confirm )
  (?<f_Mdy>(?:jan(?:uary|\.)?|feb(?:ruary|\.)?|mar(?:ch|\.)?|apr(?:il|\.)?|may|jun(?:e|\.)?|jul(?:y|\.)?|aug(?:ust|\.)?|sep(?:tember|\.)?|oct(?:ober|\.)?|nov(?:ember|\.)?|dec(?:ember|\.)?)\s*(?&day)(?:\s*(?&separator)|(?&separator)\s*|\s+)(?&year))
)
(?# Actual match: each alternative wraps one format in its own named capture group, so the group that is set tells which format matched; the lookarounds require a word boundary or whitespace on each side )
(?<=\b|\s)(?:(?<mdy>(?&f_mdy))|(?<dmy>(?&f_dmy))|(?<ymd>(?&f_ymd))|(?<md>(?&f_md))|(?<dm>(?&f_dm))|(?<Mdy>(?&f_Mdy)))(?=\b|\s)

这会将捕获设置为命名捕获组。如果查看链接中的输出，您将看到具有匹配内容的命名组。

R - 如何循环多个正则表达式规则而不是OR语句？

2 个答案:

答案

简

代码

说明

结果

输入

输出

编辑

代码