我有一些 URL 记录,想从中提取一部分内容并创建新列。在我的示例中,我想把 "groups" 之后的数字作为 group_id,把 "discussion_topics" 之后的数字作为 discussion_id。
数据框 df 如下:
user url
1 "https://test.com/groups/3276/discussion_topics/3939"
2 "https://test.com/groups/34/discussion_topics/11"
3 "https://test.com/groups/3276"
4 "https://test.com/groups/other"
我想要得到类似下面的结果:
user group_id discussion_id
1 3276 3939
2 34 11
3 3276 NA
4 NA NA
我该如何使用R中的正则表达式呢?
答案 0 :(得分:3)
# Capture the digits that follow "/groups/" (first alternative). URLs without
# such digits fall through to the bare ".*" alternative, which leaves "\\1"
# empty, and as.numeric("") is NA.
dat$group_id <- as.numeric(
  sub(".*/groups/(\\d+).*|.*", "\\1", dat$url)
)
# Same approach for the digits that follow "/discussion_topics/".
dat$discussion <- as.numeric(
  sub(".*/discussion_topics/(\\d+).*|.*", "\\1", dat$url)
)
dat
user url group_id discussion
1 1 https://test.com/groups/3276/discussion_topics/3939 3276 3939
2 2 https://test.com/groups/34/discussion_topics/11 34 11
3 3 https://test.com/groups/3276 3276 NA
4 4 https://test.com/groups/other NA NA
答案 1 :(得分:1)
下面是一个使用 stringi 软件包和正则表达式的版本。
更新:不得不承认,@Onyambu 的函数更快,参见下面的基准测试。更新 2:在基准测试中加入了第三个版本,速度并没有提升。
library(stringi)
# Add `group` and `topic` columns to `x`, taken from the digits that follow
# "groups/" and "discussion_topics/" in `x$url`.
#
# stri_extract_all_regex() returns a list with one element per URL, so both
# new columns are list columns of character values.
#
# x: a data frame with a `url` column.
# Returns `x` with the two extra columns.
extract_info <- function(x) {
  group_hits <- stringi::stri_extract_all_regex(x$url, "(?<=groups/)\\d+")
  x$group <- group_hits
  topic_hits <- stringi::stri_extract_all_regex(
    x$url,
    "(?<=discussion_topics/)\\d+"
  )
  x$topic <- topic_hits
  x
}
# Apply the stringi-based extractor to the example data frame `dat`; the
# commented lines below are the printed result as reported in the answer.
extract_info(dat)
# user url group topic
# 1 1 https://test.com/groups/3276/discussion_topics/3939 3276 3939
# 2 2 https://test.com/groups/34/discussion_topics/11 34 11
# 3 3 https://test.com/groups/3276 3276 NA
# 4 4 https://test.com/groups/other NA NA
# Base-R extractor: adds numeric `group_id` and `discussion` columns to `dat`.
#
# Each sub() call keeps only the captured digits. When the first alternative
# does not match, the bare ".*" alternative consumes the whole string and
# "\\1" is empty, which as.numeric() turns into NA.
#
# dat: a data frame with a `url` column.
# Returns `dat` with the two extra columns.
extract_info2 <- function(dat) {
  group_txt <- sub(".*/groups/(\\d+).*|.*", "\\1", dat$url)
  topic_txt <- sub(".*/discussion_topics/(\\d+).*|.*", "\\1", dat$url)
  dat$group_id <- as.numeric(group_txt)
  dat$discussion <- as.numeric(topic_txt)
  dat
}
# regmatches()-based extractor: adds numeric `group_id` and `discussion`
# columns to `data`.
#
# Fixes over the previous version:
#   * operates on the `data` argument instead of a global `df`;
#   * URLs without a match yield NA instead of making as.numeric() fail on a
#     list that contains character(0);
#   * the pattern no longer requires an extra character before the digits, so
#     single-digit ids such as "groups/7" are extracted correctly.
#
# data: a data frame with a `url` column.
# Returns `data` with the two extra columns.
extract_info3 <- function(data) {
  # First run of digits following `prefix` in each URL, or NA when absent.
  extract_id <- function(urls, prefix) {
    urls <- as.character(urls)
    # \\K drops the prefix from the reported match, leaving only the digits.
    hits <- regexpr(paste0(prefix, "\\K\\d+"), urls, perl = TRUE)
    found <- !is.na(hits) & hits > 0L
    ids <- rep(NA_character_, length(urls))
    # regmatches() returns matches only for the elements that matched.
    ids[found] <- regmatches(urls, hits)
    as.numeric(ids)
  }

  data$group_id <- extract_id(data$url, "groups/")
  data$discussion <- extract_id(data$url, "discussion_topics/")
  data
}
# Time the three extractors on the same input data frame.
microbenchmark::microbenchmark(
  extract_info(dat),
  extract_info2(dat),
  extract_info3(dat)
)
# Unit: microseconds
# expr min lq mean median uq max neval
# extract_info(dat) 152.769 160.269 172.1629 170.5325 176.0590 300.011 100
# extract_info2(dat) 99.872 106.386 120.9876 117.2415 125.7285 226.981 100
# extract_info3(dat) 285.799 301.984 378.7235 308.8925 323.3000 6684.297 100
答案 2 :(得分:1)
这是另一种选择:
# Extract the digits that follow "groups/" and "discussion_topics/".
# gregexpr() + regmatches() returns character(0) for URLs without a match,
# which as.numeric() cannot coerce from a list, so each element is reduced to
# its first match or NA before conversion. \\K drops the prefix from the match,
# and "\\d+" directly after it also accepts single-digit ids.
df$group_id <- as.numeric(vapply(
  regmatches(df$url, gregexpr("groups/\\K\\d+", df$url, perl = TRUE)),
  function(m) if (length(m) > 0L) m[1L] else NA_character_,
  character(1)
))
df$discussion <- as.numeric(vapply(
  regmatches(
    df$url,
    gregexpr("discussion_topics/\\K\\d+", df$url, perl = TRUE)
  ),
  function(m) if (length(m) > 0L) m[1L] else NA_character_,
  character(1)
))