我有一系列步骤要转换为函数,因此我只需通过调用它就可以将它应用于数据框。以下是带有一些注释的代码:
library("textreadr")
library("pdftools")
library("tidyverse")
library("tidytext")
library("textreadr")
library("tm")
# Create Data frame
Off_let_data <- data.frame(page_id = c(3,3,3,3,3), element_id = c(19, 22, 26, 31, 31),
text = c("The Protected Percentage of your property value thats has been chosen is 0%",
"The Arrangement Fee payable at complettion: £50.00",
"The Fixed Interest Rate that is applied for the life of the period is: 5.40%",
"The Benchmark rate that will be used to calculate any early repayment 2.08%",
"The property value used in this scenario is 275,000.00"))
# read in the first element of a list of pdf file from a folder
files <- list.files(pattern = "pdf$")[1]
# extract the account number from the first pdf file
acc_num <- str_extract(files, "^\\d+")
# The RegEx's used to extract the relevant information
protec_per_reg <- "Protected\\sP\\w+\\sof"
Arr_Fee_reg <- "^The\\sArrangement\\sF\\w+"
Fix_inter_reg <- "Fixed\\sI\\w+\\sR\\w+"
Bench_rate_reg <- "Benchmark\\sR\\w+\\sthat"
# create a df that only includes the rows which match the above RegEx
Off_let <- Off_let_data %>% filter(page_id == 3, str_detect(Off_let_data$text, protec_per_reg)|
str_detect(Off_let_data$text, Arr_Fee_reg) | str_detect(Off_let_data$text, Fix_inter_reg) |
str_detect(Off_let_data$text, Bench_rate_reg))
# Now only extract the numbers from the above DF
off_let_num <- str_extract(Off_let$text, "\\d+\\.?\\d+")
# The first element is always a NA value - based on the structure of these PDF files
# replace the first element of this character vector with the below
off_let_num[is.na(off_let_num)] <- str_extract(Off_let$text, "\\d+%")[[1]]
off_let_num
有人可以帮助我把它变成一个功能。感谢
答案 0 :(得分:1)
这样的东西?
该功能的输入/输出应该是什么?目前,该函数仅将data.frame作为参数,但您可以对其进行扩展,因此您可以传递不同的正则表达式,或者例如定义page_id。
library("textreadr")
library("pdftools")
library("tidyverse")
library("tidytext")
library("textreadr")
library("tm")
# Create Data frame
Off_let_data <- data.frame(page_id = c(3,3,3,3,3), element_id = c(19, 22, 26, 31, 31),
text = c("The Protected Percentage of your property value thats has been chosen is 0%",
"The Arrangement Fee payable at complettion: £50.00",
"The Fixed Interest Rate that is applied for the life of the period is: 5.40%",
"The Benchmark rate that will be used to calculate any early repayment 2.08%",
"The property value used in this scenario is 275,000.00"))
dummyFunc <- function(df) {
# read in the first element of a list of pdf file from a folder
files <- list.files(pattern = "pdf$")[1]
# extract the account number from the first pdf file
acc_num <- str_extract(files, "^\\d+")
# The RegEx's used to extract the relevant information
protec_per_reg <- "Protected\\sP\\w+\\sof"
Arr_Fee_reg <- "^The\\sArrangement\\sF\\w+"
Fix_inter_reg <- "Fixed\\sI\\w+\\sR\\w+"
Bench_rate_reg <- "Benchmark\\sR\\w+\\sthat"
# create a df that only includes the rows which match the above RegEx
Off_let <- df %>% filter(page_id == 3, str_detect(df$text, protec_per_reg)|
str_detect(df$text, Arr_Fee_reg) | str_detect(df$text, Fix_inter_reg) |
str_detect(df$text, Bench_rate_reg))
# Now only extract the numbers from the above DF
off_let_num <- str_extract(Off_let$text, "\\d+\\.?\\d+")
# The first element is always a NA value - based on the structure of these PDF files
# replace the first element of this character vector with the below
off_let_num[is.na(off_let_num)] <- str_extract(Off_let$text, "\\d+%")[[1]]
return(off_let_num)
}
dummyFunc(Off_let_data)
对于该功能的更广泛版本:
# The RegEx's used to extract the relevant information
protec_per_reg <- "Protected\\sP\\w+\\sof"
Arr_Fee_reg <- "^The\\sArrangement\\sF\\w+"
Fix_inter_reg <- "Fixed\\sI\\w+\\sR\\w+"
Bench_rate_reg <- "Benchmark\\sR\\w+\\sthat"
regexprlist <- list(protec_per_reg, Arr_Fee_reg,
Fix_inter_reg, Bench_rate_reg)
dummyFuncExt <- function(df, regexp, page_id) {
# read in the first element of a list of pdf file from a folder
files <- list.files(pattern = "pdf$")[1]
# extract the account number from the first pdf file
acc_num <- str_extract(files, "^\\d+")
# create a df that only includes the rows which match the above RegEx
Off_let <- df %>% filter(page_id == page_id, str_detect(df$text, regexprlist[[1]])|
str_detect(df$text, regexprlist[[2]]) | str_detect(df$text, regexprlist[[3]]) |
str_detect(df$text, regexprlist[[4]]))
# Now only extract the numbers from the above DF
off_let_num <- str_extract(Off_let$text, "\\d+\\.?\\d+")
# The first element is always a NA value - based on the structure of these PDF files
# replace the first element of this character vector with the below
off_let_num[is.na(off_let_num)] <- str_extract(Off_let$text, "\\d+%")[[1]]
return(off_let_num)
}
dummyFuncExt(df = Off_let_data, regexp = regexprlist, page_id = 3)