嗨，我有下面这段 R 代码，它始终从网页表格中提取最新一个季度的数据（下表中标红的部分），运行良好。
但是，我现在想获取最近三个季度的全部数据，却无法自行调整代码。
有人能告诉我需要调整或添加哪些内容吗？
# Find and extract quarterly results
# On entry data1_3 is a list of tables (presumably every table scraped from
# the page -- TODO confirm where it is built). The quarterly table is the
# first one whose top-left cell mentions "Quarterly".
# The match is located before data1_3 is overwritten: reassigning data1_3
# inside a loop over its own elements makes every later iteration index into
# the extracted table instead of the list.
is_quarterly <- vapply(
  data1_3,
  FUN = function(tbl) {
    length(dim(tbl)) == 2 && all(dim(tbl) > 0) &&
      isTRUE(grepl("Quarterly", tbl[1, 1]))
  },
  FUN.VALUE = logical(1)
)
quarterly_idx <- which(is_quarterly)
if (length(quarterly_idx) == 0) {
  stop("No table with 'Quarterly' in its first cell was found", call. = FALSE)
}
quarterly_tbl <- data1_3[[quarterly_idx[1]]]
# Keep rows 5 onwards (the first four rows are skipped; presumably title and
# header rows -- TODO confirm). seq_len() yields an empty selection for a
# table with fewer than five rows, where 5:nrow() would count backwards.
data1_3 <- quarterly_tbl[seq_len(nrow(quarterly_tbl))[-(1:4)], , drop = FALSE]
# Drop the "Announcement Date" row. %in% never returns NA, so a row whose
# first cell is missing is kept as it is instead of becoming an all-NA row.
data1_3 <- data1_3[!(data1_3[[1]] %in% "Announcement Date"), , drop = FALSE]
# Remove columns that contain nothing but NA; drop = FALSE keeps a data frame
# even when a single column survives.
data1_3 <- data1_3[, colSums(is.na(data1_3)) != nrow(data1_3), drop = FALSE]
# Number of cells in the first remaining row that carry the "(e)" marker
# (presumably the estimate columns -- TODO confirm).
est_cnt <- sum(grepl("(e)", data1_3[1, ], fixed = TRUE))
# Get table headers
# Text of every '.rtTableItems' node of the page, minus the
# 'Announcement Date' entry (the same label is also filtered out of the first
# column of data1_3).
data1_3_items <- html_text(html_nodes(webpage3, '.rtTableItems'))
data1_3_items <- data1_3_items[data1_3_items != 'Announcement Date']
# Get latest released sales and EPS
# Text of every '.rtPubl' node of the page.
data2_3 <- html_text(html_nodes(webpage3, '.rtPubl'))
# Distance between the first two 'Released' entries; used below as the
# stride from one row of values to the next.
diff_rel <- diff(grep('Released', data2_3)[1:2])
# Discard everything before the block that belongs to the second "Sales"
# header (presumably the quarterly block -- TODO confirm).
quart_sales_start <- 1 + diff_rel * (grep("Sales", data1_3_items)[2] - 1)
data2_3 <- data2_3[quart_sales_start:length(data2_3)]
# Position of each value inside the trimmed vector: (row index of the label
# in data1_3 - 1) strides.
latest_quart_sales_rel_pos <- diff_rel * (grep('Sales', data1_3[, 1]) - 1)
latest_quart_eps_rel_pos <- diff_rel * (grep('EPS', data1_3[, 1]) - 1)
# Sales: strip blanks, then parse.
latest_quart_sales_released <- as.numeric(
  gsub(" ", "", data2_3[latest_quart_sales_rel_pos])
)
# EPS: decimal comma to point, strip blanks, then parse.
latest_quart_eps_released <- as.numeric(
  gsub(" ", "", gsub(",", ".", data2_3[latest_quart_eps_rel_pos]))
)
# Get latest forecast sales and EPS
# Text of every '.rtPrev' node of the page.
data3_3 <- html_text(html_nodes(webpage3, '.rtPrev'))
# Distance between the first two 'Forecast' entries; used below as the
# stride from one row of values to the next.
diff_for <- diff(grep('Forecast', data3_3)[1:2])
# Discard everything before the block that belongs to the second "Sales"
# header (presumably the quarterly block -- TODO confirm).
quart_sales_start <- 1 + diff_for * (grep("Sales", data1_3_items)[2] - 1)
data3_3 <- data3_3[quart_sales_start:length(data3_3)]
# Position of each value inside the trimmed vector: (row index of the label
# in data1_3 - 1) strides, moved back by est_cnt, the number of "(e)" cells
# counted in the first row of data1_3.
latest_quart_sales_for_pos <-
  diff_for * (grep('Sales', data1_3[, 1]) - 1) - est_cnt
latest_quart_eps_for_pos <-
  diff_for * (grep('EPS', data1_3[, 1]) - 1) - est_cnt
# Sales: strip blanks, then parse.
latest_quart_sales_forecast <- as.numeric(
  gsub(" ", "", data3_3[latest_quart_sales_for_pos])
)
# EPS: decimal comma to point, strip blanks, then parse.
latest_quart_eps_forecast <- as.numeric(
  gsub(" ", "", gsub(",", ".", data3_3[latest_quart_eps_for_pos]))
)
# Get latest sales and EPS spread
# The spread is recovered by deleting the released and forecast values from
# the text of every cell and parsing what is left of the target cell as a
# percentage. (Presumably each cell holds released value, forecast value and
# spread run together -- TODO confirm against the page.)

# Normalise the value cells (all rows but the first, all columns from the
# third on): remove blanks and turn decimal commas into points. seq_len()
# gives an empty index, rather than a backwards sequence, for a table with
# fewer than two rows or three columns; drop = FALSE keeps a data frame when
# exactly one value column exists.
value_rows <- seq_len(nrow(data1_3))[-1]
value_cols <- seq_len(ncol(data1_3))[-(1:2)]
if (length(value_rows) > 0 && length(value_cols) > 0) {
  data1_3[value_rows, value_cols] <- lapply(
    data1_3[value_rows, value_cols, drop = FALSE],
    FUN = function(x) {
      gsub(",", ".", gsub(" ", "", x, fixed = TRUE), fixed = TRUE)
    }
  )
}

# Render a parsed number the way it appears in the table. as.character()
# prints round figures such as 100000 as "1e+05", which never occurs in the
# cell text; format(scientific = FALSE) keeps plain digits. Only the first
# element is used; a missing value stays NA.
plain_number_text <- function(x) {
  x <- x[1]
  if (is.na(x)) {
    NA_character_
  } else {
    format(x, scientific = FALSE, trim = TRUE, digits = 15)
  }
}

# Values to delete, in order: released sales, forecast sales, released EPS,
# forecast EPS. The EPS text is taken from the raw scraped strings, cleaned
# the same way as the table cells.
strip_values <- c(
  plain_number_text(latest_quart_sales_released),
  plain_number_text(latest_quart_sales_forecast),
  gsub(" ", "", gsub(",", ".", data2_3[latest_quart_eps_rel_pos]))[1],
  gsub(" ", "", gsub(",", ".", data3_3[latest_quart_eps_for_pos]))[1]
)

# Delete each value literally. fixed = TRUE matters: as a regular expression
# the "." in "1.23" would match any character and could eat parts of
# unrelated numbers. An NA value turns every cell into NA, so the spreads
# below come out as NA instead of a wrong number.
cell_text <- unlist(lapply(data1_3, as.character), use.names = FALSE)
for (value in strip_values) {
  cell_text <- gsub(value, "", cell_text, fixed = TRUE)
}
data4_3 <- data.frame(
  matrix(cell_text, nrow = nrow(data1_3), byrow = FALSE),
  stringsAsFactors = FALSE
)

# Target cell: the column just left of the first "(e)" column, in the first
# row whose label matches "Sales" / "EPS".
col_num <- grep("(e)", data1_3[1, ], fixed = TRUE)[1] - 1
sales_row_num <- grep("Sales", data1_3[, 1])[1]
eps_row_num <- grep("EPS", data1_3[, 1])[1]
latest_quart_sales_spread_perc <- as.numeric(
  gsub("%", "", data4_3[sales_row_num, col_num], fixed = TRUE)
)
latest_quart_eps_spread_perc <- as.numeric(
  gsub("%", "", data4_3[eps_row_num, col_num], fixed = TRUE)
)