使用R从图表中提取数据

时间:2019-02-25 18:49:01

标签: r

嗨,我有以下 R 代码,它始终从网页表格中提取最新季度的数据(下表中标红的部分),并且运行良好。(图:EPS Table)

不过,我现在想获取最近三个季度的所有数据,却无法自行调整代码。

有人可以让我知道需要调整/添加哪些内容吗?

  # Find and extract quarterly results.
  # data1_3 starts out as a list of tables. The first table whose top-left
  # cell contains 'Quarterly' is kept, from its 5th row onward, and replaces
  # data1_3.
  quarterly_found <- FALSE
  for (i in seq_along(data1_3)) {
    candidate <- data1_3[[i]]
    if (grepl('Quarterly', candidate[1, 1])) {
      data1_3 <- candidate[5:nrow(candidate), ]
      quarterly_found <- TRUE
      # Stop here: data1_3 is no longer the list of tables, so indexing it
      # with [[i]] on a later iteration would fail.
      break
    }
  }
  if (!quarterly_found) {
    stop("No table whose first cell contains 'Quarterly' was found",
         call. = FALSE)
  }

  # Drop rows labelled 'Announcement Date' and columns that are entirely NA
  data1_3 <- data1_3[data1_3[, 1] != 'Announcement Date', ]
  data1_3 <- data1_3[, colSums(is.na(data1_3)) != nrow(data1_3)]
  # Number of first-row cells containing the literal text "(e)"
  # (presumably the estimate columns -- confirm against the page layout)
  est_cnt <- sum(grepl("(e)", data1_3[1, ], fixed = TRUE))

  # Table header texts: the text of every '.rtTableItems' node, with the
  # 'Announcement Date' entry left out
  data1_3_items <- html_text(html_nodes(webpage3, '.rtTableItems'))
  keep_item <- data1_3_items != 'Announcement Date'
  data1_3_items <- data1_3_items[keep_item]

  # Latest released sales and EPS, read from the text of the '.rtPubl' nodes
  data2_3 <- html_text(html_nodes(webpage3, '.rtPubl'))
  released_idx <- grep('Released', data2_3)
  # Distance between the first two entries containing 'Released'
  diff_rel <- released_idx[2] - released_idx[1]
  # Discard everything before the block that belongs to the second 'Sales'
  # header item
  quart_sales_start <- (grep("Sales", data1_3_items)[2] - 1) * diff_rel + 1
  data2_3 <- data2_3[quart_sales_start:length(data2_3)]

  # Positions inside the trimmed vector, derived from the row numbers of the
  # 'Sales' and 'EPS' rows in data1_3
  rel_sales_rows <- grep('Sales', data1_3[, 1])
  rel_eps_rows <- grep('EPS', data1_3[, 1])
  latest_quart_sales_rel_pos <- (rel_sales_rows - 1) * diff_rel
  latest_quart_eps_rel_pos <- (rel_eps_rows - 1) * diff_rel

  # Sales: strip spaces before converting. EPS: additionally turn the
  # decimal comma into a dot.
  latest_quart_sales_released <- as.numeric(
    gsub(" ", "", data2_3[latest_quart_sales_rel_pos])
  )
  latest_quart_eps_released <- as.numeric(
    gsub(" ", "", gsub(",", ".", data2_3[latest_quart_eps_rel_pos]))
  )

  # Latest forecast sales and EPS, read from the text of the '.rtPrev' nodes
  data3_3 <- html_text(html_nodes(webpage3, '.rtPrev'))
  forecast_idx <- grep('Forecast', data3_3)
  # Distance between the first two entries containing 'Forecast'
  diff_for <- forecast_idx[2] - forecast_idx[1]
  # Discard everything before the block that belongs to the second 'Sales'
  # header item
  quart_sales_start <- (grep("Sales", data1_3_items)[2] - 1) * diff_for + 1
  data3_3 <- data3_3[quart_sales_start:length(data3_3)]

  # Same row-based positions as for the released figures, shifted back by
  # est_cnt
  for_sales_rows <- grep('Sales', data1_3[, 1])
  for_eps_rows <- grep('EPS', data1_3[, 1])
  latest_quart_sales_for_pos <- (for_sales_rows - 1) * diff_for - est_cnt
  latest_quart_eps_for_pos <- (for_eps_rows - 1) * diff_for - est_cnt

  # Sales: strip spaces before converting. EPS: additionally turn the
  # decimal comma into a dot.
  latest_quart_sales_forecast <- as.numeric(
    gsub(" ", "", data3_3[latest_quart_sales_for_pos])
  )
  latest_quart_eps_forecast <- as.numeric(
    gsub(" ", "", gsub(",", ".", data3_3[latest_quart_eps_for_pos]))
  )

  # Get latest sales and EPS spread.
  # Normalise the value cells (rows 2.., columns 3..): remove spaces and use
  # "." as the decimal mark, matching how the released/forecast figures are
  # cleaned.
  value_rows <- 2:nrow(data1_3)
  value_cols <- 3:ncol(data1_3)
  data1_3[value_rows, value_cols] <- lapply(
    data1_3[value_rows, value_cols],
    FUN = function(x) gsub(",", ".", gsub(" ", "", x, fixed = TRUE), fixed = TRUE)
  )

  # Remove every literal occurrence of `value` from each cell of `tbl` and
  # return the result as a data frame with the same number of rows.
  # fixed = TRUE matters: the values are numbers, and as a regular expression
  # the "." in e.g. "12.5" would match any character.
  strip_value <- function(tbl, value) {
    stripped <- lapply(tbl, FUN = function(x) gsub(value, "", x, fixed = TRUE))
    data.frame(matrix(unlist(stripped), nrow = nrow(tbl), byrow = FALSE))
  }

  # EPS figures are stripped in their cleaned text form
  eps_released_txt <- gsub(" ", "", gsub(",", ".", data2_3[latest_quart_eps_rel_pos]))
  eps_forecast_txt <- gsub(" ", "", gsub(",", ".", data3_3[latest_quart_eps_for_pos]))

  # Strip the released and forecast figures out of every cell; what is left
  # in the target cells is read as the percentage spread below
  data4_3 <- strip_value(data1_3, latest_quart_sales_released)
  data4_3 <- strip_value(data4_3, latest_quart_sales_forecast)
  data4_3 <- strip_value(data4_3, eps_released_txt)
  data4_3 <- strip_value(data4_3, eps_forecast_txt)

  # Column just before the first first-row cell containing "(e)"
  col_num <- grep("(e)", data1_3[1, ], fixed = TRUE)[1] - 1
  sales_row_num <- grep("Sales", data1_3[, 1])[1]
  eps_row_num <- grep("EPS", data1_3[, 1])[1]

  latest_quart_sales_spread_perc <- as.numeric(gsub("%", "", data4_3[sales_row_num, col_num]))
  latest_quart_eps_spread_perc <- as.numeric(gsub("%", "", data4_3[eps_row_num, col_num]))

0 个答案:

没有答案