我正在尝试重塑一个两列的数据框:按第2列(股票代码 Ticker)折叠数据,使每个股票代码各占唯一的一行,同时把第1列中的各个数据字段展开为与该股票代码对应的独立列。下面是我的示例代码和一小部分样本数据,因为完整的数据框包含500个股票代码和4个字段:
# Closed End Fund Selector ----
# Endpoint that returns the daily pricing table (one record per fund) as JSON.
url <- "https://www.cefconnect.com/api/v3/DailyPricing?props=Ticker,Name,DistributionRateNAV,LastUpdated,Discount,DistributionRatePrice,ReturnOnNAV,CategoryId,CategoryName,IsManagedDistribution,Price,PriceChange,NAV,NAVPublished,Cusip/&_=1546832481302"
library(jsonlite)
library(rvest)
library(dplyr)
# Request the endpoint; the response body is stored as a raw (byte) vector.
page <- html_session(url)
# Decode the raw body into a single JSON string. The previous
# readBin(..., what = "json") only worked because readBin() silently falls
# back to typeof(what) == "character" for an unrecognised mode.
json <- rawToChar(page$response$content)
# Parse the JSON text into a data frame.
df <- fromJSON(json)
# Analyze and group closed-end funds by investment strategy (CategoryName):
# per category, the mean discount to NAV plus the fund name, ticker and
# discount of the funds with the smallest and the largest discount.
#
# which.min()/which.max() skip NA values and return a single index (the first
# one on ties), so every category yields exactly one row. The trailing `[1]`
# turns the empty result of an all-NA category into NA instead of a
# zero-length value that summarize() would reject.
df2 <- df %>%
  group_by(CategoryName) %>%
  summarize(
    Category_Fund_Avg_NAV_Difference = mean(Discount, na.rm = TRUE),
    Min_NAV_Fund = Name[which.min(Discount)][1],
    Min_NAV_Ticker = Ticker[which.min(Discount)][1],
    Min_Nav_Fund_Difference = Discount[which.min(Discount)][1],
    Max_NAV_Fund = Name[which.max(Discount)][1],
    Max_NAV_Ticker = Ticker[which.max(Discount)][1],
    Max_Nav_Fund_Difference = Discount[which.max(Discount)][1]
  )
# Convert the summarised tibble to a plain data.frame.
df2 <- data.frame(df2)
# Scrape the summary grid of every fund page and stack the results ----
#
# Result: `df6`, a long data frame with two columns:
#   test22 - "<period> <field> <value>", e.g. "Current SharePrice $6.57"
#   Ticker - the fund's ticker symbol
# Each successfully scraped fund contributes 12 rows (4 periods x 3 fields).

fund_url_prefix <- "https://www.cefconnect.com/fund/"
tickers <- df$Ticker
tickers <- paste0(fund_url_prefix, tickers)

# Split the raw text of one summary grid into whitespace-separated tokens.
# Only base R string functions are used, so no string package has to be
# attached for this step.
tokenize_summary_grid <- function(grid_text) {
  x <- unlist(strsplit(grid_text, split = "\n", fixed = TRUE))
  x <- x[x != " "]
  x <- gsub("\t", "", x, fixed = TRUE)
  x <- gsub("\r", "", x, fixed = TRUE)
  x <- trimws(x, which = "left", whitespace = "[\\h\\v]")
  x <- x[x != ""]

  # The header cells are concatenated without separators; pull them apart.
  x <- gsub(
    "SharePriceNAVPremium/Discount",
    "SharePrice NAV Premium/Discount",
    x,
    fixed = TRUE
  )
  # Collapse each row label into a single token followed by a space.
  label_fixes <- c(
    "Current" = "Current ",
    "52 Wk Avg" = "52WkAvg ",
    "52 Wk High" = "52WkHigh ",
    "52 Wk Low" = "52WkLow "
  )
  for (label in names(label_fixes)) {
    x <- gsub(label, label_fixes[[label]], x, fixed = TRUE)
  }

  # Values are glued together (e.g. "$6.57$7.11-7.59%"); separate them.
  x <- gsub("-", " -", x, fixed = TRUE)             # space before a minus sign
  x <- gsub("(\\.\\d{2})", "\\1 ", x, perl = TRUE)  # space after two decimals
  x <- trimws(gsub("\\s%", "% ", x))                # re-attach the "%" sign
  # This creates a space between prices based on a number followed by a '$'
  x <- gsub("([0-9])([$])", "\\1 \\2", x)

  # Separate by whitespace so every word/value becomes its own element.
  unlist(strsplit(x, "\\s+"), use.names = FALSE)
}

# Scrape one fund page and return its 12 "<period> <field> <value>" rows.
# Returns NULL (with a warning) when the page cannot be read or the grid does
# not have the expected layout, so a single bad fund does not abort the run.
scrape_fund_summary <- function(URLLink) {
  CEF_Scrape <- tryCatch(
    read_html(URLLink),
    error = function(e) {
      warning(
        "Failed to read ", URLLink, ": ", conditionMessage(e),
        call. = FALSE
      )
      NULL
    }
  )
  if (is.null(CEF_Scrape)) {
    return(NULL)
  }

  grid_text <- CEF_Scrape %>%
    html_nodes("#ContentPlaceHolder1_cph_main_cph_main_SummaryGrid") %>%
    html_text()
  tokens <- tokenize_summary_grid(grid_text)

  # Expected layout: 3 field names, then 4 groups of (period label, 3 values).
  if (length(tokens) < 19L) {
    warning("Unexpected summary grid layout for ", URLLink, call. = FALSE)
    return(NULL)
  }

  field_names <- tokens[1:3]
  period_starts <- c(4L, 8L, 12L, 16L)
  test22 <- unlist(lapply(period_starts, function(i) {
    paste(tokens[i], field_names, tokens[i + 1:3])
  }))

  Ticker <- sub(fund_url_prefix, "", URLLink, fixed = TRUE)
  data.frame(test22, Ticker, stringsAsFactors = FALSE)
}

lst_scraped_data <- lapply(tickers, FUN = scrape_fund_summary)
# Drop the funds that were skipped before stacking the per-fund data frames.
df6 <- do.call(rbind, Filter(Negate(is.null), lst_scraped_data))
理想情况下,重塑后的数据框中每个股票代码只占唯一的一行,并以股票代码作为行名;每行包含12列,分别对应“df6”中“test22”列的12项内容,这些列各自的名称在现阶段还不重要。非常感谢您的帮助!
这是上述脚本输出的前25行的示例:
test22 Ticker
1 Current SharePrice $6.57 MFM
2 Current NAV $7.11 MFM
3 Current Premium/Discount -7.59% MFM
4 52WkAvg SharePrice $6.55 MFM
5 52WkAvg NAV $7.21 MFM
6 52WkAvg Premium/Discount -9.19% MFM
7 52WkHigh SharePrice $6.88 MFM
8 52WkHigh NAV $7.34 MFM
9 52WkHigh Premium/Discount -5.88% MFM
10 52WkLow SharePrice $6.05 MFM
11 52WkLow NAV $7.03 MFM
12 52WkLow Premium/Discount -14.43% MFM
13 Current SharePrice $4.84 CXE
14 Current NAV $5.21 CXE
15 Current Premium/Discount -7.10% CXE
16 52WkAvg SharePrice $4.91 CXE
17 52WkAvg NAV $5.29 CXE
18 52WkAvg Premium/Discount -7.26% CXE
19 52WkHigh SharePrice $5.31 CXE
20 52WkHigh NAV $5.37 CXE
21 52WkHigh Premium/Discount -1.12% CXE
22 52WkLow SharePrice $4.58 CXE
23 52WkLow NAV $5.16 CXE
24 52WkLow Premium/Discount -11.92% CXE
25 Current SharePrice $4.33 CMU
这就是我希望重新格式化后的输出的样子:
使用dput之后->
test22 Ticker
1 Current SharePrice $6.57 MFM
2 Current NAV $7.11 MFM
3 Current Premium/Discount -7.59% MFM
4 52WkAvg SharePrice $6.55 MFM
5 52WkAvg NAV $7.21 MFM
6 52WkAvg Premium/Discount -9.19% MFM
7 52WkHigh SharePrice $6.88 MFM
8 52WkHigh NAV $7.34 MFM
9 52WkHigh Premium/Discount -5.88% MFM
10 52WkLow SharePrice $6.05 MFM
11 52WkLow NAV $7.03 MFM
12 52WkLow Premium/Discount -14.43% MFM
13 Current SharePrice $4.84 CXE
14 Current NAV $5.21 CXE
15 Current Premium/Discount -7.10% CXE
16 52WkAvg SharePrice $4.91 CXE
17 52WkAvg NAV $5.29 CXE
18 52WkAvg Premium/Discount -7.26% CXE
19 52WkHigh SharePrice $5.31 CXE