我正在尝试抓取下面这个网页(XML/HTML 页面)中的数据,并将其读入数据框以生成表格: https://www.treasury.gov/resource-center/data-chart-center/interest-rates/Pages/TextView.aspx?data=yield
我试过了:
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0">
<!--
  Copies the input document unchanged, except for <para> elements that
  contain <emph> children, which are handled by the templates below.
  NOTE(review): the three para[...] patterns overlap and none of them sets an
  explicit priority attribute, so which template applies to a given <para>
  depends on the processor's conflict resolution (XSLT 1.0 allows a processor
  to pick the matching rule that occurs last in the stylesheet). Confirm the
  intended precedence before relying on it.
-->
<!-- Identity template: copy every attribute and node, recursing into children. -->
<xsl:template match="@*|node()">
<xsl:copy>
<xsl:apply-templates select="@*|node()"/>
</xsl:copy>
</xsl:template>
<!-- <para> with an <emph> child that itself has an <emph> child: emit a new
     <para> holding only the string value of the original (markup flattened). -->
<xsl:template match="para[emph[emph]]">
<para>
<xsl:value-of select="."></xsl:value-of>
</para>
</xsl:template>
<!-- <para> with at least one <emph> child: same flattening as above. -->
<xsl:template match="para[emph]">
<para>
<xsl:value-of select="."></xsl:value-of>
</para>
</xsl:template>
<!-- <para> with a second <emph> child (i.e. two or more): copy the element
     and process its attributes and children, like the identity template. -->
<xsl:template match="para[emph[2]]">
<xsl:copy>
<xsl:apply-templates select="@*|node()"/>
</xsl:copy>
</xsl:template>
</xsl:stylesheet>
但我无法让它正常工作。 谢谢你的帮助。
答案 0 :(得分:5)
我更喜欢使用 rvest 包,所以试试这个:
# Install rvest on first use, then attach it. requireNamespace() only checks
# whether the package is available; library() then fails loudly if it still
# cannot be loaded, instead of returning FALSE the way require() does.
if (!requireNamespace("rvest", quietly = TRUE)) {
  install.packages("rvest")
}
library(rvest)

# Page holding the yield table.
url <- "https://www.treasury.gov/resource-center/data-chart-center/interest-rates/Pages/TextView.aspx?data=yield"

# Download and parse the page.
xml_page <- read_html(url)

# Select the nodes with CSS class "text_view_data" (the cells of the table)
# and extract their text, giving one flat character vector of cell values.
detail <- xml_page %>%
  html_nodes(".text_view_data") %>%
  html_text()
> detail
[1] "11/01/17" "1.06" "1.18" "1.30" "1.46" "1.61" "1.74" "2.01"
[9] "2.22" "2.37" "2.63" "2.85" "11/02/17" "1.02" "1.17" "1.29"
[17] "1.46" "1.61" "1.73" "2.00" "2.21" "2.35" "2.61" "2.83"
[25] "11/03/17" "1.02" "1.18" "1.31" "1.49" "1.63" "1.74" "1.99"
[33] "2.19" "2.34" "2.59" "2.82" "11/06/17" "1.03" "1.19" "1.30"
[41] "1.50" "1.61" "1.73" "1.99" "2.17" "2.32" "2.58" "2.80"
[49] "11/07/17" "1.05" "1.22" "1.33" "1.49" "1.63" "1.75" "1.99"
[57] "2.17" "2.32" "2.56" "2.77" "11/08/17" "1.05" "1.23" "1.35"
[65] "1.53" "1.65" "1.77" "2.01" "2.19" "2.32" "2.57" "2.79"
[73] "11/09/17" "1.07" "1.24" "1.36" "1.53" "1.63" "1.75" "2.01"
[81] "2.20" "2.33" "2.59" "2.81" "11/10/17" "1.06" "1.23" "1.37"
[89] "1.54" "1.67" "1.79" "2.06" "2.27" "2.40" "2.67" "2.88"
[97] "11/13/17" "1.07" "1.24" "1.37" "1.55" "1.70" "1.82" "2.08"
[105] "2.27" "2.40" "2.67" "2.87" "11/14/17" "1.06" "1.26" "1.40"
[113] "1.55" "1.68" "1.81" "2.06" "2.26" "2.38" "2.64" "2.84"
然后,您需要把这个向量整理成所需的格式。
下面的做法显然算不上优雅,但确实有效:
# Column headers of the resulting table: the date followed by each maturity.
table_names <- c(
  "Date", "1 Mo", "3 Mo", "6 Mo", "1 Yr", "2 Yr", "3 Yr", "5 Yr", "7 Yr",
  "10 Yr", "20 Yr", "30 Yr"
)

# A cell containing "/" is a date, and every date opens a new table row.
is_date <- grepl("/", detail, fixed = TRUE)
ndates <- sum(is_date)

# pos1[i] and pos2[i] are the first and last index in `detail` of row i: a row
# runs from its date up to the cell just before the next date (or the end).
pos1 <- which(is_date)
pos2 <- c(pos1[-1] - 1L, length(detail))

# Fail loudly on a ragged table rather than silently recycling a short row.
row_lengths <- pos2 - pos1 + 1L
if (any(row_lengths != length(table_names))) {
  stop(
    "every row must hold exactly ", length(table_names),
    " cells (a date plus one value per maturity)",
    call. = FALSE
  )
}

# Drop anything before the first date, then fold the flat vector into rows.
# Building the matrix in one step also covers the no-dates case, where a
# `1:ndates` loop would iterate over c(1, 0) and fail.
cells <- detail[cumsum(is_date) > 0L]
df_detail <- as.data.frame(
  matrix(cells, nrow = ndates, ncol = length(table_names), byrow = TRUE),
  stringsAsFactors = FALSE
)
names(df_detail) <- table_names
# All columns are character here; convert with as.numeric() / as.Date() as
# needed downstream.
> df_detail
Date 1 Mo 3 Mo 6 Mo 1 Yr 2 Yr 3 Yr 5 Yr 7 Yr 10 Yr 20 Yr 30 Yr
1 11/01/17 1.06 1.18 1.30 1.46 1.61 1.74 2.01 2.22 2.37 2.63 2.85
2 11/02/17 1.02 1.17 1.29 1.46 1.61 1.73 2.00 2.21 2.35 2.61 2.83
3 11/03/17 1.02 1.18 1.31 1.49 1.63 1.74 1.99 2.19 2.34 2.59 2.82
4 11/06/17 1.03 1.19 1.30 1.50 1.61 1.73 1.99 2.17 2.32 2.58 2.80
5 11/07/17 1.05 1.22 1.33 1.49 1.63 1.75 1.99 2.17 2.32 2.56 2.77
6 11/08/17 1.05 1.23 1.35 1.53 1.65 1.77 2.01 2.19 2.32 2.57 2.79
7 11/09/17 1.07 1.24 1.36 1.53 1.63 1.75 2.01 2.20 2.33 2.59 2.81
8 11/10/17 1.06 1.23 1.37 1.54 1.67 1.79 2.06 2.27 2.40 2.67 2.88
9 11/13/17 1.07 1.24 1.37 1.55 1.70 1.82 2.08 2.27 2.40 2.67 2.87
10 11/14/17 1.06 1.26 1.40 1.55 1.68 1.81 2.06 2.26 2.38 2.64 2.84