rvest - 从抓取结果中删除不需要的文本段落

时间:2019-02-08 19:13:03

标签: r web-scraping rvest stringr httr

我想从下面这段 rvest 代码的输出中删除大量不需要的段落:

 library(rvest)

    # Page whose embedded <script> holds the financials widget data.
    link <- 'https://www.duedil.com/company/gb/02666908/yorwaste-limited/financials'

    # Parse the page, keep only the <script> nodes whose text contains
    # 'js-financials-component', and return their text content.
    doc <- html_text(
      html_nodes(
        read_html(link),
        "script:contains('js-financials-component')"
      )
    )

基本上,我只想保留下面这部分文本,并将其输出为 JSON 格式。我不知道该从哪里入手,有人能给我一些建议吗?非常感谢。

{
          companyName: {"name":"Yorwaste Limited"},
          numAccounts: 1,
          accounts: [{"title":"Summary","rows":[{"label":"Reporting Period (Months)","description":null,"chartable":false,"date":"31 Mar 2018","currency":null,"percentage":false,"values":[{"date":"31 Mar 2018","timestamp":1522454400,"suffix":null,"value":12,"formattedValue":"12","delta":null}]},{"label":"Consolidated Accounts","description":"Financial statements of the parent (company) and its subsidiaries are presented as those of a single economic entity.","chartable":false,"date":"31 Mar 2018","currency":null,"percentage":false,"values":[{"date":"31 Mar 2018","timestamp":1522454400,"suffix":null,"value":"Y","formattedValue":"Y","delta":null}]},{"label":"Number of Employees","description":null,"chartable":false,"date":"31 Mar 2018","currency":null,"percentage":false,"values":[{"date":"31 Mar 2018","timestamp":1522454400,"suffix":null,"value":318,"formattedValue":"318","delta":64.77}]},{"label":"Turnover","description":"Revenue generated from business activities.","chartable":true,"date":"31 Mar 2018","currency":"GBP","percentage":false,"values":[{"date":"31 Mar 2018","timestamp":1522454400,"suffix":"GBP","value":40328232,"formattedValue":"40,328,232","delta":3.67}]},{"label":"EBITDA","description":"Earnings before interest, tax, depreciation and amortization.","chartable":true,"date":"31 Mar 2018","currency":"GBP","percentage":false,"values":[{"date":"31 Mar 2018","timestamp":1522454400,"suffix":"GBP","value":2834874,"formattedValue":"2,834,874","delta":62.78}]},{"label":"Post-tax Profit","description":"Profit generated after taxation.","chartable":true,"date":"31 Mar 2018","currency":"GBP","percentage":false,"values":[{"date":"31 Mar 2018","timestamp":1522454400,"suffix":"GBP","value":885230,"formattedValue":"885,230","delta":52.17}]},{"label":"Total Assets","description":"The value of all assets on the Balance Sheet.","chartable":true,"date":"31 Mar 2018","currency":"GBP","percentage":false,"values":[{"date":"31 Mar 
2018","timestamp":1522454400,"suffix":"GBP","value":40264952,"formattedValue":"40,264,952","delta":25.58}]},{"label":"Net Assets","description":"Total Assets less Total Liabilities.","chartable":true,"date":"31 Mar 2018","currency":"GBP","percentage":false,"values":[{"date":"31 Mar 2018","timestamp":1522454400,"suffix":"GBP","value":10613963,"formattedValue":"10,613,963","delta":9.1}]},{"label":"Return on Capital Employed (%)","description":"Operating Profit expressed as a percentage of average Capital Employed.","chartable":true,"date":"31 Mar 2018","currency":null,"percentage":true,"values":[{"date":"31 Mar 2018","timestamp":1522454400,"suffix":"%","value":6.97,"formattedValue":"6.97","delta":null}]},{"label":"Debt to Capital (%)","description":"Total Liabilities expressed as a percentage of Total Assets.","chartable":true,"date":"31 Mar 2018","currency":null,"percentage":true,"values":[{"date":"31 Mar 2018","timestamp":1522454400,"suffix":"%","value":73.64,"formattedValue":"73.64","delta":null}]}]}],
          growth: [{"title":"Summary","rows":[{"label":"Reporting Period (Months)","description":null,"chartable":false,"date":"31 Mar 2018","currency":null,"percentage":false,"latest":{"date":"31 Mar 2018","timestamp":1522454400,"suffix":null,"value":12,"formattedValue":"12"},"cagrs":{"year1":null,"cagr3":null,"cagr5":null,"cagr10":null}},{"label":"Consolidated Accounts","description":"Financial statements of the parent (company) and its subsidiaries are presented as those of a single economic entity.","chartable":false,"date":"31 Mar 2018","currency":null,"percentage":false,"latest":{"date":"31 Mar 2018","timestamp":1522454400,"suffix":null,"value":"Y","formattedValue":"Y"},"cagrs":{"year1":null,"cagr3":null,"cagr5":null,"cagr10":null}},{"label":"Number of Employees","description":null,"chartable":false,"date":"31 Mar 2018","currency":null,"percentage":false,"latest":{"date":"31 Mar 2018","timestamp":1522454400,"suffix":null,"value":318,"formattedValue":"318"},"cagrs":{"year1":null,"cagr3":null,"cagr5":null,"cagr10":null}},{"label":"Turnover","description":"Revenue generated from business activities.","chartable":true,"date":"31 Mar 2018","currency":"GBP","percentage":false,"latest":{"date":"31 Mar 2018","timestamp":1522454400,"suffix":"GBP","value":40328232,"formattedValue":"40,328,232"},"cagrs":{"year1":null,"cagr3":null,"cagr5":null,"cagr10":null}},{"label":"EBITDA","description":"Earnings before interest, tax, depreciation and amortization.","chartable":true,"date":"31 Mar 2018","currency":"GBP","percentage":false,"latest":{"date":"31 Mar 2018","timestamp":1522454400,"suffix":"GBP","value":2834874,"formattedValue":"2,834,874"},"cagrs":{"year1":null,"cagr3":null,"cagr5":null,"cagr10":null}},{"label":"Post-tax Profit","description":"Profit generated after taxation.","chartable":true,"date":"31 Mar 2018","currency":"GBP","percentage":false,"latest":{"date":"31 Mar 
2018","timestamp":1522454400,"suffix":"GBP","value":885230,"formattedValue":"885,230"},"cagrs":{"year1":null,"cagr3":null,"cagr5":null,"cagr10":null}},{"label":"Total Assets","description":"The value of all assets on the Balance Sheet.","chartable":true,"date":"31 Mar 2018","currency":"GBP","percentage":false,"latest":{"date":"31 Mar 2018","timestamp":1522454400,"suffix":"GBP","value":40264952,"formattedValue":"40,264,952"},"cagrs":{"year1":null,"cagr3":null,"cagr5":null,"cagr10":null}},{"label":"Net Assets","description":"Total Assets less Total Liabilities.","chartable":true,"date":"31 Mar 2018","currency":"GBP","percentage":false,"latest":{"date":"31 Mar 2018","timestamp":1522454400,"suffix":"GBP","value":10613963,"formattedValue":"10,613,963"},"cagrs":{"year1":null,"cagr3":null,"cagr5":null,"cagr10":null}},{"label":"Return on Capital Employed (%)","description":"Operating Profit expressed as a percentage of average Capital Employed.","chartable":true,"date":"31 Mar 2018","currency":null,"percentage":true,"latest":{"date":"31 Mar 2018","timestamp":1522454400,"suffix":"%","value":6.97,"formattedValue":"6.97"},"cagrs":{"year1":null,"cagr3":null,"cagr5":null,"cagr10":null}},{"label":"Debt to Capital (%)","description":"Total Liabilities expressed as a percentage of Total Assets.","chartable":true,"date":"31 Mar 2018","currency":null,"percentage":true,"latest":{"date":"31 Mar 2018","timestamp":1522454400,"suffix":"%","value":73.64,"formattedValue":"73.64"},"cagrs":{"year1":null,"cagr3":null,"cagr5":null,"cagr10":null}}]}]

1 个答案:

答案 0 :(得分:1)

您可以简单地下载网页的源内容并使用正则表达式提取必要的代码段:

library(httr)
library(stringr)

# Download the page and stop on an HTTP error status, so the regex is never
# run against an error response.
r <- GET('https://www.duedil.com/company/gb/02666908/yorwaste-limited/financials')
stop_for_status(r)

# str_match() works on character vectors, not on the httr response object,
# so the response body is decoded to a single string before matching.
# Capture group 1 is the object literal passed to `new Widget(...)`: it starts
# at `{ companyName:` and ends at the first `}` that is followed by `)`.
q <- str_match(
  content(r, as = "text", encoding = "UTF-8"),
  "new Widget\\([\\s\\S]*?(\\{\\s*companyName:[\\s\\S]*?\\})\\)"
)

# q is a matrix: column 1 is the whole match, column 2 is capture group 1.
# Index by [row, column] explicitly; the value is NA when nothing matched.
d <- q[1, 2]

附:关于使用正则表达式解析 HTML(parsing HTML with regex)的免责声明。