如何使用R从维基百科分类页面抓取数据

时间:2015-10-27 05:38:39

标签: r web-scraping wikipedia

我需要根据网页 https://en.wikipedia.org/wiki/Category:Clothing_brands_by_country 创建一个表格,按国家/地区列出其中的服装零售商。

我查看了各种链接,但没有找到任何有效的方法。目前的基本需求是:从该页面中提取链接,然后依次打开这些链接并从中抓取数据。

library(XML)
library(RCurl)
# Point RCurl at the CA certificate bundle shipped inside the RCurl package.
options(
  RCurlOptions = list(
    cainfo = system.file("CurlSSL", "cacert.pem", package = "RCurl")
  )
)

# Category index page that links to one subcategory per country.
path <- "https://en.wikipedia.org/wiki/Category:Clothing_brands_by_country"

# Download the page as one string, then split it into lines by reading it
# back through a text connection.
webpage <- getURL(path)
tc <- textConnection(webpage)
webpage <- readLines(tc)
close(tc)

# Parse the HTML with internal nodes; a no-op function is supplied as the
# parser's `error` callback.
pagetree <- htmlTreeParse(
  webpage,
  error = function(...) {},
  useInternalNodes = TRUE,
  encoding = FALSE
)

1 个答案:

答案 0 :(得分:1)

问题解决了,主要原因是我之前不了解HTML:

library(XML)
library(RCurl)
# Point RCurl at the CA certificate bundle shipped inside the RCurl package.
options(
  RCurlOptions = list(
    cainfo = system.file("CurlSSL", "cacert.pem", package = "RCurl")
  )
)

# Request the https:// URL directly. getURL() is called here without
# `followlocation`, so an http:// -> https:// redirect would hand back the
# redirect response instead of the category page.
path <- "https://en.wikipedia.org/wiki/Category:Clothing_brands_by_country"
webpage <- getURL(path)

# Split the downloaded string into lines via a text connection.
tc <- textConnection(webpage)
webpage <- readLines(tc)
close(tc)

# Fail early with a clear message rather than parsing an empty document and
# finding no links further down.
if (!any(nzchar(webpage))) {
  stop("Empty response from ", path, call. = FALSE)
}

# Parse the HTML with internal nodes (needed for the XPath queries below); a
# no-op function is supplied as the parser's `error` callback.
pagetree <- htmlTreeParse(
  webpage,
  error = function(...) {},
  useInternalNodes = TRUE,
  encoding = FALSE
)
# XPath for the per-country subcategory links. The class attribute is compared
# as an exact string, so the double space inside it is significant.
q <- '//a[@class="CategoryTreeLabel  CategoryTreeLabelNs14 CategoryTreeLabelCategory"]'

# href attribute of every matching link. unlist() flattens the result so that
# zero matches give character(0) instead of an empty list.
a <- xpathSApply(pagetree, q, xmlGetAttr, "href")
href <- gsub("\\s", "", as.character(unlist(a)))
if (length(href) == 0L) {
  stop("No category links matched the XPath query.", call. = FALSE)
}

# One row per country category:
#   t       - relative link with whitespace removed
#   pos     - position of the first "of_" in `t` (-1 when absent)
#   country - text following that first "of_" (NA when absent)
#   url     - absolute URL of the country's category page
x <- data.frame(t = href, stringsAsFactors = FALSE)

# regexpr() returns one position per string. gregexpr() returns a list, which
# would become a list column and break substr() on multiple matches.
x$pos <- as.integer(regexpr("of_", x$t, fixed = TRUE))
x$country <- ifelse(x$pos > 0L, substring(x$t, x$pos + 3L), NA_character_)
x$url <- paste0("https://en.wikipedia.org", x$t)

# Download one country's category page and collect the titles of its links.
#
# url:     absolute URL of the category page.
# country: label copied into the `country` column of every returned row.
#
# Returns a data frame with columns `a` (the `title` attribute of each matched
# link) and `country`; it has zero rows when no link matches.
scrape_category <- function(url, country) {
  webpage <- getURL(url)

  # Split the downloaded string into lines via a text connection; on.exit()
  # closes the connection even if a later step fails.
  tc <- textConnection(webpage)
  on.exit(close(tc), add = TRUE)
  page_lines <- readLines(tc)

  pagetree <- htmlTreeParse(
    page_lines,
    error = function(...) {},
    useInternalNodes = TRUE,
    encoding = FALSE
  )

  q <- '//div[@class="mw-content-ltr"]//ul/li/a'
  # unlist() drops links without a `title` attribute (xmlGetAttr gives NULL
  # for them) and turns an empty result into character(0).
  titles <- as.character(unlist(xpathSApply(pagetree, q, xmlGetAttr, "title")))

  data.frame(
    a = titles,
    # rep() keeps the column lengths equal when there are no titles.
    country = rep(country, length(titles)),
    stringsAsFactors = FALSE
  )
}

# Scrape every row of `x` rather than a hard-coded 25, and bind the per-country
# tables once at the end instead of growing `fin` inside a loop.
# NOTE: `fin` is NULL when `x` has no rows.
pages <- lapply(seq_len(nrow(x)), function(i) {
  scrape_category(x$url[i], x$country[i])
})
fin <- do.call(rbind, pages)