我试图从亚马逊网页上抓取信息。 但问题是某些产品有2个价格标签。价格位于2个不同的标签中。
Example: product 1 has one price tag: class="a-size-base a-color-base",
product 2 has one price tag: class="sx-price-whole", and
product 3 has 2 price tags: class="sx-price-whole" and class="a-size-base a-color-base"
如果我根据代码提取价格
price = xpathSApply(check, "//span[@class = 'sx-price-whole'] | //span[@class ='a-size-base a-color-base']")
这样得到的结果里,有些产品会有2个价格;当我创建数据框时,我不知道哪个价格属于哪个产品,而且价格向量的长度与产品标题向量的长度不同。
如果某些产品有2个价格标签,我该如何编写代码,让每个产品自动只取一个价格?请帮帮我!
# Gathering data from mobile phone.
library(XML)
library(stringr)
library(RCurl)
# First search-results page to scrape.
baseURL <- "https://www.amazon.com/s/ref=sr_nr_n_0?fst=p90x%3A1%2Cas%3Aoff&rh=n%3A2335752011%2Cn%3A2407749011%2Ck%3Aapple&keywords=apple&ie=UTF8&qid=1519785465&rnid=2335753011"

# Curl handle shared by the requests: identifies the client through the
# user-agent and "from" header, and follows HTTP redirects.
handle <- getCurlHandle(
  useragent = str_c(R.version$platform, R.version.string, sep = ","),
  httpheader = c(from = "ed@datacollection.com"),
  # Spelled out as TRUE: `T` is an ordinary variable that can be reassigned.
  followlocation = TRUE,
  # NOTE(review): an empty cookiefile presumably just switches on libcurl's
  # in-memory cookie handling -- confirm against the libcurl docs.
  cookiefile = ""
)

# Download the page and parse it into an HTML document for XPath queries.
check <- htmlParse(getURL(baseURL, curl = handle))
class(check)
check

# href of the "Next Page" link. The last results page has no such link, so
# fall back to NA instead of failing with "subscript out of bounds".
nextPage <- local({
  hrefs <- xpathApply(check, "//a[@title = 'Next Page']/@href", as.character)
  if (length(hrefs) > 0) hrefs[[1]] else NA_character_
})
originalURL <- "https://www.amazon.com"
# str_c() propagates NA, so nextpage_link is NA when there is no next page.
nextpage_link <- str_c(originalURL, nextPage)
# Product titles: the text content of every <h2> that is a direct child of
# an <a>. xpathApply() returns a list with one entry per matching node.
productInfo <- xpathApply(
  check,
  "//a/h2",
  xmlValue
)
# Interactive sanity checks: the titles, their container class, their count.
productInfo
class(productInfo)
length(productInfo)
# Extract the product titles from a parsed page.
#
# x: a parsed HTML document accepted by xpathApply().
# Returns a list holding the xmlValue() text of each <h2> that is a direct
# child of an <a>.
extract_product_info <- function(x) {
  xpathApply(x, "//a/h2", xmlValue)
}
# Product image links: the src attribute of every <img> nested inside a <div>
# whose class is exactly 'a-fixed-left-grid-col a-col-left'.
image <- xpathApply(
  check,
  "//div[@class ='a-fixed-left-grid-col a-col-left']//img/@src",
  as.character
)
length(image)
class(image)
# Flatten the list of matches into a single vector of links.
linkImage <- unlist(image)
length(linkImage)
# Prices.
#
# `price` is kept as before: every node in the document that matches either
# price class. Because the union XPath runs over the whole document, a product
# that carries both classes contributes two nodes, so this result cannot be
# lined up one-to-one with the product titles.
price <- xpathSApply(check, "//span[@class = 'sx-price-whole'] | //span[@class ='a-size-base a-color-base']")
length(unlist(price))

# Extract exactly one price per product.
#
# x:          a parsed HTML document.
# item_xpath: XPath selecting one container node per product.
#             NOTE(review): the default assumes each search result is wrapped
#             in an <li> whose class contains 's-result-item' -- confirm
#             against the actual page markup and adjust if needed.
# Returns a character vector with one element per container: the text of the
# first 'sx-price-whole' span inside it, otherwise the first
# 'a-size-base a-color-base' span, otherwise NA.
extract_price <- function(x,
                          item_xpath = "//li[contains(@class, 's-result-item')]") {
  items <- getNodeSet(x, item_xpath)
  vapply(
    items,
    function(item) {
      # The leading "." makes each query relative to this product's node
      # rather than searching the whole document again.
      whole <- xpathSApply(item, ".//span[@class = 'sx-price-whole']", xmlValue)
      if (length(whole) > 0) {
        return(whole[[1]])
      }
      base <- xpathSApply(
        item, ".//span[@class ='a-size-base a-color-base']", xmlValue
      )
      if (length(base) > 0) base[[1]] else NA_character_
    },
    character(1)
  )
}

# One price (or NA) per product container. To build a data frame, extract the
# other fields per container in the same way so every column has this length.
price_per_product <- extract_price(check)
# Star ratings: the text of every <span> nested under an 'a-declarative' span
# inside an 'a-row a-spacing-mini' div, flattened with unlist().
star_div <- unlist(
  xpathSApply(
    check,
    "//div[@class ='a-row a-spacing-mini']//span[@class ='a-declarative']//span",
    xmlValue
  )
)
# Interactive sanity checks: the class of the result, then its contents.
class(star_div)
star_div
# Extract the star-rating text from a parsed page.
#
# x: a parsed HTML document accepted by xpathSApply().
# Returns the xmlValue() text of every <span> nested under an 'a-declarative'
# span inside an 'a-row a-spacing-mini' div, as simplified by xpathSApply().
extract_star_product <- function(x) {
  star_xpath <- "//div[@class ='a-row a-spacing-mini']//span[@class ='a-declarative']//span"
  xpathSApply(x, star_xpath, xmlValue)
}
# Number of ratings: the text of every <a> that is a direct child of an
# 'a-row a-spacing-mini' div.
numberofrate <- xpathSApply(
  check,
  "//div[@class ='a-row a-spacing-mini']/a",
  xmlValue
)
# Extract the rating-count text from a parsed page.
#
# x: a parsed HTML document accepted by xpathSApply().
# Returns the xmlValue() text of every <a> that is a direct child of an
# 'a-row a-spacing-mini' div, as simplified by xpathSApply().
extract_numberofrate <- function(x) {
  rate_xpath <- "//div[@class ='a-row a-spacing-mini']/a"
  xpathSApply(x, rate_xpath, xmlValue)
}
# ASIN: the name attribute of every <span> that is an ancestor of an
# 'a-declarative' span inside an 'a-row a-spacing-mini' div.
# NOTE(review): presumably that name attribute carries the ASIN -- confirm
# against the page markup.
ASIN <- xpathSApply(
  check,
  "//div[@class ='a-row a-spacing-mini']//span[@class ='a-declarative']/ancestor::span/@name"
)
# Extract the ASIN attribute values from a parsed page.
#
# x: a parsed HTML document accepted by xpathSApply().
# Returns the name attribute of every <span> that is an ancestor of an
# 'a-declarative' span inside an 'a-row a-spacing-mini' div.
# NOTE(review): presumably that attribute carries the ASIN -- confirm.
extract_asin <- function(x) {
  asin_xpath <- "//div[@class ='a-row a-spacing-mini']//span[@class ='a-declarative']/ancestor::span/@name"
  xpathSApply(x, asin_xpath)
}