我想从页面中抓取产品名称、价格和图片来源,但只能得到有限的结果。我想要抓取的网站是 https://www.walmart.com/browse/cell-phones/unlocked-phones/1105910_1073085 。我还想抓取筛选条件的复选框,但不知道如何抓取全部结果:目前只显示 10 个结果,我应该怎么做才能抓取完整的结果?如果我去掉请求头(headers),就能得到完整的名称和价格结果,但图片来源仍然抓取不到{
# Browser-like request headers sent along with the page request.
headers = {
    "Accept-Language": "en-US,en;q=0.5",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Referer": "http://thewebsite.com",
    "Connection": "keep-alive",
}
scrap = requests.get(
    'https://www.walmart.com/browse/cell-phones/unlocked-phones/1105910_1073085',
    headers=headers,
)
# Parse the raw HTML of the response; only markup present in that HTML can
# be found below.
content = BeautifulSoup(scrap.text, "html.parser")


def _image_and_link(product):
    """Return ``(img, href)`` for one product card.

    Both values come from the card's image container. Any piece that is not
    present in the markup is returned as ``None`` instead of raising.
    """
    card = product.find("div", {"class": "display-inline-block pull-left prod-ProductCard--Image"})
    if card is None:
        return None, None
    img_tag = card.find("img", {"class": "Tile-img"})
    link_tag = card.find("a")
    img = img_tag.get("src") if img_tag is not None else None
    href = link_tag.get("href") if link_tag is not None else None
    return img, href


# Grid-view layout. find_all() returns a (possibly empty) list and never None,
# so iterating it directly is the only guard that is needed.
products = content.find_all("div", {"class": "search-result-gridview-item-wrapper"})
for product in products:
    name = product.find("div", {"class": "search-result-product-title gridview"})
    title_link = name.find("a") if name is not None else None
    title = title_link.text if title_link is not None else None
    price = product.find("div", {"class": "search-result-productprice gridview enable-2price-2"})
    p = price.text if price is not None else None
    img, href = _image_and_link(product)

# List-view layout. Products without a price element are skipped, as before.
products = content.find_all("div", {"class": "search-result-listview-item clearfix"})
for product in products:
    price_tag = product.find("span", {"class": "Price-group"})
    if price_tag is None:
        continue
    title_link = product.find("a", {"class": "product-title-link"})
    name = title_link.text if title_link is not None else None
    price = price_tag.text
    img, href = _image_and_link(product)
}
答案 0 :(得分:0)
请参阅以下示例代码,了解如何从该网站获取数据。我只添加了翻页交互,但这应该能让你了解大致思路(你需要使用浏览器的"检查元素"功能来找到 XPath)。
import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By

PAGE_URL = "https://www.walmart.com/browse/cell-phones/unlocked-phones/1105910_1073085"
PRICE_BLOCK_XPATH = "//div[contains(@class, 'price-main-block')]"
PRODUCT_LINK_XPATH = "//a[contains(@class, 'product-title-link')]"
IMAGE_XPATH = "//img[contains(@class, 'Tile-img')]"
PRICE_GROUP_XPATH = ".//span[contains(@class, 'Price-group')]"
NEXT_BUTTON_XPATH = "//button[contains(@class, 'paginator-btn paginator-btn-next')]"


def _price_fields(price_block):
    """Return ``(minprice, maxprice, actualprice)`` for one price block.

    Two or more price spans are treated as a range (first = min, second =
    max); exactly one span is the actual price; no span yields all ``None``.
    """
    pricerange = price_block.find_elements(By.XPATH, PRICE_GROUP_XPATH)
    if len(pricerange) > 1:
        return (
            pricerange[0].get_attribute("title"),
            pricerange[1].get_attribute("title"),
            None,
        )
    if pricerange:
        return None, None, pricerange[0].get_attribute("title")
    return None, None, None


def _scrape_current_page(browser, outDF):
    """Append one row per product on the currently loaded page to ``outDF``."""
    prices = browser.find_elements(By.XPATH, PRICE_BLOCK_XPATH)  # finding prices
    product = browser.find_elements(By.XPATH, PRODUCT_LINK_XPATH)  # product name
    images = browser.find_elements(By.XPATH, IMAGE_XPATH)  # images
    # The three lists are matched purely by position. zip() stops at the
    # shortest one, so a page with a missing image or price block no longer
    # raises IndexError.
    for link, image, price_block in zip(product, images, prices):
        prodname = link.get_attribute("aria-label")
        imageurl = image.get_attribute("src")
        minprice, maxprice, actualprice = _price_fields(price_block)
        outDF.loc[outDF.shape[0]] = [prodname, imageurl, minprice, maxprice, actualprice]


outDF = pd.DataFrame(columns=['prodname', 'imageurl', 'minprice', 'maxprice', 'actualprice'])  # template of data
browser = webdriver.Chrome("./chromedriver")  # download chromedriver
try:
    browser.get(PAGE_URL)  # open page in browser
    _scrape_current_page(browser, outDF)
    # Reading next pages: keep clicking the "next" button until it cannot be
    # found or clicked any more. That failure is also how the loop ends after
    # the last page.
    while True:
        try:
            browser.find_element(By.XPATH, NEXT_BUTTON_XPATH).click()
            _scrape_current_page(browser, outDF)
        except WebDriverException:
            print("Something went wrong")
            break
finally:
    # Always close the browser, even when scraping fails part-way through.
    browser.quit()