使用Python和Beautiful Soup抓取使用AJAX加载内容的网站

时间:2018-03-21 10:00:25

标签: python

我想从页面中抓取产品名称、价格和图片地址(src),但只能得到有限的结果。这是我想要抓取的网站:https://www.walmart.com/browse/cell-phones/unlocked-phones/1105910_1073085 。我还想抓取筛选条件的复选框,但不知道该如何抓取全部结果——目前只显示10个结果,我应该怎么做才能抓取完整的结果?如果我去掉请求头(headers),名称和价格的结果可以完整显示,但图片地址却抓取不到。{

# Browser-like headers sent with the page request.
headers = {
    "Accept-Language": "en-US,en;q=0.5",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Referer": "http://thewebsite.com",
    "Connection": "keep-alive",
}

# Class attribute of the <div> that wraps both the product image and its link.
IMAGE_BOX_ATTRS = {"class": "display-inline-block pull-left prod-ProductCard--Image"}


def _find_in(parent, *criteria):
    """Return ``parent.find(*criteria)``, or None when *parent* is None."""
    return parent.find(*criteria) if parent is not None else None


def _text_of(tag):
    """Return the text of *tag*, or None when the tag was not found."""
    return tag.text if tag is not None else None


def _attr_of(tag, attr):
    """Return attribute *attr* of *tag*, or None when tag or attribute is missing."""
    return tag.get(attr) if tag is not None else None


def _product_record(title_link, price_tag, image_box):
    """Build one result dict from the tags located for a single product.

    Any of the arguments may be None; the matching fields are then None
    instead of raising, so one incomplete product card does not abort the run.
    """
    return {
        "title": _text_of(title_link),
        "price": _text_of(price_tag),
        "img": _attr_of(_find_in(image_box, "img", {"class": "Tile-img"}), "src"),
        "href": _attr_of(_find_in(image_box, "a"), "href"),
    }


# NOTE(review): this only parses the HTML returned for this single request;
# presumably products added later by the page's JavaScript are not in it,
# which would explain the limited number of results -- confirm.
scrap = requests.get(
    "https://www.walmart.com/browse/cell-phones/unlocked-phones/1105910_1073085",
    headers=headers,
)
content = BeautifulSoup(scrap.text, "html.parser")

# One dict per product: title, price, img (image src) and href (product link).
results = []

# findAll() returns a (possibly empty) list, never None, so iterating it
# directly covers the "nothing found" case without a separate check.

# Products rendered in the grid-view layout.
for product in content.findAll("div", {"class": "search-result-gridview-item-wrapper"}):
    title_box = product.find("div", {"class": "search-result-product-title gridview"})
    results.append(_product_record(
        _find_in(title_box, "a"),
        product.find("div", {"class": "search-result-productprice gridview enable-2price-2"}),
        product.find("div", IMAGE_BOX_ATTRS),
    ))

# Products rendered in the list-view layout; entries without a price are skipped.
for product in content.findAll("div", {"class": "search-result-listview-item clearfix"}):
    price_tag = product.find("span", {"class": "Price-group"})
    if price_tag is None:
        continue
    results.append(_product_record(
        product.find("a", {"class": "product-title-link"}),
        price_tag,
        product.find("div", IMAGE_BOX_ATTRS),
    ))

}

1 个答案:

答案 0 :(得分:0)

请参阅以下示例代码,了解如何从该网站获取数据。我只是加入了页面交互(点击“下一页”按钮),但这应该能让你了解大致思路——(你需要使用浏览器的“检查元素”功能来找到对应的 XPath)

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, WebDriverException
from selenium.webdriver.common.by import By

# Page to scrape and the XPath selectors used on every result page.
START_URL = "https://www.walmart.com/browse/cell-phones/unlocked-phones/1105910_1073085"
PRICE_BLOCK_XPATH = "//div[contains(@class, 'price-main-block')]"
PRODUCT_LINK_XPATH = "//a[contains(@class, 'product-title-link')]"
IMAGE_XPATH = "//img[contains(@class, 'Tile-img')]"
PRICE_GROUP_XPATH = ".//span[contains(@class, 'Price-group')]"  # relative to a price block
NEXT_BUTTON_XPATH = "//button[contains(@class, 'paginator-btn paginator-btn-next')]"
COLUMNS = ['prodname', 'imageurl', 'minprice', 'maxprice', 'actualprice']


def _scrape_current_page(browser):
    """Return one row per product found on the page currently loaded in *browser*.

    Each row is ``[prodname, imageurl, minprice, maxprice, actualprice]``.
    A product with two or more price spans is treated as a price range
    (``minprice``/``maxprice`` set, ``actualprice`` None); one span gives
    ``actualprice`` only; no span leaves all three price fields None.
    """
    price_blocks = browser.find_elements(By.XPATH, PRICE_BLOCK_XPATH)
    product_links = browser.find_elements(By.XPATH, PRODUCT_LINK_XPATH)
    images = browser.find_elements(By.XPATH, IMAGE_XPATH)

    rows = []
    # zip() stops at the shortest list, so a page with fewer images or price
    # blocks than product links no longer raises IndexError.
    for link, image, price_block in zip(product_links, images, price_blocks):
        price_spans = price_block.find_elements(By.XPATH, PRICE_GROUP_XPATH)
        titles = [span.get_attribute("title") for span in price_spans[:2]]
        if len(titles) > 1:
            minprice, maxprice, actualprice = titles[0], titles[1], None
        elif titles:
            minprice, maxprice, actualprice = None, None, titles[0]
        else:
            minprice = maxprice = actualprice = None
        rows.append([
            link.get_attribute("aria-label"),
            image.get_attribute("src"),
            minprice,
            maxprice,
            actualprice,
        ])
    return rows


# NOTE(review): recent Selenium releases may expect the driver path to be
# passed through a Service object instead of positionally -- confirm against
# the installed version.
browser = webdriver.Chrome("./chromedriver")  # path to a downloaded chromedriver
rows = []
try:
    browser.get(START_URL)  # open page in browser
    rows.extend(_scrape_current_page(browser))

    # Follow the "next" button until it is missing or disabled.
    # NOTE(review): there is no explicit wait after the click; if the results
    # are refreshed asynchronously the old page may be read again -- confirm.
    while True:
        try:
            next_button = browser.find_element(By.XPATH, NEXT_BUTTON_XPATH)
            if not next_button.is_enabled():
                break
            next_button.click()
            # Every product row is collected here, including price ranges.
            rows.extend(_scrape_current_page(browser))
        except NoSuchElementException:
            # No next button: the last page has been reached.
            break
        except WebDriverException:
            print("Something went wrong")
            break
finally:
    # Always close the browser, even when scraping fails part-way.
    browser.quit()

# Build the result table once instead of growing it row by row.
outDF = pd.DataFrame(rows, columns=COLUMNS)