如何使用“ :: before”

时间:2019-03-07 01:48:58

标签: python json selenium web-scraping webdriver

我正在尝试从PLP获取url,并访问每个元素以从PDP获取某些关键字并将其转储到json文件中。但是,该列表仅返回1个数据。我怀疑该网站正试图阻止该行动。我每月使用此程序一次,以查看新项目中添加了哪些新功能。

“ ***”之间的代码是我遇到的问题。它返回正确的值,但仅返回1个数据。如何获取更多数据?在下面的示例中,我仅获取产品名称以使其变得简单。

示例网址:“ https://store.nike.com/us/en_us/pw/mens-running-shoes/7puZ8yzZoi3

enter image description here

实际元素

<div class="exp-product-wall clearfix">
    ::before
    <div class="grid-item fullSize" data-pdpurl="https://www.nike.com/t/epic-react-flyknit-2-mens-running-shoe-459stf" data-column-index="0" data-item-index="1">
                                    <div class="grid-item-box">
                                      <div class="grid-item-content">
                                        <div class="grid-item-image">
                                          <div class="grid-item-image-wrapper sprite-sheet sprite-index-1">
                                            <a href="https://www.nike.com/t/epic-react-flyknit-2-mens-running-shoe-459stf">
                                              <img src="https://images.nike.com/is/image/DotCom/pwp_sheet2?$NIKE_PWPx3$&amp;$img0=BQ8928_001&amp;$img1=BQ8928_003&amp;$img2=BQ8928_005">

在工作代码下方

import selenium
import json
import time
import re
import string
import requests
import bs4
from selenium import webdriver
from selenium.webdriver import Firefox
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

domain =  'website url goes here'

def prepare_driver(url):
    '''Returns a Firefox Webdriver.'''
    options = Options()
    # options.add_argument('-headless')
    driver = webdriver.Chrome(executable_path='location to chromedriver')
    driver.get(url)

    wait = WebDriverWait(driver, 10).until(EC.presence_of_element_located(
        (By.CLASS_NAME, 'product-name ')))
    time.sleep(2)
    return driver

def fill_form(driver, search_argument):
    '''Finds all the input tags in form and makes a POST requests.'''
    #search_field = driver.find_element_by_id('q')
    #search_field.send_keys(search_argument)
    # We look for the search button and click it
    #driver.find_element_by_class_name('search__submit')\
        #.click()
    wait = WebDriverWait(driver, timeout=10).until(
        EC.presence_of_all_elements_located(
            (By.CLASS_NAME, 'product-name ')))

def scrape_results(driver, n_results):
    '''Returns the data from n_results amount of results.'''

    products_urls = list()
    products_data = list()

    ***for product_title in driver.find_elements_by_xpath('//div[@class="exp-gridwall-content clearfix"]'):
        products_urls.append(product_title.find_element_by_xpath(
            '//div[@class="grid-item fullSize"]').get_attribute('data-pdpurl'))***

    for url in range(0, n_results):
        if url == n_results:
            break
        url_data = scrape_product_data(driver, products_urls[url])
        products_data.append(url_data)

        return products_data

def scrape_product_data(driver, product_url):
    '''Visits an product page and extracts the data.'''

    if driver == None:
        driver = prepare_driver(product_url)

    driver.get(product_url)
    time.sleep(12)

    product_fields = dict()
    # Get the product name
    product_fields['product_name'] = driver.find_element_by_xpath(
        '//h1[@id="pdp_product_title"]').get_attribute('textContent')
      #  .text.strip('name')
    return product_fields

if __name__ == '__main__':

    try:
        driver = prepare_driver(domain)
        #fill_form(driver, 'juniole tf')
        products_data = scrape_results(driver, 2)
        products_data = json.dumps(products_data, indent=4,ensure_ascii=False) #ensure_acii => changes japanese to correct character
        with open('data.json', 'w') as f:
            f.write(products_data)
    finally:
        driver.quit()

所需的json输出:

[
    {
        "product_name": "Nike Epic React Flyknit 2",
        "descr": "The Nike Epic React Flyknit 2 takes a step up from its predecessor with smooth, lightweight performance and a bold look. An updated Flyknit upper conforms to your foot with a minimal, supportive design. Underfoot, durable Nike React technology defies the odds by being both soft and responsive, for comfort that lasts as long as you can run."
    },
    {
        "product_name": "Nike Zoom Fly SP Fast Nathan Bell",
        "descr": "The Nike Zoom Fly SP Fast Nathan Bell is part of a collaboration with artist Nathan Bell, featuring hand-drawn graphics that celebrate running as a competition with yourself. It's designed to meet the demands of your toughest tempo runs, long runs and race day with a responsive construction that turns the pressure of each stride into energy return for the next."
    }
]

1 个答案:

答案 0 :(得分:0)

您可以轻松获取带有请求的网址。我的目标是data-pdpurl属性。在selenium循环中,您可能需要添加一些对位置请求的处理。循环期间需要短暂的等待,以防止无法获得对产品的错误声明。

import requests
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

d = webdriver.Chrome()
results = []

r = requests.get('https://store.nike.com/us/en_us/pw/mens-running-shoes/7puZ8yzZoi3')
soup = bs(r.content, 'lxml')
products = []
listings = soup.select('.grid-item')

for listing in listings:
    url = listing['data-pdpurl']
    title = listing.select_one('.product-display-name').text
    row = {'title' :title ,
           'url' : url}
    products.append(row)

for product in products:
    url = product['url']
    d.get(url)
    try:
        d.get(url)
        desc = WebDriverWait(d,10).until(EC.presence_of_element_located((By.CSS_SELECTOR, ".description-preview")))
        results.append({'product_name': product['title'],
                        'descr' : desc.text})
    except Exception as e:
        print(e, url)
    finally:
        time.sleep(1)

d.quit()
print(results)