产品没有被网络抓取

时间:2021-01-05 10:08:21

标签: python html selenium-webdriver web-scraping amazon

import csv
from bs4 import BeautifulSoup
from selenium import webdriver

driver = webdriver.Chrome(executable_path=r"C:\Users\Pranavgupta\Downloads\chromedriver_win32 (1)\chromedriver.exe")

url = 'https://www.amazon.in'
driver.get(url)

def get_url(search_term):
    """Generate a url from search term"""
    template = 'https://www.amazon.in/s?k={}&ref=nb_sb_noss_1'
    search_term = search_term.replace(' ', '+')
    return template.format(search_term)

url = get_url('iphone')
print(url)
#https://www.amazon.in/s?k=iphone&ref=nb_sb_noss_1

driver.get(url)

#Extracting the collection 
soup = BeautifulSoup(driver.page_source, 'html.parser')
results = soup.find_all('div', {'data-component-type': 's-search-result'})
len(results)
print(len(results))

#Prototype the record 
item = results[0]
atag = item.h2.a
description = atag.text.strip()
print(description)

url = 'https://www.amazon.in/' + atag.get('href')

price_parent = item.find('span', 'a-price')
price = price_parent.find('span', 'a-offscreen').text
print(price)

rating = item.i.text
print(rating)

review_count = item.find('span', {'class': 'a-size-base', 'dir': 'auto'}).text
print(review_count)

#Generalize the pattern
def extract_record(item):
        """Extract and return data from a single record"""

        #description and url 
        atag = item.h2.a
        description = atag.text.strip()
        url = 'https://www.amazon.in/' + atag.get('href')

        #price 
        price_parent = item.find('span', 'a-price')
        price = price_parent.find('span', 'a-offscreen').text
        print(price)

        #rank and rating 
        rating = item.i.text
        print(rating)
        review_count = item.find('span', {'class': 'a-size-base', 'dir': 'auto'}).text
        print(review_count)

        result = (description, price, rating, review_count, url)
        
        return result 

records = []
results = soup.find_all('div', {'data-component-type': 's-search-result'})

for itme in results:
    records.append(extract_record(item))

#Error Handling 
def extract_record(item):
        """Extract and return data from a single record"""

        #description and url 
        atag = item.h2.a
        description = atag.text.strip()
        url = 'https://www.amazon.in' + atag.get('href')

        try:
            #price 
            price_parent = item.find('span', 'a-price')
            price = price_parent.find('span', 'a-offscreen').text
            print(price)
        except AttributeError:
            return

        try:
            #rank and rating 
            rating = item.i.text
            print(rating)
            review_count = item.find('span', {'class': 'a-size-base', 'dir': 'auto'}).text
            print(review_count)
        except AttributeError:
            rating = ''
            review = ''

        result = (description, price, rating, review_count, url)
        
        return result 

records = []
results = soup.find_all('div', {'data-component-type': 's-search-result'})

for itme in results:
    record = extract_record(item)
    if record:
        records.append(record)

records[0]

for row in records:
    print(row[1])

#Getting the next page 
def get_url(search_term):
    """Generate a url from search term"""
    template = 'https://www.amazon.in/s?k={}&ref=nb_sb_noss_1'
    search_term = search_term.replace(' ', '+')
    
    #add term query to url 
    url = template.format(search_term)

    #add page query placeholder 
    url += '&page{}'

    return url

#Putting it all together 
import csv
from bs4 import BeautifulSoup
from selenium import webdriver
driver = webdriver.Chrome(executable_path=r"C:\Users\Pranavgupta\Downloads\chromedriver_win32 (1)\chromedriver.exe")

#Getting the next page 
def get_url(search_term):
    """Generate a url from search term"""
    template = 'https://www.amazon.in/s?k={}&ref=nb_sb_noss_1'
    search_term = search_term.replace(' ', '+')
    
    #add term query to url 
    url = template.format(search_term)

    #add page query placeholder 
    url += '&page{}'

    return url

def extract_record(item):
        """Extract and return data from a single record"""

        #description and url 
        atag = item.h2.a
        description = atag.text.strip()
        url = 'https://www.amazon.in' + atag.get('href')

        try:
            #price 
            price_parent = item.find('span', 'a-price')
            price = price_parent.find('span', 'a-offscreen').text
            print(price)
        except AttributeError:
            return

        try:
            #rank and rating 
            rating = item.i.text
            print(rating)
            review_count = item.find('span', {'class': 'a-size-base', 'dir': 'auto'}).text
            print(review_count)
        except AttributeError:
            rating = ''
            review = ''

        result = (description, price, rating, review_count, url)
        
        return result 

def main(search_term):
    """Run main program routine """
    #startup the webdriver
    driver = webdriver.Chrome(executable_path=r"C:\Users\Pranavgupta\Downloads\chromedriver_win32 (1)\chromedriver.exe") 

    record = []
    url = get_url(search_term)

    for page in range(1, 21):
        driver.get(url.format(page))
        soup = BeautifulSoup(driver.page_source,'html.parser')
        results = soup.find_all('div', {'data-component-type': 's-search-result'})

        for item in results:
            record = extract_record(item)
            if record:
                records.append(record)

driver.close()

#save data to csv file 
with open('results.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['Description', 'Price', 'Rating', 'ReviewCount', 'URL'])
    writer.writerows(records)

main = ('iphone')

因此,在运行程序后,当在受尊重的文件夹中生成 CSV 文件时,将看不到必须抓取的产品,例如必须从亚马逊抓取 iphone,但 iphone 值的第一个结果以 CSV 格式出现相反,页面上的所有产品值都必须出现。

enter image description here 显示 CSV 文件

0 个答案:

没有答案