import csv
from bs4 import BeautifulSoup
from selenium import webdriver
# Start a Chrome session through a chromedriver binary at a hard-coded,
# machine-specific Windows path.
# NOTE(review): newer Selenium releases replace the `executable_path` keyword
# with a Service object - confirm against the installed Selenium version.
driver = webdriver.Chrome(executable_path=r"C:\Users\Pranavgupta\Downloads\chromedriver_win32 (1)\chromedriver.exe")
# Open the Amazon India home page in that session.
url = 'https://www.amazon.in'
driver.get(url)
def get_url(search_term):
    """Generate the Amazon India search URL for a search term.

    Args:
        search_term: Free-text query, e.g. ``'iphone 12'``.

    Returns:
        The search URL with the term placed in the ``k`` query parameter.
        Spaces become ``+`` and every other reserved character is
        percent-encoded.
    """
    # Local import keeps this function usable without touching the file header.
    from urllib.parse import quote_plus

    template = 'https://www.amazon.in/s?k={}&ref=nb_sb_noss_1'
    # quote_plus (rather than a bare space -> '+' replace) stops characters
    # such as '&' or '#' in the term from corrupting the query string.
    return template.format(quote_plus(search_term))
# Build the search URL for a sample query and load it in the browser.
url = get_url('iphone')
print(url)
# Example output: https://www.amazon.in/s?k=iphone&ref=nb_sb_noss_1
driver.get(url)

# Extracting the collection: every search hit is a <div> tagged with
# data-component-type="s-search-result".
soup = BeautifulSoup(driver.page_source, 'html.parser')
results = soup.find_all('div', {'data-component-type': 's-search-result'})
print(len(results))

# Prototype the record on the first hit only.
# NOTE: results[0] raises IndexError when the page yielded no results.
item = results[0]
atag = item.h2.a
description = atag.text.strip()
print(description)
# No trailing slash on the host: it is concatenated directly with the href,
# and a trailing slash here would produce 'amazon.in//...' for hrefs that
# begin with '/'.
url = 'https://www.amazon.in' + atag.get('href')
price_parent = item.find('span', 'a-price')
price = price_parent.find('span', 'a-offscreen').text
print(price)
rating = item.i.text
print(rating)
review_count = item.find('span', {'class': 'a-size-base', 'dir': 'auto'}).text
print(review_count)
#Generalize the pattern
def extract_record(item):
    """Extract and return data from a single search-result element.

    Args:
        item: One search-result element exposing ``.h2.a``, ``.i`` and
            ``.find(...)`` (in this file, an entry of
            ``soup.find_all(...)``).

    Returns:
        A tuple ``(description, price, rating, review_count, url)``.

    Raises:
        AttributeError: If the link, price, rating or review-count element
            is missing, because the lookup then yields ``None``.

    The price, rating and review count are also echoed to stdout.
    """
    # description and url
    atag = item.h2.a
    description = atag.text.strip()
    # No trailing slash on the host: it is concatenated directly with the
    # href, and a trailing slash would give 'amazon.in//...' for hrefs that
    # begin with '/'.
    url = 'https://www.amazon.in' + atag.get('href')

    # price
    price_parent = item.find('span', 'a-price')
    price = price_parent.find('span', 'a-offscreen').text
    print(price)

    # rank and rating
    rating = item.i.text
    print(rating)
    review_count = item.find('span', {'class': 'a-size-base', 'dir': 'auto'}).text
    print(review_count)

    result = (description, price, rating, review_count, url)
    return result
# Run the extractor over every search result on the page.
records = []
results = soup.find_all('div', {'data-component-type': 's-search-result'})
# The loop variable must be `item`: with the earlier misspelling the call
# below kept re-reading the stale module-level `item` (the first result).
for item in results:
    records.append(extract_record(item))
#Error Handling
def extract_record(item):
    """Extract and return data from a single search-result element.

    Args:
        item: One search-result element exposing ``.h2.a``, ``.i`` and
            ``.find(...)`` (in this file, an entry of
            ``soup.find_all(...)``).

    Returns:
        A tuple ``(description, price, rating, review_count, url)``, or
        ``None`` when no price can be found. ``rating`` and
        ``review_count`` are both empty strings when either is missing.

    Raises:
        AttributeError: If the element has no ``h2``/``a`` link.

    The price, rating and review count are also echoed to stdout.
    """
    # description and url
    atag = item.h2.a
    description = atag.text.strip()
    url = 'https://www.amazon.in' + atag.get('href')

    try:
        # price
        price_parent = item.find('span', 'a-price')
        price = price_parent.find('span', 'a-offscreen').text
        print(price)
    except AttributeError:
        # A lookup returned None: no price on this result, so skip it.
        return None

    try:
        # rank and rating
        rating = item.i.text
        print(rating)
        review_count = item.find('span', {'class': 'a-size-base', 'dir': 'auto'}).text
        print(review_count)
    except AttributeError:
        rating = ''
        # Must set `review_count` (not `review`), otherwise the tuple below
        # fails with UnboundLocalError for results without a rating.
        review_count = ''

    result = (description, price, rating, review_count, url)
    return result
# Re-run the extraction, keeping only results that produced a record
# (extract_record returns None for results without a price).
records = []
results = soup.find_all('div', {'data-component-type': 's-search-result'})
# The loop variable must be `item`: with the earlier misspelling the call
# below kept re-reading the stale module-level `item` (the first result).
for item in results:
    record = extract_record(item)
    if record:
        records.append(record)

# Show the price column of every collected record.
for row in records:
    print(row[1])
#Getting the next page
def get_url(search_term):
    """Generate a paginated Amazon India search URL template.

    Args:
        search_term: Free-text query, e.g. ``'iphone 12'``.

    Returns:
        A URL string ending in ``&page={}``. The caller fills in the page
        number with ``url.format(page)``.
    """
    # Local import keeps this function usable without touching the file header.
    from urllib.parse import quote_plus

    template = 'https://www.amazon.in/s?k={}&ref=nb_sb_noss_1'
    # add term query to url; quote_plus turns spaces into '+' and
    # percent-encodes everything else, including '{' and '}', so the term
    # cannot interfere with the later .format(page) call.
    url = template.format(quote_plus(search_term))
    # add page query placeholder ('page=' needs the '=' to be a valid
    # key/value pair)
    url += '&page={}'
    return url
#Putting it all together
import csv
from bs4 import BeautifulSoup
from selenium import webdriver
# Start a Chrome session through a chromedriver binary at a hard-coded,
# machine-specific Windows path.
# NOTE(review): main() further down starts its own Chrome session, so this
# module-level driver looks redundant and is never closed in the visible
# code - confirm before removing.
driver = webdriver.Chrome(executable_path=r"C:\Users\Pranavgupta\Downloads\chromedriver_win32 (1)\chromedriver.exe")
#Getting the next page
def get_url(search_term):
    """Generate a paginated Amazon India search URL template.

    Args:
        search_term: Free-text query, e.g. ``'iphone 12'``.

    Returns:
        A URL string ending in ``&page={}``. The caller fills in the page
        number with ``url.format(page)``.
    """
    # Local import keeps this function usable without touching the file header.
    from urllib.parse import quote_plus

    template = 'https://www.amazon.in/s?k={}&ref=nb_sb_noss_1'
    # add term query to url; quote_plus turns spaces into '+' and
    # percent-encodes everything else, including '{' and '}', so the term
    # cannot interfere with the later .format(page) call.
    url = template.format(quote_plus(search_term))
    # add page query placeholder ('page=' needs the '=' to be a valid
    # key/value pair)
    url += '&page={}'
    return url
def extract_record(item):
    """Extract and return data from a single search-result element.

    Args:
        item: One search-result element exposing ``.h2.a``, ``.i`` and
            ``.find(...)`` (in this file, an entry of
            ``soup.find_all(...)``).

    Returns:
        A tuple ``(description, price, rating, review_count, url)``, or
        ``None`` when no price can be found. ``rating`` and
        ``review_count`` are both empty strings when either is missing.

    Raises:
        AttributeError: If the element has no ``h2``/``a`` link.

    The price, rating and review count are also echoed to stdout.
    """
    # description and url
    atag = item.h2.a
    description = atag.text.strip()
    url = 'https://www.amazon.in' + atag.get('href')

    try:
        # price
        price_parent = item.find('span', 'a-price')
        price = price_parent.find('span', 'a-offscreen').text
        print(price)
    except AttributeError:
        # A lookup returned None: no price on this result, so skip it.
        return None

    try:
        # rank and rating
        rating = item.i.text
        print(rating)
        review_count = item.find('span', {'class': 'a-size-base', 'dir': 'auto'}).text
        print(review_count)
    except AttributeError:
        rating = ''
        # Must set `review_count` (not `review`), otherwise the tuple below
        # fails with UnboundLocalError for results without a rating.
        review_count = ''

    result = (description, price, rating, review_count, url)
    return result
def main(search_term, max_pages=20, output_path='results.csv'):
    """Run main program routine: scrape search results and save them as CSV.

    Args:
        search_term: Query passed to ``get_url``.
        max_pages: Number of result pages to visit, starting at page 1.
            Defaults to 20.
        output_path: Destination CSV file; overwritten if it exists.
            Defaults to ``'results.csv'``.

    The CSV has a header row followed by one row per record for which
    ``extract_record`` returned a truthy value.
    """
    # startup the webdriver
    driver = webdriver.Chrome(executable_path=r"C:\Users\Pranavgupta\Downloads\chromedriver_win32 (1)\chromedriver.exe")
    # Local accumulator. It must be named `records` (plural): the loop below
    # appends to it, and a misnamed list would send rows to whatever
    # module-level `records` happens to exist.
    records = []
    url = get_url(search_term)

    try:
        for page in range(1, max_pages + 1):
            driver.get(url.format(page))
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            results = soup.find_all('div', {'data-component-type': 's-search-result'})
            for item in results:
                record = extract_record(item)
                if record:
                    records.append(record)
    finally:
        # Always end the browser session, even if a page fails to load or
        # parse; quit() also shuts down the chromedriver process.
        driver.quit()

    # save data to csv file
    with open(output_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Description', 'Price', 'Rating', 'ReviewCount', 'URL'])
        writer.writerows(records)
# Call main() for the sample query. The previous `main = (...)` assignment
# only rebound the name `main` to a string and never ran the scraper.
if __name__ == '__main__':
    main('iphone')
因此,运行程序后,虽然在相应的文件夹中生成了 CSV 文件,但其中看不到本应抓取的全部产品。例如,要从亚马逊抓取 iphone 时,CSV 中只出现了 iphone 的第一条结果;而实际上,页面上所有产品的数据都应该出现在 CSV 中。