Hi, I am trying to scrape data about couches from the website below. I have written a crawler; the code is attached. The crawler keeps returning the same 30 search results from the first page, so it seems it never actually moves on to the next page.
## Web scraper to collect product images and their associated prices
## from e-commerce result pages.
# Inputs required:
# 1) A CSV file containing the links to the result pages for a
#    particular query
# 2) A file path where the images and the product prices are to be
#    stored
import csv
import urllib.request as urdown

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

# Modify this to the path where the downloaded product images and
# prices are to be stored.
data_folder = "/Users/adasgupta/Documents/Softwares/Pycharm/Data/"
data_extn = ".jpg"    # image files
data_extn2 = ".txt"   # product-description files
data_prices = "prices_couches.csv"
price_rows = []  # one [image URL, image number, price] row per product

# Read the list of URLs. Modify the file name to the relevant CSV file
# containing the URLs of all the result pages.
with open('horchow_couches.csv') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    tot_count = 0  # modify this to start numbering the images elsewhere
    for row in csv_reader:
        url = row[0].replace(u'\ufeff', '')  # strip a leading BOM, if any
        # NB: redirected pages will not return 200 and will be skipped.
        page = requests.get(url, allow_redirects=False)
        if page.status_code == 200:
            soup = bs(page.content, 'html.parser')
            count = 0
            home_a_cl = soup.find_all("a", {"class": "mobile-logo"})
            home_page = home_a_cl[0].get('href')
            list_cl = soup.find_all("li", {"class": "category-item"})
            print("No of results found = ", len(list_cl))
            for item in list_cl:
                list_a_cl = item.find_all("a", {"class": "product-thumbnail-image"})
                if len(list_a_cl) == 0:
                    continue
                page_link = home_page + list_a_cl[0].get('href')
                print(page_link)
                # Visit each product page to extract the details.
                prod_page = requests.get(page_link)
                if prod_page.status_code != 200:
                    continue
                soup2 = bs(prod_page.content, 'html.parser')
                # Extract the price and normalise it to a float.
                prod_price = soup2.find(itemprop="price").get_text()
                for junk in ('\t', '\n', '$', ','):
                    prod_price = prod_price.replace(junk, '')
                prod_price = float(prod_price)
                print(prod_price)
                # Extract the image URL and the product description.
                prod_grid1 = soup2.find_all(
                    "div",
                    {"class": "hero-zoom-frame gutter-top-half mobile-grid-100 "
                              "tablet-grid-100 grid-100 grid-parent line-item-data"})
                prod_grid2 = prod_grid1[0].find_all("img", {"itemprop": "image"})
                prod_im = prod_grid2[0].get("src")
                prod_met_data = prod_grid1[0].find_all("div", {"itemprop": "description"})
                im_no = str(1 + count + tot_count)
                # Save the product description to a text file.
                with open(data_folder + im_no + data_extn2, 'w') as f:
                    print(prod_met_data[0], file=f)
                price_rows.append([prod_im, im_no, prod_price])
                # Save the image; skip the product if the download fails.
                try:
                    urdown.urlretrieve(prod_im, data_folder + im_no + data_extn)
                except Exception:
                    continue
                count += 1
                print(count)
            tot_count += count
            print(f'Parsed {tot_count}.')

# Write the collected product prices to a CSV file.
pd.DataFrame(price_rows).to_csv(data_folder + data_prices, sep=',',
                                encoding='utf-8')
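
To narrow this down, I put together a small check (just a sketch, using only requests and hashlib; the two URLs would be whichever rows of horchow_couches.csv point at pages 1 and 2). It shows whether the server redirects the paginated URL or simply returns an identical body:

    import hashlib

    import requests

    def compare_result_pages(url_a, url_b):
        """Fetch two result-page URLs and report whether the server
        really returns different content for them."""
        for url in (url_a, url_b):
            resp = requests.get(url, allow_redirects=True, timeout=30)
            print(url)
            print("  final URL:", resp.url)  # differs from url if redirected
            print("  body MD5 :", hashlib.md5(resp.content).hexdigest())

    # Example: pass the first two URLs from horchow_couches.csv.
    # compare_result_pages(page1_url, page2_url)

If the two MD5 digests come out identical, or the final URL collapses back to the page-1 URL, then the paging part of the URL is being ignored on the server side, which would explain the repeated 30 results.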
Basically, for every one of the successive-page URLs in my list, I just keep getting all of page 1's data back.
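
If the paging really is handled server-side, one workaround I am considering is to follow the site's own next-page link instead of pre-building the URLs in a CSV. This is only a sketch: the rel="next" selector and the User-Agent header are assumptions about the site, not something I have verified against its actual markup:

    from urllib.parse import urljoin

    import requests
    from bs4 import BeautifulSoup as bs

    def iter_result_pages(start_url, max_pages=20):
        """Yield a parsed soup for each results page, following the
        page's own next-page link rather than a pre-built URL list."""
        session = requests.Session()
        # Assumption: the site may page correctly only for browser-like
        # user agents.
        session.headers["User-Agent"] = "Mozilla/5.0"
        url = start_url
        for _ in range(max_pages):
            page = session.get(url, allow_redirects=True)
            if page.status_code != 200:
                break
            soup = bs(page.content, 'html.parser')
            yield soup
            # Assumption: the results page exposes an <a rel="next">
            # link; the real selector must be checked in the markup.
            nxt = soup.find("a", rel="next")
            if nxt is None or not nxt.get("href"):
                break
            url = urljoin(url, nxt["href"])  # resolve relative links

Each yielded soup could then be fed through the same li / category-item extraction as in the script above.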