在朋友的帮助下,我已经有了这段代码,并且已经获得了网站上的所有产品链接。我想进一步获取品牌(merk)、名称、价格、图片、产品说明以及产品链接。产品说明仅在点击进入产品详情页后才会出现。
我是 Python 的初学者。
"""Collect every product link from the paginated foundation listing on sociolla.com."""
from bs4 import BeautifulSoup
import urllib.request

count = 1
url = "https://www.sociolla.com/155-foundation?p=%d"


def get_url(url):
    """Open *url* and return the HTTP response object."""
    req = urllib.request.Request(url)
    return urllib.request.urlopen(req)


expected_url = url % count
response = get_url(expected_url)

link = []
name = []
merk = []
price = []
pic = []
description = []

# Past the last page the site redirects, so the response URL stops
# matching the page we requested -- that is the loop's exit condition.
while response.url == expected_url:
    soup = BeautifulSoup(response.read(), "html.parser")
    products = soup.find("div", {"id": "product-list-grid"})
    # NOTE(fix): the original wrapped this in `for i in products:`, which
    # re-ran findAll (and the appends) once per child node of the grid,
    # duplicating every link many times. A single pass is correct.
    data = products.findAll("div", {"class": "product-item"})
    for item in data:
        link.append(item["data-eec-href"])
    count += 1
    expected_url = url % count
    response = get_url(expected_url)

print(len(link))

"""
import csv
dataset=zip(link, merk, name, pic, price, description)
with open("foundation_sociolla.csv","w", newline='') as csvfile:
    writer=csv.writer(csvfile)
    header=['link', 'merk', 'name', 'pic', 'price', 'description']
    writer.writerow(header)
    writer.writerows(dataset)
"""
答案 0(得分:1):
您需要对每个产品的 URL 发出请求,解析该请求返回的内容,并从中提取所需的数据。
"""Scrape link, picture, price, name, make and description for every product
on the paginated foundation listing at sociolla.com.

The description only exists on each product's own page, so the script
fetches and parses every product page it discovers.
"""
from bs4 import BeautifulSoup
import urllib.request

count = 1
url = "https://www.sociolla.com/155-foundation?p=%d"


def get_url(url):
    """Open *url* and return the HTTP response object."""
    req = urllib.request.Request(url)
    return urllib.request.urlopen(req)


expected_url = url % count
response = get_url(expected_url)

link = []
name = []
make = []
price = []
pic = []
description = []

# Past the last page the site redirects, so the response URL stops
# matching the page we requested -- that ends the pagination loop.
while response.url == expected_url:
    soup = BeautifulSoup(response.read(), "html.parser")
    for product in soup.select("div.product-item"):
        product_url = product["data-eec-href"]
        link.append(product_url)

        # Details (price, description, ...) live on the product page
        # itself, so fetch and parse it separately.
        product_response = get_url(product_url)
        product_soup = BeautifulSoup(product_response.read(), "html.parser")

        # [0] indexing raises IndexError if a selector matches nothing,
        # which makes missing markup fail loudly rather than silently.
        product_pic = product_soup.select("img#bigpic")[0]["src"]
        pic.append(product_pic)

        product_price = product_soup.select("span#our_price_display")[0].text.strip()
        price.append(product_price)

        product_name = product_soup.select("div.detail-product-logo p")[0].text.strip()
        name.append(product_name)

        product_make = product_soup.select("div.detail-product-logo h3")[0].text.strip()
        make.append(product_make)

        product_description = product_soup.select("div#Details article")[0].text.strip()
        description.append(product_description)

        print(product_url, product_pic, product_price, product_name, product_make, product_description)

    count += 1
    expected_url = url % count
    response = get_url(expected_url)
但是,如果您要抓取很多页面,那么最好使用Scrapy https://scrapy.org/