import csv
import requests
from bs4 import BeautifulSoup
# Catalog pages to scrape; each lists up to 25 products.
urls = [
    "https://www.medplusmedicalsupply.com/exam-and-diagnostic?product_list_limit=25",
    "https://www.medplusmedicalsupply.com/exam-and-diagnostic?p=2&product_list_limit=25",
]

# Padding characters that surround the scraped text nodes
# (whitespace, quotes, colons). A single strip() with this set is
# idempotent, so the original's repeated strip() calls were redundant.
STRIP_CHARS = '\n\r\t": '

all_product = []  # accumulated across ALL pages (was reset per page before)
for url in urls:
    # Fetch each page individually. Passing the whole `urls` list to
    # requests.get() raises InvalidSchema ("No connection adapters...").
    html = requests.get(url).text
    soup = BeautifulSoup(html, "html.parser")
    products = soup.findAll('div', {"class": "product details product-item-details"})
    for product in products:
        product_details = {
            'name': product.find('a').text.strip(STRIP_CHARS),
            'brand': product.find('div', {'class': 'value'}).text.strip(STRIP_CHARS),
            'packaging': product.find('div', {'class': 'pack'}).text.strip(STRIP_CHARS),
            'availability': product.find('div', {'class': 'avail pack'}).text.strip(STRIP_CHARS),
            'price': product.find('span', {'class': 'price'}).text.strip(STRIP_CHARS),
        }
        # Cut the fixed-width label prefixes left in the scraped text
        # ("Packing: \n\n" and "Availability: \n\n").
        product_details['packaging'] = product_details['packaging'][9:]
        product_details['availability'] = product_details['availability'][16:]
        all_product.append(product_details)

print(all_product)

# newline='' is required by the csv module to avoid blank rows on Windows.
with open('products.csv', 'w', newline='') as csvFile:
    writer = csv.writer(csvFile)
    writer.writerow(['Name', 'Brand', 'Packaging', 'Availability', 'Price'])
    for product in all_product:
        writer.writerow([product['name'], product['brand'], product['packaging'],
                         product['availability'], product['price']])
这是尝试两个URL时的错误代码:
InvalidSchema: No connection adapters were found for '['https://www.medplusmedicalsupply.com/exam-and-diagnostic?product_list_limit=25', 'https://www.medplusmedicalsupply.com/exam-and-diagnostic?p=2&product_list_limit=25']'
我一直想知道是否有一种方法可以生成无限页面,而不是将URL手动放置在urls变量中。我要抓取的网站上有成千上万个具有许多页面的产品。感谢您的帮助!
答案 0 :(得分:1)
您原来有
// NOTE(review): Android/Java fragment — unrelated to the Python scraping
// question; presumably pasted into this answer by mistake during scraping.
// Kept verbatim.
// On click: records that the app's first run has completed by writing
// firstRun=false into the "prefs" SharedPreferences file, then closes
// the current Activity.
btn.setOnClickListener(new View.OnClickListener() {
@Override
public void onClick(View view) {
// Open the "prefs" preferences file (mode 0 = MODE_PRIVATE).
SharedPreferences settings = getSharedPreferences("prefs", 0);
SharedPreferences.Editor editor = settings.edit();
editor.putBoolean("firstRun", false);
// apply() persists asynchronously (vs. commit(), which blocks).
editor.apply();
finish();
}
});
不要把整个 urls 列表传给 requests.get:把 html = requests.get(urls).text 中的 urls 改成 url。
否则你请求的是整个数组,而不是循环中的每一个 url。
答案 1 :(得分:0)
只要URL相同,就可以使用数字循环并传递页码
# Build the paginated catalog URLs from the page number instead of
# listing each URL by hand; pages 1 through 5.
for page in range(1, 6):
    url = f"https://www.medplusmedicalsupply.com/exam-and-diagnostic?p={page}&product_list_limit=25"
    print(url)
    # Do requests stuff here
产生:
https://www.medplusmedicalsupply.com/exam-and-diagnostic?p=1&product_list_limit=25
https://www.medplusmedicalsupply.com/exam-and-diagnostic?p=2&product_list_limit=25
https://www.medplusmedicalsupply.com/exam-and-diagnostic?p=3&product_list_limit=25
https://www.medplusmedicalsupply.com/exam-and-diagnostic?p=4&product_list_limit=25
https://www.medplusmedicalsupply.com/exam-and-diagnostic?p=5&product_list_limit=25
注意:它可能只是粘贴而已,但是看起来您的代码中可能存在缩进问题,可能会影响每个循环中发生的事情
答案 2 :(得分:0)
您几乎完成了代码,但如果必须访问多网址并保存所有数据,则应遵循此步骤。
我的完整代码
import csv
import requests
from bs4 import BeautifulSoup
# Catalog pages to scrape; each lists up to 25 products.
urls = ["https://www.medplusmedicalsupply.com/exam-and-diagnostic?product_list_limit=25",
        "https://www.medplusmedicalsupply.com/exam-and-diagnostic?p=2&product_list_limit=25"]

# Padding characters around the scraped text nodes. strip() with this set
# is idempotent, so one call replaces the original's repeated calls.
STRIP_CHARS = '\n\r\t": '

# Single pass: parse each page as it is fetched instead of first
# collecting per-page result lists and re-looping over them.
resultset = []  # one dict per product, across every page
for url in urls:
    # Request each page individually — requests.get() cannot take a list.
    html = requests.get(url).text
    soup = BeautifulSoup(html, "html.parser")
    products = soup.findAll('div', {"class": "product details product-item-details"})
    for product in products:
        product_details = dict()
        product_details['name'] = product.find('a').text.strip(STRIP_CHARS)
        product_details['brand'] = product.find('div', {'class': 'value'}).text.strip(STRIP_CHARS)
        product_details['packaging'] = product.find('div', {'class': 'pack'}).text.strip(STRIP_CHARS)
        product_details['availability'] = product.find('div', {'class': 'avail pack'}).text.strip(STRIP_CHARS)
        product_details['price'] = product.find('span', {'class': 'price'}).text.strip(STRIP_CHARS)
        # Cut the fixed-width label prefixes left in the scraped text.
        product_details['packaging'] = product_details['packaging'][9:]  # drops "Packing: \n\n"
        product_details['availability'] = product_details['availability'][16:]  # drops "Availability: \n\n"
        resultset.append(product_details)

# Fix: the original had a stray double comma ('w+',,newline='') which is a
# SyntaxError. newline='' keeps csv from writing blank rows on Windows.
with open('products.csv', 'w+', newline='') as csvFile:
    writer = csv.writer(csvFile)
    writer.writerow(['Name', 'Brand', 'Packaging', 'Availability', 'Price'])
    for product in resultset:
        writer.writerow([product['name'], product['brand'], product['packaging'],
                         product['availability'], product['price']])