将多个网页抓取到单个csv中

时间:2019-06-10 05:40:44

标签: python python-3.x web-scraping

我希望能够把多个 Newegg 网页中的数据抓取到同一个 csv 文件中。

我目前已经能把单个 Newegg 网页的数据抓取到一个 csv 文件中,但我希望能一次抓取多个页面。

from bs4 import BeautifulSoup

import requests

import bs4

from urllib.request import urlopen as uReq

from bs4 import BeautifulSoup as soup



#my_url = 'https://www.newegg.com/Desktop-Graphics-Cards/SubCategory/ID-48?PageSize=96'
# Question's script as posted: download a Newegg category page, pull the
# brand/title/shipping/price of each product container and write them as
# comma-separated rows to 99FINAL.csv.
# NOTE(review): the pagination code near the end is not valid Python as
# posted (see the notes there); the code is left exactly as in the question.
my_url = 'https://www.newegg.com/Desktop-Graphics-Cards/SubCategory/ID-48/Page-1?PageSize=96'



# Open a connection with urllib and read the raw HTML of the first page.

uClient = uReq(my_url)

page_html = uClient.read()

uClient.close()



# Parse the HTML (`soup` is the BeautifulSoup class alias from the imports).

page_soup = soup(page_html, "html.parser")



# Collect every product container of this first download.
# NOTE(review): this list is built once, before the loop, and is never
# refreshed from the pages fetched inside the loop.

containers = page_soup.findAll("div", {"class":"item-container"})



filename = "99FINAL.csv"

# NOTE(review): opened without a `with` block; closed by f.close() at the end.
f = open(filename, "w")

headers = "Brand, Title, Shipping, Price\n"



f.write(headers)

# only for information, not used in url
page = 0 

while True:

    print('---', page, '---')

    # Download my_url again, this time with requests.
    r = requests.get(my_url)

    # NOTE(review): this rebinds the name `soup` (the BeautifulSoup class
    # alias from the imports) to the parsed document of this page.
    soup = BeautifulSoup(r.content, "html.parser")

    # String substitution for HTML




    # NOTE(review): iterates `containers` from the initial download, not the
    # `soup` parsed just above.
    for container in containers:

        # The brand is taken from the same "item-title" link as the title;
        # only its first word is used when printing and writing below.
        brand_container = container.findAll("a", {"class":"item-title"})

        brand = brand_container[0].text



        title_container = container.findAll("a", {"class":"item-title"})

        title = title_container[0].text



        shipping_container = container.findAll("li", {"class":"price-ship"})

        shipping = shipping_container[0].text.strip()



        # The price is split across <strong> (whole part) and <sup> (cents).
        price = container.findAll("li", {"class":"price-current"})

        price = price[0]
        pricing_container = price.findAll("strong")

        pricing = pricing_container[0].text

        cents_container = price.findAll("sup")
        centing = cents_container[0].text




        print("brand: " + brand.partition(' ')[0])

        print("title: " + title)

        print("shipping: " + shipping)

        print("pricing: " + pricing)

        print("centing: " + centing)



        # Commas in the title become "|" and the thousands separator is
        # dropped from the price so they cannot split the hand-built CSV row.
        f.write(brand.partition(' ')[0] + "," + title.replace(",","|") + "," + shipping.partition(' ')[0] + "," + pricing.replace(",","")+centing + "\n")

    # NOTE(review): unconditional break - the while loop ends after one pass.
    break
     # link to next page

# NOTE(review): from here on the code sits at column 0, i.e. outside the
# while loop, and the `break` after `else:` is not indented, so this part
# is not valid Python as posted.
next_page = soup.find("button", {"title": "Next"})

if next_page:
    # NOTE(review): presumably the <button> has no href, so .get("href")
    # returns None - that would match the reported MissingSchema error
    # for URL 'None'; confirm against the page markup.
    my_url = next_page.get("href")
    page += 1
else:
break # exit `while True`
f.close()

这段代码可以成功抓取第一页,但在切换到下一页时出现问题。

我得到的错误是:"MissingSchema: Invalid URL 'None': No schema supplied. Perhaps you meant http://None?"(即 URL 为 None,缺少协议部分)

2 个答案:

答案 0 :(得分:1)

使用pandas

import requests
from bs4 import BeautifulSoup
import  pandas as pd

# Scrape every results page of the Newegg graphics-card category, collect one
# dict per product and save them all to product.csv through a pandas DataFrame.
page = 1
data = []

url = 'https://www.newegg.com/Desktop-Graphics-Cards/SubCategory/ID-48/Page-{}?PageSize=96'

while True:
    # format website url with updated page number
    my_url = url.format(page)
    r = requests.get(my_url)
    soup = BeautifulSoup(r.content, "html.parser")
    containers = soup.find_all("div", {"class": "item-container"})

    # Visit every container on the page (no early exit after the first one).
    for container in containers:
        title_link = container.find("a", {"class": "item-title"})
        shipping_item = container.find("li", {"class": "price-ship"})
        price = container.find("li", {"class": "price-current"})
        if title_link is None or shipping_item is None or price is None:
            # Container without the expected product markup: skip it
            # instead of failing on a missing element.
            continue

        title = title_link.text
        # The brand is the first word of the title link's text.
        brand = title.partition(' ')[0]
        shipping = shipping_item.text.strip()

        # The price is split across <strong> (whole part) and <sup> (cents);
        # either may be absent, in which case an empty string is stored.
        pricing_tag = price.find("strong")
        cents_tag = price.find("sup")
        pricing = pricing_tag.text if pricing_tag is not None else ""
        centing = cents_tag.text if cents_tag is not None else ""

        print("brand: " + brand)
        print("title: " + title)
        print("shipping: " + shipping)
        print("pricing: " + pricing)
        print("centing: " + centing)
        product = {"brand":brand,"title":title,"shipping":shipping,"pricing":pricing,"centing":centing}
        data.append(product)

    next_page = soup.find("button", {"title": "Next"})

    # Stop when the page is empty or there is no usable "Next" button.
    # NOTE(review): the last page presumably keeps the button but marks it
    # `disabled` - confirm against the live markup.
    if not containers or next_page is None or "disabled" in next_page.attrs:
        break
    page += 1

print(data)
df = pd.DataFrame(data)
print(df)
# it will save csv file in current project directory with product.csv file name
df.to_csv("product.csv")

抓取到的前几项产品详细信息:

[{'brand': 'ASUS ROG Strix GeForce RTX 2070 DirectX 12 ROG-STRIX-RTX2070-8G-GAMING 8GB 256-Bit GDDR6 PCI Express 3.0 HDCP Ready Video Card', 'title': 'ASUS ROG Strix GeForce RTX 2070 DirectX 12 ROG-STRIX-RTX2070-8G-GAMING 8GB 256-Bit GDDR6 PCI Express 3.0 HDCP Ready Video Card', 'shipping': 'Free Shipping', 'pricing': '509', 'centing': '.99'}, {'brand': 'ASUS ROG Strix GeForce RTX 2070 DirectX 12 ROG-STRIX-RTX2070-8G-GAMING 8GB 256-Bit GDDR6 PCI Express 3.0 HDCP Ready Video Card', 'title': 'ASUS ROG Strix GeForce RTX 2070 DirectX 12 ROG-STRIX-RTX2070-8G-GAMING 8GB 256-Bit GDDR6 PCI Express 3.0 HDCP Ready Video Card', 'shipping': 'Free Shipping', 'pricing': '509', 'centing': '.99'}, {'brand': 'ASUS ROG Strix GeForce RTX 2070 DirectX 12 ROG-STRIX-RTX2070-8G-GAMING 8GB 256-Bit GDDR6 PCI Express 3.0 HDCP Ready Video Card', 'title': 'ASUS ROG Strix GeForce RTX 2070 DirectX 12 ROG-STRIX-RTX2070-8G-GAMING 8GB 256-Bit GDDR6 PCI Express 3.0 HDCP Ready Video Card', 'shipping': 'Free Shipping', 'pricing': '509', 'centing': '.99'}, {'brand': 'ASUS ROG Strix GeForce RTX 2070 DirectX 12 ROG-STRIX-RTX2070-8G-GAMING 8GB 256-Bit GDDR6 PCI Express 3.0 HDCP Ready Video Card', 'title': 'ASUS ROG Strix GeForce RTX 2070 DirectX 12 ROG-STRIX-RTX2070-8G-GAMING 8GB 256-Bit GDDR6 PCI Express 3.0 HDCP Ready Video Card', 'shipping': 'Free Shipping', 'pricing': '509', 'centing': '.99'}, {'brand': 'ASUS ROG Strix GeForce RTX 2070 DirectX 12 ROG-STRIX-RTX2070-8G-GAMING 8GB 256-Bit GDDR6 PCI Express 3.0 HDCP Ready Video Card', 'title': 'ASUS ROG Strix GeForce RTX 2070 DirectX 12 ROG-STRIX-RTX2070-8G-GAMING 8GB 256-Bit GDDR6 PCI Express 3.0 HDCP Ready Video Card', 'shipping': 'Free Shipping', 'pricing': '509', 'centing': '.99'}, {'brand': 'ASUS ROG Strix GeForce RTX 2070 DirectX 12 ROG-STRIX-RTX2070-8G-GAMING 8GB 256-Bit GDDR6 PCI Express 3.0 HDCP Ready Video Card', 'title': 'ASUS ROG Strix GeForce RTX 2070 DirectX 12 ROG-STRIX-RTX2070-8G-GAMING 8GB 256-Bit GDDR6 PCI Express 3.0 HDCP Ready Video 
Card', 'shipping': 'Free Shipping', 'pricing': '509', 'centing': '.99'}]

如果尚未安装 pandas,请先安装它:

pip3 install pandas

答案 1 :(得分:1)

您可以使用 while 循环持续抓取,直到加载到 "Next"(下一页)按钮被禁用的页面为止:

from bs4 import BeautifulSoup as soup
import requests, csv, re
def get_products(d):
   """Build one CSV row from a product container.

   Args:
      d: a parsed element exposing ``find(name, attrs)`` (a BeautifulSoup
         tag for one ``item-container`` div).

   Returns:
      A five-item list ``[brand, title, pricing, centing, shipping]`` that
      lines up with the CSV header. Missing pieces are filled in rather than
      dropped: brand falls back to ``'N/A'``, every other field to ``''``.
   """
   price_item = d.find('li', {'class':'price-current'})
   price_parts = [] if price_item is None else [i.text for i in price_item.find_all(re.compile('strong|sup'))]
   # Always emit exactly two price columns so a missing or unusual price
   # cannot shift the shipping value into the wrong column.
   pricing, centing = (price_parts + ['', ''])[:2]

   # The brand name is the title attribute of the image inside the brand link;
   # the link, the image or the attribute may each be absent.
   brand_img = getattr(d.find('a', {'class':'item-brand'}), 'img', None)
   brand = 'N/A' if brand_img is None else brand_img.get('title', 'N/A')

   title_link = d.find('a', {'class':'item-title'})
   shipping_item = d.find('li', {'class':'price-ship'})
   # Shipping text is kept exactly as found in the page (not stripped).
   return [brand, getattr(title_link, 'text', ''), pricing, centing, getattr(shipping_item, 'text', '')]

# Walk the paginated category listing, turning every product container into a
# row via get_products, then write the header and all rows to
# graphics_cards.csv.
rows, count, header = [], 1, ['brand', 'title', 'pricing', 'centing',  'shipping']
while True:
   d = soup(requests.get(f'https://www.newegg.com/Desktop-Graphics-Cards/SubCategory/ID-48/Page-{count}?PageSize=96').text, 'html.parser')
   # Raw string: the original '\-' was an invalid escape in a normal string
   # literal; a plain '-' matches the same text.
   containers = d.find_all('div', {'class':re.compile(r'item-container')})
   rows.extend(get_products(i) for i in containers)
   next_button = d.find('button', {'title':'Next'})
   # Stop on an empty page, a missing Next button, or a disabled one.
   if not containers or next_button is None or 'disabled' in next_button.attrs:
      break
   count += 1

# newline='' lets the csv module control line endings itself (otherwise extra
# blank lines appear between rows on Windows).
with open('graphics_cards.csv', 'w', newline='') as f:
   write = csv.writer(f)
   write.writerows([header, *rows])

输出(前五个产品)

brand,title,pricing,centing,shipping
ASUS,ASUS ROG Strix GeForce RTX 2070 DirectX 12 ROG-STRIX-RTX2070-8G-GAMING 8GB 256-Bit GDDR6 PCI Express 3.0 HDCP Ready Video Card,509,.99,"
    Free Shipping
"
MSI,MSI Radeon RX 580 DirectX 12 RX 580 ARMOR 8G OC 8GB 256-Bit GDDR5 PCI Express x16 HDCP Ready CrossFireX Support Video Card,194,.99,"
    Free Shipping
"
GIGABYTE,"GIGABYTE GeForce GTX 1660 GAMING OC 6G Graphics Card, 3 x WINDFORCE Fans, 6GB 192-Bit GDDR5, GV-N1660GAMING OC-6GD Video Card",229,.99,"
    Free Shipping
"
Sapphire Tech,SAPPHIRE NITRO+ Radeon RX Vega 64 DirectX 12 100410NT+SR 8GB 2048-Bit HBM2 PCI Express 3.0 Video Card,399,.99,"
    Free Shipping
"
ASUS,ASUS Radeon RX 580 O4G Dual-fan OC Edition GDDR5 DP HDMI DVI VR Ready AMD Graphics Card (DUAL-RX580-O4G),189,.99,"
    Free Shipping
"