如何使用Python和BeautifulSoup抓取下一页的数据

时间:2020-04-16 09:05:29

标签: python beautifulsoup

我想使用Python 3.5和BeautifulSoup从 https://www.arduinothai.com/category/2/arduino-compatible-board 抓取数据。我可以成功抓取第一页的数据,但无法抓取其他页面的数据。这是我的代码:

import re
import pandas as pd
import requests
import json
from bs4 import BeautifulSoup
from requests import get

# Scrapes the "arduino-compatible-board" category listing: downloads the
# category page, works out how many listing pages exist, requests each page via
# the ``tskp`` query parameter, then walks the ``productDetail`` blocks to
# collect product id and stock and build pandas DataFrames.
#
# Reviewer notes on the code as shown:
# - NOTE(review): the triple-quoted string opened on the "Find All page in
#   website" line is never closed in this snippet, so everything after it is
#   swallowed by the string literal; presumably the closing quotes were lost
#   when the code was pasted -- confirm.
# - TotalPages uses round(), so a partial last page holding fewer than half a
#   page of products is dropped (e.g. 90 products / 40 per page -> 2, although
#   3 pages are needed); math.ceil would count it.
# - AllProduct is reassigned on every pass of the page loop, and the
#   ``for x in AllProduct`` loop starts at column 0, i.e. after the page loop
#   has finished, so it only sees the products of the last page requested.
# - ConvertListToStr, ProductIDAll, StockOfProduct, ListOfProduct,
#   ProductCategory_jsonData, NameOfProduct, PriceOfProduct, OldProPricesStr,
#   Link_URL, Productname, Productprice, OldProductPrice, LinkProduct and
#   CategoryProduct are used but not defined in the visible code; presumably
#   they come from a part of the script that was omitted -- TODO confirm.
# - data_df is built inside the category ``if`` but never read afterwards in
#   the visible code, and df is rebuilt on every product iteration.
URL='https://www.arduinothai.com/category/2/arduino-compatible-board'
Request=requests.get(URL)
soups=BeautifulSoup(Request.text,'lxml')

''' Find All page in website
Count_Next_Pages = soups.find_all('span','tsk-all')
TotalProduct = int(Count_Next_Pages[1].text)
TotalProductPerPage = 40
TotalPages = (round(TotalProduct/TotalProductPerPage))

count=0
for i in range(int(TotalPages)):
    count+=1
    i='https://www.arduinothai.com/category/2/arduino-compatible-board?tskp='+str(count)
    Request_Data=requests.get(i)
    Soups_Data=BeautifulSoup(Request_Data.text,'lxml')
    AllProduct=Soups_Data.find_all('div',class_='productDetail')

for x in AllProduct:
     AllProductDeatil = x.find('a').get("gaeepd") 
     IDProductLink = json.loads(AllProductDeatil)["id"]  

    #Scrape ProductID
     ProductID = x.find('span','code').get_text(strip=True)
     pattren = r'[A-Z]{2}\d{5}|\d{5}|....\d{5}'          
     regex = re.compile(pattren)
     ProDuctIDResult = regex.findall(ProductID)  
     ProductIDStr = ConvertListToStr(ProDuctIDResult)
     ProductIDAll.append(ProductIDStr)   

   #Scrape Stock    
     URL_Prefix =requests.get('https://www.arduinothai.com/product/'+str(IDProductLink))
     SoupStock = BeautifulSoup(URL_Prefix.text, 'lxml')   
     ChkStock = SoupStock.find('span', class_='num').text
     StockOfProduct.append(ChkStock)

     if((ProductCategory_jsonData==('Single Set')) or (ProductCategory_jsonData==('Triple Set')) or (ProductCategory_jsonData==('STM32'))):

         ListOfProduct.append((ProductIDStr, NameOfProduct, PriceOfProduct, OldProPricesStr, ChkStock, Link_URL, ProductCategory_jsonData))

         data_df = pd.DataFrame({
                'ProductID': ProductIDAll,
                'ProdcutName':Productname,
                'Productprice':Productprice,
                'OldProductPrice': OldProductPrice,
                'StockOfProduct': StockOfProduct,
                'Link': LinkProduct,
                'Category':CategoryProduct
         })

     df=pd.DataFrame(ListOfProduct, columns=['ProductID', 'ProductName','Discount','Price','Stock','Link','TypeOfProduct'])
     pd.set_option('display.max_rows', df.shape[0]+1)
df

1 个答案:

答案 0 :(得分:1)

只需对这两个页面的网址分别运行你的抓取代码:

import re
import pandas as pd
import requests
import json
from bs4 import BeautifulSoup
from requests import get
# Each listing page is addressed through the ``tskp`` query parameter, so every
# page has to be requested and parsed on its own inside the loop.
PAGE_URL = 'https://www.arduinothai.com/category/2/arduino-compatible-board?tskp='
for i in [1, 2]:
    URL = PAGE_URL + str(i)
    # Without a timeout requests waits indefinitely on an unresponsive server.
    Request = requests.get(URL, timeout=30)
    # Fail loudly on a 4xx/5xx response instead of parsing an error page as if
    # it were a product listing.
    Request.raise_for_status()
    soups = BeautifulSoup(Request.text, 'lxml')
    # your scrap here