我想使用 Python 3.5 和 BeautifulSoup 从 https://www.arduinothai.com/category/2/arduino-compatible-board 抓取数据。我可以成功抓取第一页的数据,但无法抓取其他页面的数据。这是我的代码:
import re
import pandas as pd
import requests
import json
from bs4 import BeautifulSoup
from requests import get
# Scrape every listing page of the "arduino-compatible-board" category and
# collect the product rows into pandas DataFrames.
#
# NOTE(review): this is an excerpt. ConvertListToStr, the accumulator lists
# (ProductIDAll, StockOfProduct, ListOfProduct, Productname, Productprice,
# OldProductPrice, LinkProduct, CategoryProduct) and the per-product values
# (NameOfProduct, PriceOfProduct, OldProPricesStr, Link_URL,
# ProductCategory_jsonData) are not defined in the visible code; presumably
# they are set up in code omitted from the snippet -- confirm before running.
URL = 'https://www.arduinothai.com/category/2/arduino-compatible-board'
Request = requests.get(URL)
soups = BeautifulSoup(Request.text, 'lxml')

# Find all pages in the website.
# The second 'tsk-all' span is read as the total number of products.
Count_Next_Pages = soups.find_all('span', 'tsk-all')
TotalProduct = int(Count_Next_Pages[1].text)
TotalProductPerPage = 40
# Ceiling division: a partially filled last page still has to be fetched.
# round() would drop it (e.g. 48 products / 40 per page -> 1 page, not 2).
TotalPages = (TotalProduct + TotalProductPerPage - 1) // TotalProductPerPage

# Compiled once, outside the loops, because the pattern never changes.
pattren = r'[A-Z]{2}\d{5}|\d{5}|....\d{5}'
regex = re.compile(pattren)

# Page numbers are passed in the 'tskp' query parameter, starting at 1.
for count in range(1, TotalPages + 1):
    page_url = URL + '?tskp=' + str(count)
    Request_Data = requests.get(page_url)
    Soups_Data = BeautifulSoup(Request_Data.text, 'lxml')
    AllProduct = Soups_Data.find_all('div', class_='productDetail')

    for x in AllProduct:
        # The first <a> of a product carries a JSON payload in its 'gaeepd'
        # attribute; its "id" field is used to build the product page URL.
        AllProductDeatil = x.find('a').get("gaeepd")
        IDProductLink = json.loads(AllProductDeatil)["id"]

        # Scrape ProductID
        ProductID = x.find('span', 'code').get_text(strip=True)
        ProDuctIDResult = regex.findall(ProductID)
        ProductIDStr = ConvertListToStr(ProDuctIDResult)
        ProductIDAll.append(ProductIDStr)

        # Scrape Stock (requires one extra request per product page)
        URL_Prefix = requests.get('https://www.arduinothai.com/product/' + str(IDProductLink))
        SoupStock = BeautifulSoup(URL_Prefix.text, 'lxml')
        ChkStock = SoupStock.find('span', class_='num').text
        StockOfProduct.append(ChkStock)

        # Only products in these categories are kept in ListOfProduct.
        if ProductCategory_jsonData in ('Single Set', 'Triple Set', 'STM32'):
            ListOfProduct.append((
                ProductIDStr,
                NameOfProduct,
                PriceOfProduct,
                OldProPricesStr,
                ChkStock,
                Link_URL,
                ProductCategory_jsonData,
            ))

# Column-wise view built from the accumulator lists.
data_df = pd.DataFrame({
    'ProductID': ProductIDAll,
    'ProdcutName': Productname,
    'Productprice': Productprice,
    'OldProductPrice': OldProductPrice,
    'StockOfProduct': StockOfProduct,
    'Link': LinkProduct,
    'Category': CategoryProduct,
})

# Row-wise view of the filtered products.
# NOTE(review): the 'Discount' / 'Price' labels are paired with
# PriceOfProduct / OldProPricesStr respectively -- confirm this is intended.
df = pd.DataFrame(
    ListOfProduct,
    columns=['ProductID', 'ProductName', 'Discount', 'Price', 'Stock', 'Link', 'TypeOfProduct'],
)
# Show every row when the frame is displayed.
pd.set_option('display.max_rows', df.shape[0] + 1)
df
答案 0 :(得分:1)
只需对两个页面的网址分别运行抓取代码即可:
import re
import pandas as pd
import requests
import json
from bs4 import BeautifulSoup
from requests import get
# Fetch and parse each listing page in turn; the page number is passed in
# the 'tskp' query parameter.
for i in [1, 2]:
    URL = 'https://www.arduinothai.com/category/2/arduino-compatible-board?tskp=' + str(i)
    Request = requests.get(URL)
    soups = BeautifulSoup(Request.text, 'lxml')
    # Scrape the products of this page from `soups` here.