我想抓取以下网页：
https://www.websupplies.gr/laptop#/pageSize=48&viewMode=list&orderBy=10&pageNumber=1
但是我始终只能获得前两页中每页前 12 个商品的 URL 链接，既得不到第 3 页的链接，也得不到全部链接。我使用的代码如下：
# Listing URLs for the laptop category; `caturl` takes the page number.
initial_url = 'https://www.websupplies.gr/laptop#/pageSize=48&viewMode=list&orderBy=10'
caturl = 'https://www.websupplies.gr/laptop#/pageSize=48&viewMode=list&orderBy=10&pageNumber={}'

# Product links gathered from every listing page. Created once, before any
# request is made, so that processing one page never discards the links
# collected from the pages before it.
final_links = []

# NOTE(review): everything after '#' in the URLs above is a URL fragment,
# which is normally handled by the browser and not sent in the HTTP request,
# so each of these requests likely returns the same server-rendered page
# regardless of pageSize/pageNumber -- confirm against the actual responses.
r = requests.get(initial_url)
if r.status_code == 200:
    Myhtml = r.text
    soup = BeautifulSoup(Myhtml, 'html.parser')
    # GETTING THE LAST PAGE
    # NOTE(review): the number is read from the "next-page" pager link, which
    # presumably points at the page after the current one rather than at the
    # final page -- verify against the pager markup.
    last_page = soup.find('div', class_='pager').find('li', class_='next-page').a['href'].split('=')[1]
    # GETTING THE PAGE URL LINKS
    dept_page_url = [caturl.format(i) for i in range(1, int(last_page) + 1)]
    time.sleep(2)
    for pageurl in dept_page_url:
        r = requests.get(pageurl)
        if r.status_code != 200:
            # Skip pages that could not be fetched; keep what we already have.
            continue
        Myhtml = r.text
        soup = BeautifulSoup(Myhtml, 'html.parser')
        # GETTING THE PRODUCT LINKS
        productlist = soup.find('div', attrs={'class': 'item-grid'})
        if productlist is None:
            # No product grid in this response, so there is nothing to collect.
            continue
        atagslist = productlist.findAll('a', href=True)
        links_with_text = []
        for a in atagslist:
            # Only anchors that carry visible text and a real target are kept.
            if a.text:
                mlink = a['href']
                if mlink != '#':
                    # `infodomain` is not defined in this snippet; it is
                    # expected to be provided elsewhere (presumably the site
                    # root used to make the href absolute).
                    links_with_text.append(infodomain + mlink)
        # DELETE DUPLICATES
        # dict.fromkeys drops repeats while keeping first-seen order.
        links_with_text = list(dict.fromkeys(links_with_text))
        # Accumulate this page's links into the overall result (previously the
        # per-page list was extended with itself, which only duplicated it).
        final_links.extend(links_with_text)

# The same link may be collected from more than one page; drop those repeats
# too, again preserving first-seen order.
final_links = list(dict.fromkeys(final_links))
如何才能获取所有的 URL 链接？
答案 0 :(得分:1)
您可以模拟该页面发出的 POST 请求，并以“是否还存在下一页”作为循环的退出条件：
import requests
from bs4 import BeautifulSoup as bs
# Request headers sent with every POST to the product-filter endpoint.
headers = {
    'user-agent': 'Mozilla/5.0',
    'content-type': 'application/json; charset=UTF-8',
    'authority': 'www.websupplies.gr',
    'x-requested-with': 'XMLHttpRequest'
}

# JSON request body, split around the page number so the invariant parts are
# built once instead of on every loop iteration. The adjacent string literals
# are concatenated by Python into one string with no added whitespace.
# NOTE(review): the ids below were presumably captured from the request the
# listing page itself sends; they may change when the shop's filters change.
PAYLOAD_HEAD = (
    '{"categoryId":"405","manufacturerId":"0","vendorId":"0",'
    '"priceRangeFilterModel7Spikes":{"CategoryId":"405","ManufacturerId":"0",'
    '"VendorId":"0","SelectedPriceRange":{},"MinPrice":"204","MaxPrice":"3850"},'
    '"specificationFiltersModel7Spikes":{"CategoryId":"405","ManufacturerId":"0",'
    '"VendorId":"0","SpecificationFilterGroups":['
    '{"Id":"658","FilterItems":['
    '{"Id":"4821","FilterItemState":"Unchecked"},'
    '{"Id":"1969","FilterItemState":"Unchecked"},'
    '{"Id":"4394","FilterItemState":"Unchecked"},'
    '{"Id":"1971","FilterItemState":"Unchecked"},'
    '{"Id":"5459","FilterItemState":"Unchecked"},'
    '{"Id":"1953","FilterItemState":"Unchecked"},'
    '{"Id":"1962","FilterItemState":"Unchecked"},'
    '{"Id":"1963","FilterItemState":"Unchecked"}]},'
    '{"Id":"900","FilterItems":['
    '{"Id":"2503","FilterItemState":"Unchecked"},'
    '{"Id":"2504","FilterItemState":"Unchecked"},'
    '{"Id":"2505","FilterItemState":"Unchecked"}]},'
    '{"Id":"944","FilterItems":['
    '{"Id":"2715","FilterItemState":"Unchecked"},'
    '{"Id":"2714","FilterItemState":"Unchecked"}]},'
    '{"Id":"980","FilterItems":['
    '{"Id":"2994","FilterItemState":"Unchecked"},'
    '{"Id":"2835","FilterItemState":"Unchecked"},'
    '{"Id":"2836","FilterItemState":"Unchecked"},'
    '{"Id":"4381","FilterItemState":"Unchecked"}]},'
    '{"Id":"988","FilterItems":['
    '{"Id":"2882","FilterItemState":"Unchecked"},'
    '{"Id":"2883","FilterItemState":"Unchecked"},'
    '{"Id":"2989","FilterItemState":"Unchecked"}]},'
    '{"Id":"901","FilterItems":['
    '{"Id":"2520","FilterItemState":"Unchecked"},'
    '{"Id":"2521","FilterItemState":"Unchecked"},'
    '{"Id":"2512","FilterItemState":"Unchecked"},'
    '{"Id":"2611","FilterItemState":"Unchecked"},'
    '{"Id":"2513","FilterItemState":"Unchecked"},'
    '{"Id":"5995","FilterItemState":"Unchecked"},'
    '{"Id":"2970","FilterItemState":"Unchecked"},'
    '{"Id":"2530","FilterItemState":"Unchecked"},'
    '{"Id":"5996","FilterItemState":"Unchecked"}]},'
    '{"Id":"986","FilterItems":['
    '{"Id":"2971","FilterItemState":"Unchecked"},'
    '{"Id":"2872","FilterItemState":"Unchecked"},'
    '{"Id":"2871","FilterItemState":"Unchecked"},'
    '{"Id":"4995","FilterItemState":"Unchecked"},'
    '{"Id":"5009","FilterItemState":"Unchecked"}]},'
    '{"Id":"761","FilterItems":['
    '{"Id":"4358","FilterItemState":"Unchecked"},'
    '{"Id":"4359","FilterItemState":"Unchecked"},'
    '{"Id":"4361","FilterItemState":"Unchecked"},'
    '{"Id":"5460","FilterItemState":"Unchecked"},'
    '{"Id":"4362","FilterItemState":"Unchecked"},'
    '{"Id":"4822","FilterItemState":"Unchecked"},'
    '{"Id":"4371","FilterItemState":"Unchecked"}]},'
    '{"Id":"917","FilterItems":['
    '{"Id":"4826","FilterItemState":"Unchecked"},'
    '{"Id":"4825","FilterItemState":"Unchecked"},'
    '{"Id":"5357","FilterItemState":"Unchecked"},'
    '{"Id":"4827","FilterItemState":"Unchecked"},'
    '{"Id":"5345","FilterItemState":"Unchecked"},'
    '{"Id":"4828","FilterItemState":"Unchecked"}]},'
    '{"Id":"911","FilterItems":['
    '{"Id":"4843","FilterItemState":"Unchecked"},'
    '{"Id":"4845","FilterItemState":"Unchecked"},'
    '{"Id":"4850","FilterItemState":"Unchecked"},'
    '{"Id":"4851","FilterItemState":"Unchecked"},'
    '{"Id":"5891","FilterItemState":"Unchecked"},'
    '{"Id":"5892","FilterItemState":"Unchecked"},'
    '{"Id":"5291","FilterItemState":"Unchecked"},'
    '{"Id":"6011","FilterItemState":"Unchecked"},'
    '{"Id":"6552","FilterItemState":"Unchecked"},'
    '{"Id":"6949","FilterItemState":"Unchecked"}]}]},'
    '"attributeFiltersModel7Spikes":null,'
    '"manufacturerFiltersModel7Spikes":{"CategoryId":"405","ManufacturerFilterItems":['
    '{"Id":"268","FilterItemState":"Unchecked"},'
    '{"Id":"63","FilterItemState":"Unchecked"},'
    '{"Id":"191","FilterItemState":"Unchecked"},'
    '{"Id":"9","FilterItemState":"Unchecked"},'
    '{"Id":"330","FilterItemState":"Unchecked"},'
    '{"Id":"5","FilterItemState":"Unchecked"}]},'
    '"vendorFiltersModel7Spikes":null,'
    '"pageNumber":"'
)
PAYLOAD_TAIL = (
    '","orderby":"10","viewmode":"list","pagesize":"48","queryString":"",'
    '"shouldNotStartFromFirstPage":true,"onSaleFilterModel":null,"keyword":"",'
    '"searchCategoryId":"0","searchManufacturerId":"0","priceFrom":"","priceTo":"",'
    '"includeSubcategories":"False","searchInProductDescriptions":"False",'
    '"advancedSearch":"False","isOnSearchPage":"False"}'
)

# One inner list of product hrefs per fetched page.
links = []
page = 1

# A single Session is reused for all requests and closed when the block exits.
with requests.Session() as s:
    while True:
        # Only the page number varies between requests.
        data = PAYLOAD_HEAD + str(page) + PAYLOAD_TAIL
        r = s.post('https://www.websupplies.gr/getFilteredProducts', headers=headers, data=data)
        # Fail loudly on an HTTP error instead of parsing the error page,
        # which would look like an empty final page and end the loop silently.
        r.raise_for_status()
        soup = bs(r.content, 'lxml')
        links.append([item['href'] for item in soup.select('.product-title a')])
        page += 1
        # Exit condition: the response no longer contains a "next page" element.
        if soup.select_one('.next-page') is None:
            break

# Prefix each collected href with the site root; building a set also removes
# any link that was collected more than once.
base = 'https://www.websupplies.gr'
final_list = {base + item for i in links for item in i}