无法使用python网页抓取包含分页和产品的网格布局页面

时间:2019-08-01 11:24:58

标签: python python-3.x

我想抓取以下网页:

https://www.websupplies.gr/laptop#/pageSize=48&viewMode=list&orderBy=10&pageNumber=1

但是我只能获取到一部分URL链接:仅有前两页中每页的前12个,第3页的链接没有获取到,因此得不到全部链接。我使用了以下代码:

# Landing URL of the laptop category (48 items per page, list view, sort order 10).
initial_url = 'https://www.websupplies.gr/laptop#/pageSize=48&viewMode=list&orderBy=10'

# Per-page URL template: the landing URL plus a page-number placeholder
# that is filled in later with str.format().
caturl = initial_url + '&pageNumber={}'

    # Fetch the category landing page, work out how many pages to visit, then
    # fetch each page and collect the product links found in its item grid.
    #
    # NOTE(review): initial_url and caturl carry their paging options after a
    # '#'. That part of a URL is a fragment, which HTTP clients normally do not
    # send to the server, so every request below presumably returns the same
    # default page - confirm against the site.
    r = requests.get(initial_url)        

    if r.status_code == 200:

        Myhtml = r.text        
        soup = BeautifulSoup(Myhtml, 'html.parser')        
        # Read the page count from the pager: take the href of the link inside
        # the 'next-page' <li> and keep the text after its first '='.
        # NOTE(review): this reads the *next*-page link, so it looks like it
        # yields 2 on the first page rather than the real last page - confirm.
        last_page = soup.find('div', class_='pager').find('li', class_='next-page').a['href'].split('=')[1]
    # Build one URL per page, 1..last_page inclusive.
    # last_page is only assigned when the request above returned 200; on any
    # other status this line raises NameError.
    dept_page_url = [caturl.format(i) for i in range(1, int(last_page)+1)]

    # Single 2-second pause before the page loop (not between pages).
    time.sleep(2)      

    for pageurl in dept_page_url:        

        r = requests.get(pageurl)

        if r.status_code == 200:

            Myhtml = r.text        
            soup = BeautifulSoup(Myhtml, 'html.parser')
            # Locate the product grid and every anchor in it that has an href.
            productlist = soup.find('div',attrs={'class':'item-grid'})    

            atagslist = productlist.findAll('a', href=True)

            # Both lists are re-created for every page, so nothing collected
            # on an earlier page survives; final_links is never used again.
            links_with_text = []
            final_links = []
            for a in atagslist:
                # Keep only anchors that have visible text and a real target.
                if a.text: 
                    mlink = a['href']
                    if mlink !='#':
                        # infodomain is not defined in the visible code;
                        # presumably the site's base URL - verify.
                        links_with_text.append(infodomain+mlink)
                # Drop duplicates while keeping first-seen order; this runs
                # once per anchor because it sits inside the loop.
                links_with_text = list(dict.fromkeys(links_with_text))

        # NOTE(review): this extends the list with itself, doubling this page's
        # links; presumably an accumulator such as final_links was intended.
        # It also runs when the status was not 200.
        links_with_text.extend(links_with_text)

如何才能获取所有的URL链接?

1 个答案:

答案 0 :(得分:1)

您可以模拟该页面发出的POST请求,并以响应中是否还存在"下一页"作为循环的退出条件:

import requests
from bs4 import BeautifulSoup as bs
# Request headers sent with every POST: a browser-like user agent, a JSON
# content type, and the marker that identifies the call as an XMLHttpRequest.
headers = {
    "user-agent": "Mozilla/5.0",
    "content-type": "application/json; charset=UTF-8",
    "authority": "www.websupplies.gr",
    "x-requested-with": "XMLHttpRequest",
}

def _filter_items(item_ids):
    """Return one unchecked filter-item entry per id, in the given order."""
    return [{"Id": item_id, "FilterItemState": "Unchecked"} for item_id in item_ids]


def _build_payload(page):
    """Return the JSON body of the getFilteredProducts request for *page*.

    The payload mirrors the request the category page itself sends, with
    every filter left unchecked. It is serialised without whitespace so the
    bytes match the hand-written literal this replaces.
    """
    import json  # local import keeps this snippet self-contained

    category_id = "405"
    # Specification filter groups: group id -> ids of the items in that group.
    specification_groups = {
        "658": ["4821", "1969", "4394", "1971", "5459", "1953", "1962", "1963"],
        "900": ["2503", "2504", "2505"],
        "944": ["2715", "2714"],
        "980": ["2994", "2835", "2836", "4381"],
        "988": ["2882", "2883", "2989"],
        "901": ["2520", "2521", "2512", "2611", "2513", "5995", "2970", "2530", "5996"],
        "986": ["2971", "2872", "2871", "4995", "5009"],
        "761": ["4358", "4359", "4361", "5460", "4362", "4822", "4371"],
        "917": ["4826", "4825", "5357", "4827", "5345", "4828"],
        "911": ["4843", "4845", "4850", "4851", "5891", "5892", "5291", "6011", "6552", "6949"],
    }
    manufacturer_ids = ["268", "63", "191", "9", "330", "5"]

    payload = {
        "categoryId": category_id,
        "manufacturerId": "0",
        "vendorId": "0",
        "priceRangeFilterModel7Spikes": {
            "CategoryId": category_id,
            "ManufacturerId": "0",
            "VendorId": "0",
            "SelectedPriceRange": {},
            "MinPrice": "204",
            "MaxPrice": "3850",
        },
        "specificationFiltersModel7Spikes": {
            "CategoryId": category_id,
            "ManufacturerId": "0",
            "VendorId": "0",
            "SpecificationFilterGroups": [
                {"Id": group_id, "FilterItems": _filter_items(item_ids)}
                for group_id, item_ids in specification_groups.items()
            ],
        },
        "attributeFiltersModel7Spikes": None,
        "manufacturerFiltersModel7Spikes": {
            "CategoryId": category_id,
            "ManufacturerFilterItems": _filter_items(manufacturer_ids),
        },
        "vendorFiltersModel7Spikes": None,
        # The page number is sent as a string, like every other scalar here.
        "pageNumber": str(page),
        "orderby": "10",
        "viewmode": "list",
        "pagesize": "48",
        "queryString": "",
        "shouldNotStartFromFirstPage": True,
        "onSaleFilterModel": None,
        "keyword": "",
        "searchCategoryId": "0",
        "searchManufacturerId": "0",
        "priceFrom": "",
        "priceTo": "",
        "includeSubcategories": "False",
        "searchInProductDescriptions": "False",
        "advancedSearch": "False",
        "isOnSearchPage": "False",
    }
    return json.dumps(payload, separators=(",", ":"))


# One list of product hrefs per fetched page.
links = []
page = 1
with requests.Session() as s:
    while True:
        r = s.post('https://www.websupplies.gr/getFilteredProducts', headers=headers, data=_build_payload(page))
        soup = bs(r.content, 'lxml')
        links.append([item['href'] for item in soup.select('.product-title a')])
        page += 1
        # Stop once the response no longer contains a "next page" element.
        if soup.select_one('.next-page') is None:
            break
base = 'https://www.websupplies.gr'
# Flatten the per-page lists into a set of absolute URLs (duplicates collapse).
final_list = {base + href for page_links in links for href in page_links}