Scraping multiple pages of a website for data

Date: 2018-11-17 11:48:51

Tags: python-3.x web-crawler

Hi, I am trying to scrape data about couches from the website below. I have written a crawler; the code is attached. The crawler keeps returning the same 30 search results from the first page; it does not seem to be able to move on to the next page.

https://www.horchow.com/search.jsp?N=0&Ntt=couches#endecaDrivenSiloRefinements=0&personalizedPriorityProdId=&userConstrainedResults=true&refinements=&page=1&pageSize=120&sort=MAX_PROMO_PRICE%7C1&definitionPath=/nm/commerce/pagedef_rwd/etemplate/Search&onlineOnly=&updateFilter=false&allStoresInput=false&rwd=true&catalogId=&selectedRecentSize=&activeFavoriteSizesCount=0&activeInteraction=true
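
One thing worth checking: everything after the # in this URL (including page=1 and pageSize=120) is the URL fragment. A browser hands the fragment to the page's JavaScript, but HTTP clients such as requests strip it and never send it to the server, so the server sees an identical request for every "page". A minimal sketch illustrating this with the standard library (the shortened URL is mine):

    from urllib.parse import urlsplit

    url = ("https://www.horchow.com/search.jsp?N=0&Ntt=couches"
           "#endecaDrivenSiloRefinements=0&page=2&pageSize=120")

    parts = urlsplit(url)
    print(parts.query)     # 'N=0&Ntt=couches'  <- the only part the server sees
    print(parts.fragment)  # 'endecaDrivenSiloRefinements=0&page=2&pageSize=120'
                           #    handled client-side, never transmitted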

    ## Web scraper code to collect product images and their associated
    ## prices from e-commerce websites.
    # Inputs required:
    # 1) A CSV file listing the web-page links that contain the results
    #    for a particular query
    # 2) A file path under which the images and the product prices are
    #    to be stored

    import requests
    import csv
    from bs4 import BeautifulSoup as bs
    import urllib.request as urdown
    import pandas as pd
    import validators

    #r_url = re.compile(r"^https?:")
    # Modify this to the path where the downloaded product images and
    # prices are to be stored.
    data_folder = "/Users/adasgupta/Documents/Softwares/Pycharm/Data/"
    data_extn = ".jpg"
    data_extn2 = ".txt"
    data_prices = "prices_couches.csv"
    prod_prices_mat = pd.DataFrame()

    # Read the list of URLs. Modify this to the CSV file that contains
    # the URLs of all the results pages.
    with open('horchow_couches.csv') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        tot_count = 0  # modify this to start numbering the images elsewhere
        for row in csv_reader:
            #print(f'\t Reading {row[0]}')
            url_list = row[0]
            url_list = url_list.replace(u'\ufeff', '')  # strip a leading BOM
            page = requests.get(url_list, allow_redirects=False)

            count = 0
            if page.status_code == 200:
                soup = bs(page.content, 'html.parser')
                #home_cl = soup.find_all("div", {"class": "mobile-header"})
                home_a_cl = soup.find_all("a", {"class": "mobile-logo"})
                home_page = home_a_cl[0].get('href')

                list_cl = soup.find_all("li", {"class": "category-item"})
                print("No of results found = ", len(list_cl))
                for item in list_cl:
                    list_a_cl = item.find_all("a", {"class": "product-thumbnail-image"})
                    if len(list_a_cl) != 0:
                        page_link = home_page + list_a_cl[0].get('href')
                        print(page_link)

                        ## visit each product page to extract the necessary information
                        prod_page = requests.get(page_link)
                        if prod_page.status_code == 200:
                            soup2 = bs(prod_page.content, 'html.parser')
                            ## extract the price, e.g. "$1,234.00" -> 1234.0
                            prod_price = soup2.find(itemprop="price").get_text()
                            for ch in ('\t', '\n', '$', ','):
                                prod_price = prod_price.replace(ch, '')
                            prod_price = float(prod_price)
                            print(prod_price)
                            ## extract the image URL and the description
                            prod_grid1 = soup2.find_all("div", {"class": "hero-zoom-frame gutter-top-half mobile-grid-100 tablet-grid-100 grid-100 grid-parent line-item-data"})
                            prod_grid2 = prod_grid1[0].find_all("img", {"itemprop": "image"})
                            prod_im = prod_grid2[0].get("src")
                            prod_met_data = prod_grid1[0].find_all("div", {"itemprop": "description"})

                            im_no = str(1 + count + tot_count)
                            str_data = prod_met_data[0]

                            # write the product description to <im_no>.txt
                            with open(data_folder + im_no + data_extn2, 'w') as f:
                                print(str_data, file=f)

                            prod_prices_mat = prod_prices_mat.append(
                                [[prod_im, im_no, prod_price]], ignore_index=True)
                            # save the image; skip this product if the download fails
                            try:
                                urdown.urlretrieve(prod_im, data_folder + im_no + data_extn)
                            except Exception:
                                continue
                            count += 1
                        print(count)
            tot_count += count
            print(f'Parsed {tot_count}.')

    ## write the collected product prices to a CSV file
    prod_prices_mat.to_csv(data_folder + data_prices, sep=',', encoding='utf-8')
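
A quick way to confirm that every URL in the CSV really is being served the same content is to compare the product links extracted from two different "page" URLs; identical sets confirm the symptom. A small diagnostic sketch (the helper product_links and the shortened URLs are mine):

    import requests
    from bs4 import BeautifulSoup as bs

    def product_links(url):
        """Return the set of product hrefs found on one results page."""
        soup = bs(requests.get(url).content, 'html.parser')
        return {a.get('href')
                for li in soup.find_all("li", {"class": "category-item"})
                for a in li.find_all("a", {"class": "product-thumbnail-image"})}

    base = "https://www.horchow.com/search.jsp?N=0&Ntt=couches"
    # the fragment differs below, but requests never sends it to the server
    links_p1 = product_links(base + "#page=1&pageSize=120")
    links_p2 = product_links(base + "#page=2&pageSize=120")
    print(links_p1 == links_p2)  # True here would mean every "page" is page 1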

Basically, for every one of the successive-page URLs, I keep getting all the data from page 1.
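
For reference, a minimal sketch of requesting successive pages with the pagination values sent as real query parameters rather than fragment parameters; whether the server honours page and pageSize in the query string is an assumption that would need verifying against the site:

    import requests

    base_url = "https://www.horchow.com/search.jsp"

    for page_no in range(1, 5):
        # page/pageSize go in the query string, so they actually reach the server
        params = {"N": 0, "Ntt": "couches", "page": page_no, "pageSize": 120}
        resp = requests.get(base_url, params=params)
        print(resp.url, resp.status_code)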

0 Answers:

No answers yet.