刮擦视图功能会记住其先前的迭代

时间:2018-03-28 19:15:20

标签: python ajax django

我有以下用于刮取数据的视图函数:

def results(request):
    if request.method == 'POST':
        form = RoomForm(request.POST)

        if form.is_valid():
            form_city = form.cleaned_data['city'].title()
            form_country = form.cleaned_data['country'].title()
            form_arrival_date = form.cleaned_data['arrival_date']
            form_departure_date = form.cleaned_data['departure_date']
            form_pages_to_scrape = form.cleaned_data['pages_to_scrape']

    #launch scraper   
    scraper = AIRBNB_scraper(city=form_city, country=form_country, arrival_date=str(form_arrival_date), departure_date=str(form_departure_date))
    scraped_dataframe = scraper.scrape_multiple_pages(last_page_selector_number=form_pages_to_scrape)
    scraped_dataframe_sorted = scraped_dataframe.sort_values('prices')
    print(scraped_dataframe_sorted)

    #convert scraped dataframe into lists
    prices = scraped_dataframe_sorted['prices'].tolist()
    listings_links = scraped_dataframe_sorted['listings_links'].tolist()
    listings_names = scraped_dataframe_sorted['listings_names'].tolist()
    photo_links = scraped_dataframe_sorted['photo_links'].tolist()

    dictionary = zip(prices, listings_links, listings_names, photo_links)

    context = {'dictionary': dictionary}
    return render(request, 'javascript/results.html', context)

在表单提交时,使用AJAX将发布请求发送到此函数:

var frm = $('#login-form');
frm.submit(function () {
    $.ajax({
        type: "POST",
        url: "/results",
        data: frm.serialize(),
        success: function (data) {
            $("#table").html(data);
            $('#go_back').remove();
        },
        error: function(data) {
            $("#table").html("Something went wrong!");
        }
    });
    return false;
});

之后,已删除的数据将在表格所在的同一页面上显示为HTML表格。

问题是每次表单提交完成后,已删除项目的数量会翻倍。因此,例如,如果第一次按钮点击时的抓取项目数为16,则输出将为16,但在第二次运行时,它将为32,然后是64,依此类推。

这就像应用程序记得以前的表单提交,但我没有看到任何理由。我尝试了clearin - 在这个函数的最后 - 用于存储被抓取数据的pandas数据帧以及作为上下文传递的字典,但是无济于事。

表格是:

class RoomForm(forms.Form):
    city = forms.CharField(max_length=100)
    country = forms.CharField(max_length=100)
    arrival_date = forms.DateField(widget=forms.DateInput(attrs=
                                {
                                    'class':'datepicker'
                                }), required=False)
    departure_date = forms.DateField(widget=forms.DateInput(attrs=
                                {
                                    'class':'datepicker'
                                }), required=False)
    pages_to_scrape = forms.IntegerField(label='Pages to scrape (max. 17)', min_value=0, max_value=17, widget=forms.NumberInput(attrs={'style':'width: 188px'}))

AIRBNB_scraper是:

import requests, bs4
import re
import pandas as pd

price_pattern = re.compile(r'\d*\s*?,?\s*?\d*\szł')
photo_link_pattern = re.compile(r'https.*\)')

prices = []
listings_links = []
photo_links = []
listings_names = []

class AIRBNB_scraper():

    def __init__(self, city, country, accomodation_type='homes', arrival_date='2018-03-25', departure_date='2018-04-10'):
        self.city = city
        self.country = country
        self.arrival_date = arrival_date
        self.departure_date = departure_date
        self.accomodation_type = accomodation_type

    def make_soup(self, page_number):
        url = 'https://www.airbnb.pl/s/'+ self.city +'--'+ self.country +'/'+ self.accomodation_type  +'?query='+ self.city +'%2C%20'+ self.country +'&refinement_paths%5B%5D=%2F'+ self.accomodation_type  +'&checkin=' + self.arrival_date + '&checkout=' + self.departure_date + '&section_offset=' + str(page_number)
        response = requests.get(url)  
        soup = bs4.BeautifulSoup(response.text, "html.parser")

        return soup

    def get_listings(self, page_number):

        soup = self.make_soup(page_number)
        listings = soup.select('._f21qs6')
        number_of_listings = len(listings)
        print('\n' + "Number of listings found: " + str(number_of_listings))

        while number_of_listings != 18:
            print('\n' + str(number_of_listings) + ' is not correct number of listings, it should be 18. Trying again now.')
            soup = self.make_soup(page_number)
            listings = soup.find_all('div', class_='_f21qs6')
            number_of_listings = len(listings)

        print('\n' + "All fine! The number of listings is: " + str(number_of_listings) + '. Starting scraping now')

        return listings

    def scrape_listings_per_page(self, page_number):

        listings_to_scrape = self.get_listings(page_number)

        for listing in listings_to_scrape:

            #get price
            price_container = listing.find_all('span', class_='_hylizj6')
            price_search = re.search(price_pattern, str(price_container))
            price = price_search.group()

            #get listing_link
            listing_link = 'https://www.airbnb.pl' + listing.find('a', class_='_15ns6vh')['href']

            #get photo_link
            photo_link_node = listing.find('div', class_="_1df8dftk")['style']
            photo_link_search = re.search(photo_link_pattern, str(photo_link_node))
            #~ if photo_link_search:
                #~ print('Is regex match')
            #~ else:
                #~ print('No regex match')
            photo_link_before_strip = photo_link_search.group()
            photo_link = photo_link_before_strip[:-1] #remove ") at the end of link 

            #get listing_name
            listing_name = listing.find('div', class_='_1rths372').text

            #append lists
            prices.append(price)
            listings_links.append(listing_link)
            photo_links.append(photo_link)
            listings_names.append(listing_name)

    def scrape_multiple_pages(self, last_page_selector_number):

        last_page_selector_number += 1
        for x in range(0, last_page_selector_number):#18
            self.scrape_listings_per_page(x)
            print('\n' + "INDEX OF PAGE BEING SCRAPED: " + str(x))
            scraped_data = pd.DataFrame({'prices': prices,
                                        'listings_links': listings_links,
                                        'photo_links': photo_links,
                                        'listings_names': listings_names})
        return scraped_data

1 个答案:

答案 0 :(得分:0)

您有模块级变量:priceslistings_links等。您可以在AIRBNB_scraper实例中附加到这些变量,但它们不属于该实例,并且会在调用之间保持不变。您应该将它们设为实例属性 - 在self.prices方法中将它们定义为__init__等。