我有以下用于刮取数据的视图函数:
def results(request):
if request.method == 'POST':
form = RoomForm(request.POST)
if form.is_valid():
form_city = form.cleaned_data['city'].title()
form_country = form.cleaned_data['country'].title()
form_arrival_date = form.cleaned_data['arrival_date']
form_departure_date = form.cleaned_data['departure_date']
form_pages_to_scrape = form.cleaned_data['pages_to_scrape']
#launch scraper
scraper = AIRBNB_scraper(city=form_city, country=form_country, arrival_date=str(form_arrival_date), departure_date=str(form_departure_date))
scraped_dataframe = scraper.scrape_multiple_pages(last_page_selector_number=form_pages_to_scrape)
scraped_dataframe_sorted = scraped_dataframe.sort_values('prices')
print(scraped_dataframe_sorted)
#convert scraped dataframe into lists
prices = scraped_dataframe_sorted['prices'].tolist()
listings_links = scraped_dataframe_sorted['listings_links'].tolist()
listings_names = scraped_dataframe_sorted['listings_names'].tolist()
photo_links = scraped_dataframe_sorted['photo_links'].tolist()
dictionary = zip(prices, listings_links, listings_names, photo_links)
context = {'dictionary': dictionary}
return render(request, 'javascript/results.html', context)
在表单提交时,使用AJAX将发布请求发送到此函数:
var frm = $('#login-form');
frm.submit(function () {
$.ajax({
type: "POST",
url: "/results",
data: frm.serialize(),
success: function (data) {
$("#table").html(data);
$('#go_back').remove();
},
error: function(data) {
$("#table").html("Something went wrong!");
}
});
return false;
});
之后,已删除的数据将在表格所在的同一页面上显示为HTML表格。
问题是每次表单提交完成后,已删除项目的数量会翻倍。因此,例如,如果第一次按钮点击时的抓取项目数为16,则输出将为16,但在第二次运行时,它将为32,然后是64,依此类推。
这就像应用程序记得以前的表单提交,但我没有看到任何理由。我尝试了clearin - 在这个函数的最后 - 用于存储被抓取数据的pandas数据帧以及作为上下文传递的字典,但是无济于事。
表格是:
class RoomForm(forms.Form):
city = forms.CharField(max_length=100)
country = forms.CharField(max_length=100)
arrival_date = forms.DateField(widget=forms.DateInput(attrs=
{
'class':'datepicker'
}), required=False)
departure_date = forms.DateField(widget=forms.DateInput(attrs=
{
'class':'datepicker'
}), required=False)
pages_to_scrape = forms.IntegerField(label='Pages to scrape (max. 17)', min_value=0, max_value=17, widget=forms.NumberInput(attrs={'style':'width: 188px'}))
AIRBNB_scraper是:
import requests, bs4
import re
import pandas as pd
price_pattern = re.compile(r'\d*\s*?,?\s*?\d*\szł')
photo_link_pattern = re.compile(r'https.*\)')
prices = []
listings_links = []
photo_links = []
listings_names = []
class AIRBNB_scraper():
def __init__(self, city, country, accomodation_type='homes', arrival_date='2018-03-25', departure_date='2018-04-10'):
self.city = city
self.country = country
self.arrival_date = arrival_date
self.departure_date = departure_date
self.accomodation_type = accomodation_type
def make_soup(self, page_number):
url = 'https://www.airbnb.pl/s/'+ self.city +'--'+ self.country +'/'+ self.accomodation_type +'?query='+ self.city +'%2C%20'+ self.country +'&refinement_paths%5B%5D=%2F'+ self.accomodation_type +'&checkin=' + self.arrival_date + '&checkout=' + self.departure_date + '§ion_offset=' + str(page_number)
response = requests.get(url)
soup = bs4.BeautifulSoup(response.text, "html.parser")
return soup
def get_listings(self, page_number):
soup = self.make_soup(page_number)
listings = soup.select('._f21qs6')
number_of_listings = len(listings)
print('\n' + "Number of listings found: " + str(number_of_listings))
while number_of_listings != 18:
print('\n' + str(number_of_listings) + ' is not correct number of listings, it should be 18. Trying again now.')
soup = self.make_soup(page_number)
listings = soup.find_all('div', class_='_f21qs6')
number_of_listings = len(listings)
print('\n' + "All fine! The number of listings is: " + str(number_of_listings) + '. Starting scraping now')
return listings
def scrape_listings_per_page(self, page_number):
listings_to_scrape = self.get_listings(page_number)
for listing in listings_to_scrape:
#get price
price_container = listing.find_all('span', class_='_hylizj6')
price_search = re.search(price_pattern, str(price_container))
price = price_search.group()
#get listing_link
listing_link = 'https://www.airbnb.pl' + listing.find('a', class_='_15ns6vh')['href']
#get photo_link
photo_link_node = listing.find('div', class_="_1df8dftk")['style']
photo_link_search = re.search(photo_link_pattern, str(photo_link_node))
#~ if photo_link_search:
#~ print('Is regex match')
#~ else:
#~ print('No regex match')
photo_link_before_strip = photo_link_search.group()
photo_link = photo_link_before_strip[:-1] #remove ") at the end of link
#get listing_name
listing_name = listing.find('div', class_='_1rths372').text
#append lists
prices.append(price)
listings_links.append(listing_link)
photo_links.append(photo_link)
listings_names.append(listing_name)
def scrape_multiple_pages(self, last_page_selector_number):
last_page_selector_number += 1
for x in range(0, last_page_selector_number):#18
self.scrape_listings_per_page(x)
print('\n' + "INDEX OF PAGE BEING SCRAPED: " + str(x))
scraped_data = pd.DataFrame({'prices': prices,
'listings_links': listings_links,
'photo_links': photo_links,
'listings_names': listings_names})
return scraped_data
答案 0 :(得分:0)
您有模块级变量:prices
,listings_links
等。您可以在AIRBNB_scraper实例中附加到这些变量,但它们不属于该实例,并且会在调用之间保持不变。您应该将它们设为实例属性 - 在self.prices
方法中将它们定义为__init__
等。