I am using Python and Scrapy to crawl a website with AJAX pagination. I can crawl the first page,
but once the second page has been loaded via AJAX, I cannot get the links to the other pages.
Please advise me on how to get the links from the AJAX-loaded pages. I am using the BeautifulSoup
library for parsing the pages.
# Assumed imports for the (old) Scrapy API used below; site_product_crawl,
# SitenameItem, cat_code and date_created are defined elsewhere in the project.
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

class SitenameSpider(CrawlSpider):
    name = 'sitename'  # every Scrapy spider needs a name; placeholder
    start_urls = []

    rules = (
        Rule(SgmlLinkExtractor(allow=('/trends/', '/keynote/')), callback='parse_item'),
    )

    def parse_item(self, response):
        print('Hi, crawling this page! %s' % response.url)
        extract_tuple_list = site_product_crawl.parse_product_page('site url')
        items = []
        for extract_tuple in extract_tuple_list:
            item = SitenameItem()
            item['site_id'] = extract_tuple[0]
            item['name'] = extract_tuple[1]
            item['price'] = extract_tuple[2]
            item['rating'] = extract_tuple[3]
            item['num_reviews'] = extract_tuple[4]
            item['category'] = cat_code
            item['url'] = response.url
            item['date'] = date_created
            item['description'] = extract_tuple[6]
            items.append(item)
        return items
from bs4 import BeautifulSoup as bsoup
import requests
import pprint

def return_html(url):
    try:
        return requests.get(url).text
    except Exception as e:
        print(e)
        return None
def parse_product_page(prod_url):
    html = return_html(prod_url)
    if html is None:
        return []
    soup = bsoup(html, 'html.parser')
    tuple_list = []

    avg_rating = None
    num_reviews = None
    prod_category = None
    prod_price = '0'  # the price is not available on the site, so it is set to 0

    # Build a product id like 'segment1_segment2' from the URL path.
    url_split_prod_number = prod_url.split('://')
    prod_number = (url_split_prod_number[1].split('/')[1] + '_' +
                   url_split_prod_number[1].split('/')[2].strip().encode('utf-8'))
    print(prod_number)

    prod_description = soup.find('div', {'class': 'articleText'}).get_text().strip().replace('<br/>', '').encode('utf-8')
    print(prod_description)

    prod_name_div = soup.find('div', id='titleSection')
    prod_name = prod_name_div.h2.get_text().strip().encode('utf-8')
    print(prod_name)

    num_reviews = soup.find('span', itemprop='votes').get_text().strip().encode('utf-8').replace(',', '')
    avg_rating = soup.find('span', {'class': 'featuredstatbox'}).find('span', itemprop='rating').get_text().strip().encode('utf-8')

    prod_tuple = (prod_number,
                  prod_name,
                  prod_price,
                  avg_rating,
                  num_reviews,
                  prod_category,
                  prod_description.replace('\n', '').replace("'", "''"))
    tuple_list.append(prod_tuple)
    pprint.pprint(tuple_list)
    return tuple_list
def main():
    parse_product_page('sitename')

if __name__ == '__main__':
    main()
Answer (score: 1)
Even a page loaded via AJAX has to send its request to some URL. If you use Chrome, look for that request in the Network tab of the Chrome Developer Tools.
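For example, if the Network tab shows that each pagination click issues a GET to something like http://example.com/trends?page=2 and receives an HTML fragment back, the spider can request those URLs itself instead of relying on the link extractor. The sketch below is a minimal illustration under those assumptions: the endpoint pattern, the page parameter, and the stop condition are all hypothetical; replace them with whatever request your site actually sends.

import scrapy

class AjaxPaginationSpider(scrapy.Spider):
    name = 'ajax_pagination'
    # Hypothetical AJAX endpoint; copy the real one from the Network tab.
    page_url = 'http://example.com/trends?page=%d'

    def start_requests(self):
        yield scrapy.Request(self.page_url % 1, callback=self.parse,
                             meta={'page': 1})

    def parse(self, response):
        # The AJAX response is ordinary HTML, so normal selectors work on it.
        links = response.css('a::attr(href)').extract()
        for href in links:
            # response.urljoin requires Scrapy >= 1.0; use urlparse.urljoin otherwise.
            yield scrapy.Request(response.urljoin(href), callback=self.parse_item)

        # Assumed stop condition: keep paging until a page returns no links.
        if links:
            next_page = response.meta['page'] + 1
            yield scrapy.Request(self.page_url % next_page, callback=self.parse,
                                 meta={'page': next_page})

    def parse_item(self, response):
        print('Hi, crawling this page! %s' % response.url)

If the endpoint returns JSON rather than HTML, parse response.text with the json module and build the next request from the fields the response contains.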