AJAX pagination web crawling in Python with Scrapy

Time: 2013-12-23 12:14:33

Tags: python ajax web-scraping beautifulsoup

I am scraping a website that uses AJAX pagination, with Python and Scrapy. I can crawl the first page.

But once the second page is loaded via AJAX, I am unable to extract the links to the other pages.

Please guide me on how to get the links to the AJAX-loaded pages. I use the BeautifulSoup library for parsing the pages.

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

class SitenameSpider(CrawlSpider):
    name = 'sitename'  # every Scrapy spider needs a name
    start_urls = []
    rules = (
        Rule(SgmlLinkExtractor(allow=('/trends/', '/keynote/')), callback='parse_item'),
    )

    def parse_item(self, response):
        print('Hi, crawling this page! %s' % response.url)

        # parse_product_page returns one tuple of extracted fields per product;
        # cat_code and date_created are assumed to be defined elsewhere
        extract_tuple_list = site_product_crawl.parse_product_page('site url')
        items = []

        for extract_tuple in extract_tuple_list:
            item = SitenameItem()
            item['site_id'] = extract_tuple[0]
            item['name'] = extract_tuple[1]
            item['price'] = extract_tuple[2]
            item['rating'] = extract_tuple[3]
            item['num_reviews'] = extract_tuple[4]
            item['category'] = cat_code
            item['url'] = response.url
            item['date'] = date_created
            item['description'] = extract_tuple[6]
            items.append(item)
        return items
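For reference, a minimal sketch of how the next AJAX page could be followed from inside parse_item once the paginated endpoint has been identified; the endpoint URL and its 'page' parameter below are hypothetical placeholders:

from scrapy.http import Request

class SitenameSpider(CrawlSpider):
    # ... name, start_urls and rules as above ...

    def parse_item(self, response):
        # ... yield the SitenameItem objects exactly as above ...

        # Hypothetical AJAX endpoint and 'page' parameter: copy the real URL
        # from the request the browser fires when the next page is loaded,
        # and add a stop condition (e.g. an empty page) so the crawl ends.
        next_page = response.meta.get('page', 1) + 1
        yield Request('http://www.example.com/ajax/products?page=%d' % next_page,
                      callback=self.parse_item,
                      meta={'page': next_page})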

from bs4 import BeautifulSoup as bsoup
import requests
import pprint
import re

def return_html(url):
    # Fetch the page and return its HTML, or None on any request error.
    try:
        return requests.get(url).text
    except Exception as e:
        print(e)
        return None

def parse_product_page(prod_url):
    # print(prod_url)
    soup = bsoup(return_html(prod_url))
    tuple_list = []
    avg_rating = None
    num_reviews = None
    prod_category = None
    prod_name = None
    prod_price = None
    prod_number = None

    prod_price = '0'  # the price is not available on the site, so it is set to 0
    # num_rev_div = soup.find('a', {'class': 'bv-rating-label bv-text-link bv-focusable', 'href': 'javascript:void(0)'})

    # Build a product number like 'part1_part2' from the URL path segments
    url_split_prod_number = prod_url.split('://')
    prod_number = url_split_prod_number[1].split('/')[1] + '_' + url_split_prod_number[1].split('/')[2].strip().encode('utf-8')
    print(prod_number)

    prod_description = soup.find('div', {'class': 'articleText'}).get_text().strip().replace('<br/>', '').encode('utf-8')
    print(prod_description)

    prod_name_div = soup.find('div', id='titleSection')
    prod_name = prod_name_div.h2.get_text().strip().encode('utf-8')
    print(prod_name)

    num_reviews = soup.find('span', itemprop='votes').get_text().strip().encode('utf-8').replace(',', '')
    avg_rating = soup.find('span', {'class': 'featuredstatbox'}).find('span', itemprop='rating').get_text().strip().encode('utf-8')

    # print(price_text)
    # if price_text != None:
    #     prod_price = price_text.get_text().strip().encode('utf-8').replace('$', '').replace(',', '').split('-')[0].strip()
    # print(prod_price)

    extract_tuple = (prod_number,
                     prod_name,
                     prod_price,
                     avg_rating,
                     num_reviews,
                     prod_category,
                     prod_description.replace('\n', '').replace("'", "''"))
    tuple_list.append(extract_tuple)

    pprint.pprint(tuple_list)
    return tuple_list

def main():
    parse_product_page('sitename')

if __name__ == '__main__':
    main()

1 Answer:

Answer 0 (score: 1)

Even a page that is loaded via AJAX has to send its request to some URL. If you use Chrome, look for that request in the Network tab of the Chrome Developer Tools.
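Once the XHR behind the pagination has been spotted in the Network tab, that request can usually be replayed directly. Below is a minimal sketch using the requests and BeautifulSoup libraries the question already uses; the endpoint URL and its 'page' query parameter are hypothetical placeholders for whatever the Network tab actually shows:

import requests
from bs4 import BeautifulSoup

# Hypothetical endpoint copied from the Network tab; the real one is
# whatever URL the XHR request in Chrome Developer Tools points to.
AJAX_URL = 'http://www.example.com/products/ajax'

def fetch_page(page):
    # Many AJAX endpoints take the page number as a query parameter and
    # return an HTML fragment that can be parsed like a normal page.
    response = requests.get(AJAX_URL, params={'page': page})
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')

def crawl_all_pages():
    page = 1
    while True:
        soup = fetch_page(page)
        links = [a['href'] for a in soup.find_all('a', href=True)]
        if not links:  # no more links: the pagination is exhausted
            break
        for link in links:
            print(link)
        page += 1

if __name__ == '__main__':
    crawl_all_pages()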