How to fetch a web page that uses JavaScript?

Date: 2016-02-21 05:26:23

Tags: python selenium beautifulsoup web-crawler

I want to get the complete list of pdf links from the following page: 'http://www1.kiwoom.com/nkw.templateFrameSet.do?m=m0601010000'

The problem is that the page uses JavaScript internally to display the links, so I cannot get at the pdf links.

I actually tried to find various parsing approaches through Google searches, but I failed. Can you suggest the right way to solve this problem?

Here is the code I tried, without success:

import os
import urllib.request
from bs4 import BeautifulSoup

dir_output_mletter = 'output_mletter'  # output directory (was undefined in my script)

def crawl_kiwoom_mletter():
    if not os.path.exists(dir_output_mletter):
        os.makedirs(dir_output_mletter)

    #urlformat = 'https://www.kiwoom.com/nkw.template.do?m=m0601010101&s_menu=ML&s_sqno=4784'
    urlformat = 'http://www1.kiwoom.com/nkw.templateFrameSet.do?m=m0601010000'

    index = -1
    while True:
        index = index + 1
        # Note: urlformat has no '{}' placeholder, so format() leaves it unchanged.
        url = urlformat.format(index)
        print('processing {}...'.format(url))
        page = urllib.request.urlopen(url)

        soup = BeautifulSoup(page, 'lxml')

        #print_anchors(soup)

        # Only the frameset markup comes back; the pdf links that
        # JavaScript inserts never show up in this soup.
        print(soup.prettify())
        '''
        if browse_mbriefing_linkpages(soup) == False:
            break
        '''
        break

'''
https://impythonist.wordpress.com/2015/01/06/ultimate-guide-for-scraping-javascript-rendered-web-pages/
'''

import sys  
from PyQt4.QtGui import *  
from PyQt4.QtCore import *  
from PyQt4.QtWebKit import *  
from lxml import html 

class Render(QWebPage):
  """Load a url with QtWebKit so its JavaScript runs, then keep the frame."""
  def __init__(self, url):
    self.app = QApplication(sys.argv)
    QWebPage.__init__(self)
    self.loadFinished.connect(self._loadFinished)
    self.mainFrame().load(QUrl(url))
    self.app.exec_()  # block until _loadFinished quits the event loop

  def _loadFinished(self, result):
    self.frame = self.mainFrame()
    self.app.quit()


def crawl_kiwoom_mletter2():
    url = 'http://www1.kiwoom.com/nkw.templateFrameSet.do?m=m0601010000&source=&xdr='
    # This does the magic: loads the page and runs its JavaScript.
    r = Render(url)
    # result is a QString holding the rendered markup.
    result = r.frame.toHtml()

    print(result)

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

'''
http://stackoverflow.com/questions/28289699/python-web-scraping-for-javascript-generated-content
'''    
def crawl_kiwoom_mletter3():

    browser = webdriver.Firefox()
    url = 'http://www1.kiwoom.com/nkw.templateFrameSet.do?m=m0601010000'
    browser.get(url)
    res = browser.page_source

    print(res)

    browser.close()  # was driver.close(), but no 'driver' exists in this scope
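
The By/WebDriverWait imports above go unused, so here is a minimal sketch of putting them to work: wait for the JavaScript-rendered content instead of reading page_source immediately. It reuses the selenium imports above; the frame index and the 'a.file' selector are assumptions that may need adjusting after inspecting the page:

def crawl_kiwoom_mletter3_wait():
    browser = webdriver.Firefox()
    browser.get('http://www1.kiwoom.com/nkw.templateFrameSet.do?m=m0601010000')
    try:
        # The url loads a frameset; wait for a frame, then switch into the
        # first one (an assumption -- inspect the page to pick the right frame).
        WebDriverWait(browser, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, 'frame')))
        browser.switch_to.frame(0)
        # 'a.file' is an assumed selector for the pdf anchors; adjust as needed.
        WebDriverWait(browser, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'a.file')))
        for a in browser.find_elements(By.CSS_SELECTOR, 'a.file'):
            print(a.get_attribute('onclick'))
    finally:
        browser.quit()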

1 Answer:

Answer 0 (score: 0):

Try this code with Python 2 and BeautifulSoup 4:

from bs4 import BeautifulSoup
import re
import urllib, urllib2

def browse(page):
    url = 'http://bbn.kiwoom.com/bbn.marketConditionMLList.do'
    values = {
        'pageno': page,
        'basePath': '4',
        's_startdate': '20120822',
        's_enddate': '20200222',
    }

    # POST the form values to fetch one page of the pdf list.
    data = urllib.urlencode(values)
    req = urllib2.Request(url, data)
    response = urllib2.urlopen(req)
    soup = BeautifulSoup(response.read(), 'html.parser')

    # Every pdf link is an <a class="file"> whose onclick carries the
    # arguments needed for the download request.
    aTagAll = soup.find_all('a', {'class': 'file'})

    for aTag in aTagAll:
        downloadFile(getParams(aTag))

def getParams(aTag):
    # Extract the three openFile() arguments embedded in the onclick attribute.
    params = {}
    m = re.search(r"openFile\('([^']*)','([^']*)','([^']*)", aTag['onclick'])
    params['realname'] = m.group(1)
    params['filename'] = m.group(2)
    params['snMakedate'] = m.group(3)
    return params

def downloadFile(params):
    print 'Downloading : %s' % params['filename']
    url = 'http://bbn.kiwoom.com/bbn.fileDownload.do'
    values = {
        's_realname': params['realname'],
        's_filename': params['filename'],
        's_snMakedate': params['snMakedate'],
        'pageno': '8',
        'basePath': '4'
    }
    data = urllib.urlencode(values)

    req = urllib2.Request(url, data)
    try:
        response = urllib2.urlopen(req)
    except urllib2.HTTPError as e:
        print e.code
        print e.read()
        return  # response would be undefined past this point

    # Write the pdf in binary mode so its bytes survive intact.
    f = open(params['filename'], 'wb')
    f.write(response.read())
    f.close()

for pagenum in range(1, 59):  # range's end is exclusive; 59 covers pages 1-58
    browse(page=pagenum)

It fetches all the links from the pdf list pages and parses them with the getParams function.
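
For instance, feeding getParams a hand-made tag (the onclick values here are invented) shows what it returns:

from bs4 import BeautifulSoup

# Hypothetical anchor, shaped like the real ones on the list page.
html = """<a class="file" onclick="openFile('20160221_ml.pdf','Market Letter.pdf','20160221')">pdf</a>"""
sample = BeautifulSoup(html, 'html.parser').a
print getParams(sample)
# {'realname': '20160221_ml.pdf', 'filename': 'Market Letter.pdf', 'snMakedate': '20160221'}
# (key order may vary)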

The params, together with an extra basePath parameter, are sent to the download URL with the urllib2 Python module.

I suggest you add a delay between requests to avoid overloading the server.
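
For example, a short pause in the page loop (the 2-second figure is arbitrary):

import time

for pagenum in range(1, 59):
    browse(page=pagenum)
    time.sleep(2)  # pause between page requests to stay polite to the server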

Update:

It now browses pages 1 to 58 (the actual page count) and parses all the links.