I want to get the complete list of PDF links from this page: 'http://www1.kiwoom.com/nkw.templateFrameSet.do?m=m0601010000'
The problem is that the page builds the links with JavaScript internally, so I cannot get at the PDF links.
I tried various parsing approaches I found through Google searches, but all of them failed. Can you suggest the right way to solve this problem?
Here is the code I tried, without success:
import os
import urllib.request
from bs4 import BeautifulSoup

def crawle_kiwoom_mletter():
    # dir_output_mletter is defined elsewhere in my script
    if not os.path.exists(dir_output_mletter):
        os.makedirs(dir_output_mletter)
    #urlformat = 'https://www.kiwoom.com/nkw.template.do?m=m0601010101&s_menu=ML&s_sqno=4784'
    urlformat = 'http://www1.kiwoom.com/nkw.templateFrameSet.do?m=m0601010000'
    index = -1
    while True:
        index = index + 1
        url = urlformat.format(index)
        print('processing {}...'.format(url))
        page = urllib.request.urlopen(url)
        soup = BeautifulSoup(page, 'lxml')
        #print_anchors(soup)
        print(soup.prettify())
        '''
        if browse_mbriefing_linkpages(soup) == False:
            break
        '''
        break
(Second attempt, based on: https://impythonist.wordpress.com/2015/01/06/ultimate-guide-for-scraping-javascript-rendered-web-pages/)
import sys
from PyQt4.QtGui import *
from PyQt4.QtCore import *
from PyQt4.QtWebKit import *
from lxml import html

class Render(QWebPage):
    def __init__(self, url):
        self.app = QApplication(sys.argv)
        QWebPage.__init__(self)
        self.loadFinished.connect(self._loadFinished)
        self.mainFrame().load(QUrl(url))
        self.app.exec_()

    def _loadFinished(self, result):
        self.frame = self.mainFrame()
        self.app.quit()
def crawl_kiwoom_mletter2():
    #url = 'http://www1.kiwoom.com/nkw.templateFrameSet.do?m=m0601010000'
    url = 'http://www1.kiwoom.com/nkw.templateFrameSet.do?m=m0601010000&source=&xdr='
    # This does the magic. Loads everything.
    r = Render(url)
    # result is a QString.
    result = r.frame.toHtml()
    print(result)
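The idea was then to feed the rendered HTML into lxml (imported above) and collect the anchors. A minimal sketch of that step; extract_anchor_targets is a hypothetical helper of mine, and str() is used because toHtml() returns a QString:

def extract_anchor_targets(rendered_html):
    # Parse the rendered page and return each anchor's href (or onclick) target.
    tree = html.fromstring(str(rendered_html))
    return [a.get('href') or a.get('onclick') for a in tree.xpath('//a')]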
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
(Third attempt, based on: http://stackoverflow.com/questions/28289699/python-web-scraping-for-javascript-generated-content)
def crawl_kiwoom_mletter3():
    browser = webdriver.Firefox()
    url = 'http://www1.kiwoom.com/nkw.templateFrameSet.do?m=m0601010000'
    browser.get(url)
    res = browser.page_source
    print(res)
    browser.close()  # was driver.close(), but the variable is named browser
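One variation I considered but have not verified: since the URL loads a frameset, page_source only returns the outer frame definition, so the driver probably has to switch into a child frame first. A sketch, where the frame index and the 'a.file' selector are guesses:

def crawl_kiwoom_mletter4():
    browser = webdriver.Firefox()
    browser.get('http://www1.kiwoom.com/nkw.templateFrameSet.do?m=m0601010000')
    # Wait until the frameset's child frames exist, then enter the first one.
    WebDriverWait(browser, 10).until(
        EC.presence_of_element_located((By.TAG_NAME, 'frame')))
    browser.switch_to.frame(0)  # frame index is a guess
    for link in browser.find_elements(By.CSS_SELECTOR, 'a.file'):  # selector is a guess
        print(link.get_attribute('onclick'))
    browser.quit()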
Answer 0 (score: 0)
Try this code with Python 2 and BeautifulSoup 4:
from bs4 import BeautifulSoup
import re
import urllib, urllib2

def browse(page):
    url = 'http://bbn.kiwoom.com/bbn.marketConditionMLList.do'
    values = {
        'pageno': page,
        'basePath': '4',
        's_startdate': '20120822',
        's_enddate': '20200222',
    }
    data = urllib.urlencode(values)
    req = urllib2.Request(url, data)
    resp = urllib2.urlopen(req)
    soup = BeautifulSoup(resp.read(), 'html.parser')
    aTagAll = soup.find_all('a', {'class': 'file'})
    for aTag in aTagAll:
        downloadFile(getParams(aTag))

def getParams(aTag):
    # Each list entry is an anchor whose onclick looks like
    # openFile('realname','filename','snMakedate'); pull out the three arguments.
    params = {}
    m = re.search(r"openFile\('([^']*)','([^']*)','([^']*)", aTag['onclick'])
    params['realname'] = m.group(1)
    params['filename'] = m.group(2)
    params['snMakedate'] = m.group(3)
    return params

def downloadFile(params):
    print 'Downloading : %s' % params['filename']
    url = 'http://bbn.kiwoom.com/bbn.fileDownload.do'
    values = {
        's_realname': params['realname'],
        's_filename': params['filename'],
        's_snMakedate': params['snMakedate'],
        'pageno': '8',
        'basePath': '4'
    }
    data = urllib.urlencode(values)
    req = urllib2.Request(url, data)
    try:
        response = urllib2.urlopen(req)
    except urllib2.HTTPError as e:
        print e.code
        print e.read()
        return
    f = open(params['filename'], 'wb')  # 'wb': the PDFs are binary data
    f.write(response.read())
    f.close()

for pagenum in range(1, 59):  # pages 1..58; range's upper bound is exclusive
    browse(page=pagenum)
It fetches all the links from the PDF list pages and parses them with the getParams function.
The parsed params, plus an extra basePath parameter, are then sent to the download URL with the urllib2 module (a Python 3 version of the request is sketched below).
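Since your own code is Python 3, here is a rough equivalent of the POST using urllib.request; browse_py3 is just an illustrative name:

import urllib.parse
import urllib.request
from bs4 import BeautifulSoup

def browse_py3(page):
    # Same POST as browse() above, but with Python 3's urllib.
    url = 'http://bbn.kiwoom.com/bbn.marketConditionMLList.do'
    values = {
        'pageno': page,
        'basePath': '4',
        's_startdate': '20120822',
        's_enddate': '20200222',
    }
    data = urllib.parse.urlencode(values).encode('ascii')
    req = urllib.request.Request(url, data)
    with urllib.request.urlopen(req) as resp:
        return BeautifulSoup(resp.read(), 'html.parser')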
I suggest you add a delay between requests to avoid overloading the server (see the sketch at the end).
Update:
It now browses pages 1 to 58 (the actual page count) and parses all the links.
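Combining the page loop with the suggested delay, for example (the 2-second pause is an arbitrary choice):

import time

for pagenum in range(1, 59):  # pages 1..58
    browse(page=pagenum)
    time.sleep(2)  # pause between requests so the server is not hammered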