I suspect this is a quick fix for anyone with reasonable experience of web scraping with BeautifulSoup. I'm trying to pull data from a table, but for some reason it isn't giving me the expected output. Here is my code:
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import time
import json

def main():
    # BASE AND EXTENSIONS FOR EACH CURRENCY COLUMNWISE
    base_cols_url = 'https://uk.reuters.com/assets/'
    forex_cols = {}
    forex_cols['GBP'] = 'jsonCurrencyPairs?callback=drawCurrencyPairs&srcCurr=GBP'
    forex_cols['EUR'] = 'jsonCurrencyPairs?callback=drawCurrencyPairs&srcCurr=EUR'
    forex_cols['USD'] = 'jsonCurrencyPairs?callback=drawCurrencyPairs&srcCurr=USD'
    forex_cols['JPY'] = 'jsonCurrencyPairs?callback=drawCurrencyPairs&srcCurr=JPY'
    forex_cols['CHF'] = 'jsonCurrencyPairs?callback=drawCurrencyPairs&srcCurr=CHF'
    forex_cols['AUD'] = 'jsonCurrencyPairs?callback=drawCurrencyPairs&srcCurr=AUD'
    forex_cols['CAD'] = 'jsonCurrencyPairs?callback=drawCurrencyPairs&srcCurr=CAD'
    forex_cols['CNY'] = 'jsonCurrencyPairs?callback=drawCurrencyPairs&srcCurr=CNY'
    forex_cols['HKD'] = 'jsonCurrencyPairs?callback=drawCurrencyPairs&srcCurr=HKD'
    # loop through the pages
    for sym in forex_cols:
        print(sym)
        print(base_cols_url + forex_cols[sym])
        get_data_from_page(sym, base_cols_url + forex_cols[sym])

def get_data_from_page(SYMBOL, PAGE):
    browser = webdriver.PhantomJS()
    # PARSE THE HTML
    browser.get(PAGE)
    soup = BeautifulSoup(browser.page_source, "lxml")
    rows = soup.findAll('td')
    # PARSE ALL THE COLUMN DATA
    for r in rows:
        print(r)  # this prints nothing
    print(soup)  # this prints the page, but the markup is missing and replaced with '<tr><td><'
    return

if __name__ == '__main__':
    main()
If I load the page manually in Chrome, I can see the 'td' and 'tr' tags that I should be able to parse, but for some reason they don't print out. However, if I just print the whole soup object, the markup appears to be missing, which explains why print(r) returns nothing. But I don't know how to parse out the parts I need. (The data I'm after is what's shown in the table on the main web page: https://uk.reuters.com/business/currencies.)
Could anyone explain what is going on here? It looks like JSON-formatted data, but I've never really worked with JSON. When I try json.loads(soup) it says it can't load a soup object, and when I try json.loads(soup.text()) I get a ValueError: Expecting value: line 1 column 1 (char 0).
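For reference, fetching the same URL outside the browser shows the shape of the raw payload (a minimal sketch using requests rather than PhantomJS; given the callback=drawCurrencyPairs parameter in the URL, I'd expect the response to start with a function-call wrapper rather than plain JSON):

import requests

url = 'https://uk.reuters.com/assets/jsonCurrencyPairs?callback=drawCurrencyPairs&srcCurr=GBP'
raw = requests.get(url).text
# presumably starts with something like drawCurrencyPairs({...}) rather than {...}
print(raw[:300])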
I'd really appreciate any help getting this data parsed. Many thanks for reading!
Answer 0 (score: 0)
Well, after my failed attempts with json, I fell back on an admittedly crude string-parsing approach, but it gets the job done, in case anyone else wants to do something similar:
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import time
import json

def main():
    # BASE AND EXTENSIONS FOR EACH CURRENCY COLUMNWISE
    base_cols_url = 'https://uk.reuters.com/assets/'
    forex_cols = {}
    forex_cols['GBP'] = 'jsonCurrencyPairs?callback=drawCurrencyPairs&srcCurr=GBP'
    forex_cols['EUR'] = 'jsonCurrencyPairs?callback=drawCurrencyPairs&srcCurr=EUR'
    forex_cols['USD'] = 'jsonCurrencyPairs?callback=drawCurrencyPairs&srcCurr=USD'
    forex_cols['JPY'] = 'jsonCurrencyPairs?callback=drawCurrencyPairs&srcCurr=JPY'
    forex_cols['CHF'] = 'jsonCurrencyPairs?callback=drawCurrencyPairs&srcCurr=CHF'
    forex_cols['AUD'] = 'jsonCurrencyPairs?callback=drawCurrencyPairs&srcCurr=AUD'
    forex_cols['CAD'] = 'jsonCurrencyPairs?callback=drawCurrencyPairs&srcCurr=CAD'
    forex_cols['CNY'] = 'jsonCurrencyPairs?callback=drawCurrencyPairs&srcCurr=CNY'
    forex_cols['HKD'] = 'jsonCurrencyPairs?callback=drawCurrencyPairs&srcCurr=HKD'
    for sym in forex_cols:
        print(sym)
        print(base_cols_url + forex_cols[sym])
        get_data_from_page(sym, base_cols_url + forex_cols[sym])

def get_data_from_page(SYMBOL, PAGE):
    browser = webdriver.PhantomJS()
    # PARSE THE HTML
    browser.get(PAGE)
    soup = BeautifulSoup(browser.page_source, "lxml")
    rows = str(soup).split('"row"')
    # PARSE ALL THE COLUMN DATA
    for r in rows:
        # PARSE OUT VALUE COL
        try:
            print(r.split('</a></td><td>')[1].split('</td><td class=')[0])
        except IndexError:
            pass
        # PARSE OUT CURRENCY PAIR
        try:
            print(r.split('sparkchart?symbols=')[1].split('=X&')[0])
        except IndexError:
            pass
    return

if __name__ == '__main__':
    main()
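For completeness: the callback=drawCurrencyPairs parameter in the URL suggests the response is JSONP, i.e. a JSON object wrapped in a JavaScript function call, which would explain why json.loads chokes on the raw text (char 0 is the callback name, not a JSON value). Stripping the wrapper first should make it loadable; here is a minimal sketch of the idea (untested against the live endpoint, and the internal structure of the decoded object is an assumption):

import json
import requests

def get_jsonp_payload(url):
    raw = requests.get(url).text
    # strip the JSONP wrapper, e.g. drawCurrencyPairs({...}); -> {...}
    inner = raw[raw.index('(') + 1:raw.rindex(')')]
    return json.loads(inner)

data = get_jsonp_payload('https://uk.reuters.com/assets/jsonCurrencyPairs?callback=drawCurrencyPairs&srcCurr=GBP')
# the '<tr><td>' fragments seen in the soup suggest the table HTML is embedded
# as strings inside this object; those strings could then be passed to
# BeautifulSoup and searched for 'td'/'tr' tags as originally intended
print(data)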