这是早期工作遗留下来的旧代码,当时我在网络抓取方面经验不足。这段代码最近开始出错。要抓取的表格位于下面的链接中。
以下是完整的代码以及出错的那一行,这段代码之前一直可以正常工作:
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from nsepy import get_history
from datetime import date
from datetime import datetime
# Download the NSE option-chain page, collect the header labels of the
# table whose id is "octable", and create a DataFrame shaped after it.
# NOTE(review): indentation was lost in this paste, so the nesting of the
# loops and of the try/except below has to be inferred - confirm before running.
Base_url =("https://www.nseindia.com/live_market/dynaContent/"+
"live_watch/option_chain/optionKeys.jsp?symbolCode=2541&symbol=ITC&"+
"symbol=UBL&instrument=OPTSTK&date=-&segmentLink=17&segmentLink=17")
page = requests.get(Base_url)
# Bare expressions: the values are evaluated but not stored or checked.
page.status_code
page.content
soup = BeautifulSoup(page.content, 'html.parser')
# Dumps the whole parsed document to stdout.
print(soup.prettify())
#Added this code for checking the underlying value
table_1 =soup.find_all(style="float:right; font-size:1.2em;")
for table in table_1:
# Keeps the <span> elements of the matching element; not referenced again
# in this snippet.
underlysingscript = table.select('span')
# table_it is not referenced again in this snippet.
table_it = soup.find_all(class_="opttbldata")
table_cls_1 = soup.find_all(id="octable")
col_list = []
# The code given below will pull the headers of the Option Chain table
for mytable in table_cls_1:
table_head = mytable.find('thead')
try:
rows = table_head.find_all('tr')
for tr in rows:
cols = tr.find_all('th')
for th in cols:
er = th.text
# Encoding to UTF-8 and decoding straight back yields the same text.
ee = er.encode('utf8')
ee = str(ee, 'utf-8')
col_list.append(ee)
# Bare except: any exception raised in the try body is reported as
# "no thead" (presumably meant for a missing <thead> - confirm).
except:
print ("no thead")
# Drop the group headings, the 'Chart' labels and non-breaking-space labels.
col_list_fnl = [e for e in col_list if e not in ('CALLS','PUTS','Chart','\xc2\xa0','\xa0')]
table_cls_2 = soup.find(id="octable")
# all_trs and req_row hold the same <tr> list; only req_row is used below.
all_trs = table_cls_2.find_all('tr')
req_row = table_cls_2.find_all('tr')
# Only index= and columns= are supplied and no data, so every cell of
# new_table is NaN until values are assigned to it.
new_table = pd.DataFrame(index=range(0,len(req_row)-3) , columns=col_list_fnl)
这一行得到的数据全部变成了 NaN。原因是什么,问题出在哪里?能否把这段代码写得更好,以避免这个问题?
答案 0 :(得分:1)
您创建的是一个空的数据框。创建数据框时,您通过 index= 指定了行数,通过 columns= 指定了列数,但从未向其中放入任何值/数据。因此,得到的是一个只有行和列、每个单元格的值都是 NaN 的数据框。
另外我想说,您为了获取这张表格费了不少功夫,其实 pandas 可以替您完成这项工作,之后只需对数据框进行处理,使其符合您的需要即可:
# Load the NSE option-chain table straight into a DataFrame with
# pandas.read_html instead of walking the HTML by hand.
from io import StringIO

import pandas as pd
import requests

Base_url =("https://www.nseindia.com/live_market/dynaContent/"+
"live_watch/option_chain/optionKeys.jsp?symbolCode=2541&symbol=ITC&"+
"symbol=UBL&instrument=OPTSTK&date=-&segmentLink=17&segmentLink=17")
# A browser-like User-Agent is sent with the request.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"}

# The timeout keeps the script from hanging forever on a stalled connection.
response = requests.get(Base_url, headers=headers, timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error page.
response.raise_for_status()

# read_html expects a path, URL or file-like object; passing the literal
# HTML string is deprecated since pandas 2.1, so wrap it in StringIO.
tables = pd.read_html(StringIO(response.text))

# The second table parsed from the page is used: its row 1 carries the
# column labels and the data rows start at row 2.
option_chain = tables[1]
df = option_chain[2:].reset_index(drop=True)
df.columns = option_chain.iloc[1]
# Drops every column labelled 'Chart'.
df = df.drop('Chart', axis=1)
输出:
print (df)
1 OI Chng in OI Volume IV ... IV Volume Chng in OI OI
0 - - - - ... - - - -
1 - - - - ... - - - -
2 - - - - ... - - - -
3 - - - - ... - - - -
4 - - - - ... - - - -
5 - - - - ... - - - -
6 - - - - ... - - - -
7 4800 - - - ... 43.15 93 57600 141600
8 - - - - ... 37.31 20 26400 48000
9 2400 - 1 62.34 ... 33.31 996 770400 3427200
10 4800 - 1 35.32 ... 30.48 705 621600 1003200
11 88800 31200 47 32.52 ... 28.13 4476 1572000 2510400
12 184800 117600 116 30.78 ... 25.50 3284 873600 1545600
13 3165600 2085600 2481 29.05 ... 24.67 7621 979200 2769600
14 3184800 2265600 3308 29.73 ... 24.77 4472 424800 1497600
15 7579200 5167200 10875 32.14 ... 25.98 4858 1142400 2409600
16 5640000 3117600 9499 35.38 ... 33.36 1029 96000 302400
17 10680000 4572000 16342 38.06 ... 46.34 128 -4800 182400
18 2848800 1161600 3523 41.92 ... 47.42 4 -2400 24000
19 6751200 2724000 5638 44.97 ... - 1 - 45600
20 405600 -26400 562 48.56 ... - - - 2400
21 1077600 319200 1113 47.61 ... - 21 -2400 249600
22 21600 4800 17 54.15 ... - - - 4800
23 252000 170400 196 55.95 ... - 2 -4800 57600
24 - - - - ... - - - -
25 - - 1 60.47 ... - - - 7200
26 - - - - ... - - - -
27 4800 - - - ... - - - 40800
28 - - - - ... - - - -
29 - - - - ... - - - -
30 - - - - ... - - - -
31 - - - - ... - 7 -14400 4800
32 41896800 NaN 53720 NaN ... NaN NaN NaN NaN
[33 rows x 21 columns]
BeautifulSoup选项:
我不得不修改了部分代码,以确保列名的数量与每行的单元格数量相等。我也没有对数据做任何清理(例如替换"-"、删除空列或空行、重置索引等),但希望这能给您一些帮助:
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
#from nsepy import get_history
from datetime import date
from datetime import datetime
# Scrape the NSE option-chain table (id="octable") into a DataFrame with
# BeautifulSoup. Cell values are kept as raw text; no clean-up is done.
Base_url = ("https://www.nseindia.com/live_market/dynaContent/"+
            "live_watch/option_chain/optionKeys.jsp?symbolCode=2541&symbol=ITC&"+
            "symbol=UBL&instrument=OPTSTK&date=-&segmentLink=17&segmentLink=17")

# The timeout keeps the script from hanging forever on a stalled connection.
page = requests.get(Base_url, timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error page.
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')
# Debug aid: dumps the whole parsed document to stdout.
print(soup.prettify())

#Added this code for checking the underlying value
table_1 = soup.find_all(style="float:right; font-size:1.2em;")
for table in table_1:
    underlysingscript = table.select('span')
table_it = soup.find_all(class_="opttbldata")

# The code given below will pull the headers of the Option Chain table
table_cls_1 = soup.find_all(id="octable")
col_list = []
for mytable in table_cls_1:
    table_head = mytable.find('thead')
    if table_head is None:
        # find() returns None when the table has no <thead>; report it
        # explicitly instead of hiding every error behind a bare except.
        print ("no thead")
        continue
    for tr in table_head.find_all('tr'):
        # th.text is already a str, so no encode/decode round-trip is needed.
        col_list.extend(th.text for th in tr.find_all('th'))

# Drop the group headings and the non-breaking-space labels. 'Chart' is
# kept so the number of labels matches the number of cells per data row.
col_list_fnl = [e for e in col_list if e not in ('CALLS','PUTS','\xc2\xa0','\xa0')]

table_cls_2 = soup.find(id="octable")
req_row = table_cls_2.find_all('tr')

# One single-row frame per <tr>, concatenated once. DataFrame.append was
# removed in pandas 2.0, and appending inside a loop copies the whole
# frame on every iteration. Rows without <td> cells become all-NaN rows,
# and every row keeps index label 0 because the index is not reset.
row_frames = [
    pd.DataFrame([[cell.text for cell in row.find_all('td')]])
    for row in req_row
]
new_table = pd.concat(row_frames) if row_frames else pd.DataFrame()
new_table.columns = col_list_fnl
输出:
print (new_table)
Chart OI Chng in OI ... Chng in OI OI Chart
0 NaN NaN NaN ... NaN NaN NaN
0 NaN NaN NaN ... NaN NaN NaN
0 - - ... - -
0 - - ... - -
0 - - ... - -
0 - - ... - -
0 - - ... - -
0 - - ... 2,400 2,400
0 - - ... - -
0 4,800 - ... 110,400 194,400
0 - - ... 40,800 62,400
0 4,800 2,400 ... 165,600 2,822,400
0 4,800 - ... 576,000 957,600
0 86,400 28,800 ... 1,276,800 2,215,200
0 220,800 153,600 ... 664,800 1,336,800
0 3,151,200 2,071,200 ... 708,000 2,498,400
0 3,204,000 2,284,800 ... 314,400 1,387,200
0 7,231,200 4,819,200 ... 770,400 2,037,600
0 5,469,600 2,947,200 ... 81,600 288,000
0 10,612,800 4,504,800 ... -2,400 184,800
0 2,632,800 945,600 ... -4,800 21,600
0 6,523,200 2,496,000 ... - 45,600
0 391,200 -40,800 ... - 2,400
0 1,072,800 314,400 ... -2,400 249,600
0 21,600 4,800 ... - 4,800
0 249,600 168,000 ... -4,800 57,600
0 - - ... - -
0 2,400 2,400 ... - 7,200
0 - - ... - -
0 4,800 - ... - 40,800
0 - - ... - -
0 - - ... - -
0 - - ... - -
0 - - ... -14,400 4,800
0 Total 40,888,800 ... NaN NaN NaN
[35 rows x 23 columns]
答案 1 :(得分:0)
代码的逻辑没有错,它正确地完成了工作。罪魁祸首是下面这一点。
获取列名列表并打印后发现,网站上原有的 '\xc2\xa0' 列最近已被移除,而我们的代码仍在尝试过滤掉它,这正是导致问题的原因。按如下方式修改代码后,一切恢复正常。
==