Scraping a web page

Time: 2020-07-11 09:41:44

Tags: python html web-scraping

I wrote some code to scrape the website https://www1.nseindia.com/live_market/dynaContent/live_watch/fxTracker/optChainDataByExpDates.jsp?symbol=USDINR&instrument=OPTCUR&expiryDt=17JUL2020. My code retrieves the table columns but not the data. Can someone tell me what I am doing wrong?
I am using BeautifulSoup and have looked at the HTML structure of the page.

    import requests
    from bs4 import BeautifulSoup
    import pandas as pd
    import datetime as dt,time
    import os
    from  pathlib import Path


    headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) chrome/80.0.3987.132 Safari/537.36','Accept-Language': 'en-US,en;q=0.9','Accept-Encoding': 'gzip, deflate'}
    url = "https://www1.nseindia.com/live_market/dynaContent/live_watch/fxTracker/optChainDataByExpDates.jsp"
    symbol= 'USDINR'
    exp= '29JUL2020'
    page = requests.get(url, params = {"symbol": symbol,"instrument": "OPTCUR","date": exp}, headers = headers)
    page.status_code
    page.content
    soup= BeautifulSoup(page.content, 'html.parser')
    #print(soup.prettify())    

    table_it=soup.find_all(class_="opttbldata")
    table_cls_1=soup.find_all(id="octable")    

    # module 1: getting table columns

    col_list=[]
    for mytable in table_cls_1:
        table_head= mytable.find('thead')

    try:
        rows=table_head.find_all('tr')
        for tr in rows:
            cols = tr.find_all('th')
            for th in cols:
                er=th.text
            #ee=er.encode('utf8')
                col_list.append(er)
   
    except:
        print("no thead")
    
    col_list_fnl= [e for e in col_list if e not in ('CALLS','PUTS','Chart','\xc2\xa0','\xa0')]
    print( col_list_fnl)          

    
    # module 2: getting data
    table_cls_2= soup.find( id="octable")
    all_trs = table_cls_2.find_all('tr')
    req_row = table_cls_2.find_all('tr')       

    new_table= pd.DataFrame( index= range(0, len(req_row)-3), columns = col_list_fnl)       

    row_marker=0       

    for row_number, tr_nos in enumerate(req_row):
       
        if row_number<=1 or row_number == len(req_row)-1:
            continue
       
        td_columns = tr_nos.find_all('td')
       
        select_cols = td_columns[1:22]
        cols_horizontal = range(0, len( select_cols))
        for nu, column in enumerate(select_cols):
            utf_string = column.get_text()
            utf_string = utf_string.strip('\n\r\t":')
            # tr=utf_string.decode('utf8')
            # tr=tr.replace(',' , '')
            new_table.iloc[row_marker,[nu]] = utf_string
        row_marker+= 1           

    print(new_table)
    
    new_table['Expiry'] = exp
    new_table['Date'] = date
    new_table= new_table.replace('-',"0")
 

1 answer:

Answer 0: (score: 0)

It seems your parameters are not being applied correctly. I would suggest passing the parameters in the URL itself, for example:

    symbol= 'USDINR'
    exp= '17JUL2020'
    url = f"https://www1.nseindia.com/live_market/dynaContent/live_watch/fxTracker/optChainDataByExpDates.jsp?symbol={symbol}&instrument=OPTCUR&expiryDt={exp}"
    page = requests.get(url, headers = headers)

This will give you the correct output for the date you selected.
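
If you would rather keep the `params` dictionary from the original code, here is a minimal sketch, assuming the only problem is the parameter name: the working URL uses `expiryDt` in its query string, while the original code passes a key named `date`, so renaming the key should build the same request:

    import requests

    # same headers as in the question
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) chrome/80.0.3987.132 Safari/537.36',
               'Accept-Language': 'en-US,en;q=0.9',
               'Accept-Encoding': 'gzip, deflate'}
    url = "https://www1.nseindia.com/live_market/dynaContent/live_watch/fxTracker/optChainDataByExpDates.jsp"
    symbol = 'USDINR'
    exp = '17JUL2020'
    # "expiryDt" instead of "date" -- requests then builds the same query string as the URL above
    page = requests.get(url, params={"symbol": symbol, "instrument": "OPTCUR", "expiryDt": exp}, headers=headers)
    print(page.status_code)  # 200 if the request succeeded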