抓取和解析多页(aspx)表

时间:2019-04-02 16:33:45

标签: python-3.x

我正在尝试从以下网站(https://codal.ir/Search.aspx)的 ASPX 表格中提取 Excel 下载链接,但我的问题是无法抓取下一页的数据。相同的内容也可以在此处(here)找到。

import requests as requests
import re
import certifi
from pyquery import PyQuery as pq
from bs4 import BeautifulSoup


url = "https://codal.ir/Search.aspx"

# extract data from page
def extract_data(soup):
    """Print and return the Excel download links found in the result table.

    Searches every ``div.main_table`` element of *soup* for anchors whose
    ``href`` contains ``http://excel.codal.ir/``.

    Args:
        soup: Parsed page exposing ``select()``, e.g. a ``BeautifulSoup``
            document.

    Returns:
        list: The matching ``href`` values, in the order they were found.
    """
    print("1")
    tables = soup.select('div.main_table')
    print(tables)
    excel_links = []
    for table in tables:
        for anchor in table.find_all('a'):
            # An anchor without an href yields None, and `"..." in None`
            # raises TypeError, so fall back to an empty string.
            href = anchor.get('href') or ''
            if "http://excel.codal.ir/" in href:
                print(href)
                excel_links.append(href)
    return excel_links

# Reuse one session so cookies set by the first GET accompany each postback.
session = requests.Session()

# NOTE(review): verify=False turns off TLS certificate checking for this
# request; drop it (or pass verify=certifi.where()) once the site's
# certificate chain validates.
response = session.get(url, verify=False)
soup = BeautifulSoup(response.content, "html.parser")

# get view state value
view_state = soup.find_all("input", {"id": "__VIEWSTATE"})[0]["value"]
print(view_state)

# get all event target values
# find() gives None when the page has no pager, so a single-page result
# no longer raises IndexError before the first page is processed.
event_target = soup.find("div", {"class": "pagerWrapper"})
print(event_target)

# The original loop iterated over an `event_target_list` that was never
# defined (NameError). Build it from the pager links.
# NOTE(review): this assumes the pager anchors carry the usual ASP.NET
# __doPostBack('<target>','<argument>') call in href or onclick -- confirm
# against the live page markup.
postback_pattern = re.compile(r"__doPostBack\('([^']*)'\s*,\s*'([^']*)'\)")
event_target_list = []
if event_target is not None:
    for anchor in event_target.find_all("a"):
        script = "{0} {1}".format(anchor.get("href") or "",
                                  anchor.get("onclick") or "")
        match = postback_pattern.search(script)
        if match and match.groups() not in event_target_list:
            event_target_list.append(match.groups())

# extract data for the 1st page
extract_data(soup)

# extract data for each page except the first
for link, argument in event_target_list:
    print("get page {0}".format(link))
    # Start from the first page's view state, then overlay the hidden fields
    # of the most recently fetched page so each postback sends the state the
    # server issued last (__VIEWSTATE, and __EVENTVALIDATION etc. if present).
    post_data = {'__VIEWSTATE': view_state}
    post_data.update({
        field["name"]: field.get("value", "")
        for field in soup.find_all("input", {"type": "hidden"})
        if field.get("name")
    })
    post_data['__EVENTTARGET'] = link
    post_data['__EVENTARGUMENT'] = argument
    response = session.post(url, data=post_data)
    soup = BeautifulSoup(response.content, "html.parser")
    extract_data(soup)

0 个答案:

没有答案