基于href刮除多个页面,但是在python中存在多个页面的数据连接问题

时间:2019-05-28 06:46:57

标签: python selenium-webdriver beautifulsoup

我想使用python和Selenium Webdriver通过href抓取多页数据,但是我面临的问题是从第一页到最后一页的数据连接。我正在使用:

url = http://www.igrmaharashtra.gov.in/eASR/eASRCommon.aspx?hDistName=Buldhana

我发现了以下错误,用于连接和导出多个页面中单个文件中的数据

错误屏幕截图:

error screeshot

我的代码:

import urllib.request
from bs4 import BeautifulSoup
import csv
import os
from selenium import webdriver
from selenium.webdriver.support.select import Select
from selenium.webdriver.common.keys import Keys
import time
import pandas as pd
import os
url = 'http://www.igrmaharashtra.gov.in/eASR/eASRCommon.aspx?hDistName=Buldhana'
chrome_path =r'C:/Users/User/AppData/Local/Programs/Python/Python36/Scripts/chromedriver.exe'

d = webdriver.Chrome(executable_path=chrome_path)
d.implicitly_wait(10)
d.get(url)
with open('Sample_buldhana.csv', 'w',encoding='utf-16',newline='') as csvfile:
    f = csv.writer(csvfile, dialect='excel')

    Select(d.find_element_by_name('ctl00$ContentPlaceHolder5$ddlTaluka')).select_by_value('1')
    Select(d.find_element_by_name('ctl00$ContentPlaceHolder5$ddlVillage')).select_by_value('1458') 

   tableElement = d.find_element_by_id(
    'ctl00_ContentPlaceHolder5_grdUrbanSubZoneWiseRate')
   table = pd.read_html(tableElement.get_attribute('outerHTML'))[0]
   #print(table)
   table.columns = table.iloc[0]
   table = table.iloc[1:]
   #print(type(table))
   table = table[table.Select == 'SurveyNo']
   #print(table) #assumption SurveyNo exists for all wanted rows
   surveyNo_scripts = [item.get_attribute('href') for item in 
   d.find_elements_by_css_selector(
    "#ctl00_ContentPlaceHolder5_grdUrbanSubZoneWiseRate [href*='Select$']")]
   #print(surveyNo_scripts)
   i = 0
   for script in surveyNo_scripts:
        d.execute_script(script)
        surveys = d.find_element_by_css_selector('textarea').text
        table.iloc[[i],table.columns.get_loc('Select')] = surveys
        i += 1   
   print(table)

   #rename and re-order columns as required

   while [page.get_attribute('href')for page in 
   d.find_elements_by_css_selector( 
   "#ctl00_ContentPlaceHolder5_grdUrbanSubZoneWiseRate [href*='Page$']")]:
        try:
            page_no =[page.get_attribute('href')for page in 
            d.find_elements_by_css_selector( 
            "#ctl00_ContentPlaceHolder5_grdUrbanSubZoneWiseRate  [href*='Page$']")]
            print(page_no)


           for script_page in page_no:
               d.execute_script(script_page)
               tableElement = d.find_element_by_css_selector( 
               "#ctl00_ContentPlaceHolder5_grdUrbanSubZoneWiseRate")
               table1 = 
               pd.read_html(tableElement.get_attribute('outerHTML'))[0]
               table1.columns = table1.iloc[0]
               table1 = table1.iloc[1:]
               #print(type(table))
               table1 = table1[table1.Select == 'SurveyNo']
               #print(table) #assumption SurveyNo exists for all wanted 
               rows
               surveyNo_scripts = [item.get_attribute('href') for item in 
               d.find_elements_by_css_selector(
               "#ctl00_ContentPlaceHolder5_grdUrbanSubZoneWiseRate  [href*='Select$']")]
               #print(surveyNo_scripts)
               i = 0
               for script in surveyNo_scripts:
                   d.execute_script(script)
                   surveys = 
                   d.find_element_by_css_selector('textarea').text
                   table1.iloc[[i],table1.columns.get_loc('Select')] = 
                   surveys
                   i += 1   
                   #print(table1)
                   #table = 
                   table.append(table1.reindex(columns=table.columns))
                   table = pd.concat([table, table1], 
                   axis=0,join_axes=None,ignore_index=False)
                   #table = pd.concat([table, table1]) 
                   print(table)
         except:
            break 
  table.to_csv(r"C:\Users\Guest\Desktop\Sample_buldhana.csv", sep=',', 
  encoding='utf-8-sig',index = False ) 

1 个答案:

答案 0 :(得分:0)

这是一个警告。为什么不将建议的参数添加到签名?

table = pd.concat([table, table1], sort = False) #or True if you want the sort