我想使用python和Selenium Webdriver通过href抓取多页数据,但是我面临的问题是从第一页到最后一页的数据连接。我正在使用:
url = http://www.igrmaharashtra.gov.in/eASR/eASRCommon.aspx?hDistName=Buldhana
我发现了以下错误,用于连接和导出多个页面中单个文件中的数据
错误屏幕截图:
我的代码:
import urllib.request
from bs4 import BeautifulSoup
import csv
import os
from selenium import webdriver
from selenium.webdriver.support.select import Select
from selenium.webdriver.common.keys import Keys
import time
import pandas as pd
import os
url = 'http://www.igrmaharashtra.gov.in/eASR/eASRCommon.aspx?hDistName=Buldhana'
chrome_path =r'C:/Users/User/AppData/Local/Programs/Python/Python36/Scripts/chromedriver.exe'
d = webdriver.Chrome(executable_path=chrome_path)
d.implicitly_wait(10)
d.get(url)
with open('Sample_buldhana.csv', 'w',encoding='utf-16',newline='') as csvfile:
f = csv.writer(csvfile, dialect='excel')
Select(d.find_element_by_name('ctl00$ContentPlaceHolder5$ddlTaluka')).select_by_value('1')
Select(d.find_element_by_name('ctl00$ContentPlaceHolder5$ddlVillage')).select_by_value('1458')
tableElement = d.find_element_by_id(
'ctl00_ContentPlaceHolder5_grdUrbanSubZoneWiseRate')
table = pd.read_html(tableElement.get_attribute('outerHTML'))[0]
#print(table)
table.columns = table.iloc[0]
table = table.iloc[1:]
#print(type(table))
table = table[table.Select == 'SurveyNo']
#print(table) #assumption SurveyNo exists for all wanted rows
surveyNo_scripts = [item.get_attribute('href') for item in
d.find_elements_by_css_selector(
"#ctl00_ContentPlaceHolder5_grdUrbanSubZoneWiseRate [href*='Select$']")]
#print(surveyNo_scripts)
i = 0
for script in surveyNo_scripts:
d.execute_script(script)
surveys = d.find_element_by_css_selector('textarea').text
table.iloc[[i],table.columns.get_loc('Select')] = surveys
i += 1
print(table)
#rename and re-order columns as required
while [page.get_attribute('href')for page in
d.find_elements_by_css_selector(
"#ctl00_ContentPlaceHolder5_grdUrbanSubZoneWiseRate [href*='Page$']")]:
try:
page_no =[page.get_attribute('href')for page in
d.find_elements_by_css_selector(
"#ctl00_ContentPlaceHolder5_grdUrbanSubZoneWiseRate [href*='Page$']")]
print(page_no)
for script_page in page_no:
d.execute_script(script_page)
tableElement = d.find_element_by_css_selector(
"#ctl00_ContentPlaceHolder5_grdUrbanSubZoneWiseRate")
table1 =
pd.read_html(tableElement.get_attribute('outerHTML'))[0]
table1.columns = table1.iloc[0]
table1 = table1.iloc[1:]
#print(type(table))
table1 = table1[table1.Select == 'SurveyNo']
#print(table) #assumption SurveyNo exists for all wanted
rows
surveyNo_scripts = [item.get_attribute('href') for item in
d.find_elements_by_css_selector(
"#ctl00_ContentPlaceHolder5_grdUrbanSubZoneWiseRate [href*='Select$']")]
#print(surveyNo_scripts)
i = 0
for script in surveyNo_scripts:
d.execute_script(script)
surveys =
d.find_element_by_css_selector('textarea').text
table1.iloc[[i],table1.columns.get_loc('Select')] =
surveys
i += 1
#print(table1)
#table =
table.append(table1.reindex(columns=table.columns))
table = pd.concat([table, table1],
axis=0,join_axes=None,ignore_index=False)
#table = pd.concat([table, table1])
print(table)
except:
break
table.to_csv(r"C:\Users\Guest\Desktop\Sample_buldhana.csv", sep=',',
encoding='utf-8-sig',index = False )
答案 0 :(得分:0)
这是一个警告。为什么不将建议的参数添加到签名?
table = pd.concat([table, table1], sort = False) #or True if you want the sort