我已使用Python
在Selenium
中创建了一个Web抓取工具。
我正在抓取的网站
请删除#
(要访问网站,请选择启动子,在部门中选择konkan,在地区中选择孟买市,然后选择项目,您将看到项目列表)
这是针对1个区和1个分区的。
我的抓取工具所做的事情是遍历每个地区和每个部门,选择项目并获取这些项目的详细信息。
但是如今,它仅废弃1个项目名称并停止它不会遍历项目列表,这是我收到的错误:
Error:
Traceback (most recent call last):
File "C:\Users\prince.bhatia\Desktop\Version\Maha Rera.py", line 64, in <module>
while len(selectProject.options) == 1:
File "C:\Users\prince.bhatia\AppData\Local\Programs\Python\Python36\lib\site-packages\selenium\webdriver\support\select.py", line 47, in options
return self._el.find_elements(By.TAG_NAME, 'option')
File "C:\Users\prince.bhatia\AppData\Local\Programs\Python\Python36\lib\site-packages\selenium\webdriver\remote\webelement.py", line 527, in find_elements
{"using": by, "value": value})['value']
File "C:\Users\prince.bhatia\AppData\Local\Programs\Python\Python36\lib\site-packages\selenium\webdriver\remote\webelement.py", line 493, in _execute
return self._parent.execute(command, params)
File "C:\Users\prince.bhatia\AppData\Local\Programs\Python\Python36\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 254, in execute
response = self.command_executor.execute(driver_command, params)
File "C:\Users\prince.bhatia\AppData\Local\Programs\Python\Python36\lib\site-packages\selenium\webdriver\remote\remote_connection.py", line 464, in execute
return self._request(command_info[0], url, body=data)
File "C:\Users\prince.bhatia\AppData\Local\Programs\Python\Python36\lib\site-packages\selenium\webdriver\remote\remote_connection.py", line 487, in _request
self._conn.request(method, parsed_url.path, body, headers)
File "C:\Users\prince.bhatia\AppData\Local\Programs\Python\Python36\lib\http\client.py", line 1239, in request
self._send_request(method, url, body, headers, encode_chunked)
File "C:\Users\prince.bhatia\AppData\Local\Programs\Python\Python36\lib\http\client.py", line 1285, in _send_request
self.endheaders(body, encode_chunked=encode_chunked)
File "C:\Users\prince.bhatia\AppData\Local\Programs\Python\Python36\lib\http\client.py", line 1234, in endheaders
self._send_output(message_body, encode_chunked=encode_chunked)
File "C:\Users\prince.bhatia\AppData\Local\Programs\Python\Python36\lib\http\client.py", line 1026, in _send_output
self.send(msg)
File "C:\Users\prince.bhatia\AppData\Local\Programs\Python\Python36\lib\http\client.py", line 964, in send
self.connect()
File "C:\Users\prince.bhatia\AppData\Local\Programs\Python\Python36\lib\http\client.py", line 936, in connect
(self.host,self.port), self.timeout, self.source_address)
File "C:\Users\prince.bhatia\AppData\Local\Programs\Python\Python36\lib\socket.py", line 722, in create_connection
raise err
File "C:\Users\prince.bhatia\AppData\Local\Programs\Python\Python36\lib\socket.py", line 713, in create_connection
sock.connect(sa)
OSError: [WinError 10048] Only one usage of each socket address (protocol/network address/port) is normally permitted
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
import os
import time
import csv
driver = webdriver.Chrome("./chromedriver")
driver.get('https://maharerait.mahaonline.gov.in/searchlist/searchlist')
# try:
# element = WebDriverWait(driver, 100).until(
# EC.presence_of_element_located((By.ID, "Promoter"))
# )
# finally:
# print('0000000000000000000000')
# driver.quit()
time.sleep(1)
driver.find_element_by_id('Promoter').click()
divisionLength = len(Select(driver.find_element_by_id('Division')).options)
print('*********{}'.format(divisionLength))
firstRow = 0
titleRow = []
contentRows = []
for divisionElement in range(1,divisionLength):
selectDivision = Select(driver.find_element_by_id('Division'))
selectDivision.options
selectDivision.select_by_index(divisionElement)
time.sleep(1)
districtLength = len(Select(driver.find_element_by_id('District')).options)
while districtLength == 1:
pass
print(districtLength)
for districtElement in range(1,districtLength):
selectDistrict = Select(driver.find_element_by_id('District'))
selectDistrict.options
selectDistrict.select_by_index(districtElement)
time.sleep(2)
projectLength = len(Select(driver.find_element_by_id('Project')).options)
print('/------------------------------/')
print('/-----project number: {}-------/'.format(projectLength))
print('/------------------------------/')
if projectLength == 1:
continue
for projectElement in range(1,projectLength):
selectProject = Select(driver.find_element_by_id('Project'))
selectProject.options
while len(selectProject.options) == 1:
pass
# c = len(select.options)
# print('---------------{}'.format(c))
# titleRow = []
# contentRows = []
# firstRow = 0
# for i in range(1,c):
# select = Select(driver.find_element_by_id('Project'))
# while len(select.options) == 1:
# pass
time.sleep(1)
selectProject.select_by_index(projectElement)
driver.find_element_by_id('btnSearch').click()
tableRows = driver.find_element_by_class_name('table').find_elements_by_tag_name('tr')
if firstRow == 0:
headRow = tableRows[0].find_elements_by_tag_name('th')
for headRowData in range(0,len(headRow)):
text = headRow[headRowData].find_element_by_tag_name('span').text
titleRow.append(text)
firstRow = firstRow + 1
for dataRowsNumbers in range(1,len(tableRows)):
dataRow = tableRows[dataRowsNumbers].find_elements_by_tag_name('td')
tempList = []
for dataRowContents in range(0,len(dataRow)):
try:
a_link = dataRow[dataRowContents].find_element_by_tag_name('a').get_attribute('href')
tempList.append(str(a_link))
except NoSuchElementException:
tempList.append(str(dataRow[dataRowContents].text))
# if dataRow[dataRowContents].text == 'View':
# a_link = dataRow[dataRowContents].find_element_by_tag_name('a').get_attribute('href')
# tempList.append(str(a_link))
# else:
# tempList.append(str(dataRow[dataRowContents].text))
print(dataRow[dataRowContents].text)
print(tempList)
contentRows.append(tempList)
# print('Automated check is over')
# print('Stored data in programs is as below:')
# print(contentRows)
with open("./data.csv",'w') as csvfile:
csvfile = csv.writer(csvfile, delimiter=',')
csvfile.writerow(titleRow)
csvfile.writerow("")
for i in range(0,len(contentRows)):
csvfile.writerow(contentRows[i])
driver.close()
有人可以告诉我我应该怎么做才能使其运行吗?
我正在使用Python 3.6
。
它应该遍历每个地区和每个部门,并刮擦每个项目名称的详细信息。