Scraping data from a print preview page using Selenium and Python

Date: 2019-07-30 10:10:39

Tags: python python-3.x selenium-webdriver beautifulsoup

In the code below, I try to scrape every row of a project's print preview page and export it to a CSV file. I tested it by sending the registration key P50500000005, and for that key it works correctly. But when I change the key to another one (P49500000001), the scraped output no longer matches what the print preview page actually shows. I want the code to scrape the print preview page for any key and write it to the CSV file.
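
Roughly, this is the flow I want to repeat for a whole list of keys. Here is a minimal sketch of that loop; the key list and the parse_preview helper are placeholders I made up for illustration, while the element IDs and XPath are the same ones my script already uses:

# Minimal sketch of the intended loop over several registration keys.
# "parse_preview" and the key list are placeholders, not real site/API names.
import csv

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


def parse_preview(html, writer):
    """Placeholder: parse one print-preview page and write its rows."""
    # The BeautifulSoup parsing of the preview page would go here.
    pass


keys = ["P50500000005", "P49500000001"]   # example registration numbers
driver = webdriver.Chrome()               # chromedriver path omitted here

with open("file_demo.csv", "a", newline="") as csv_file:
    writer = csv.writer(csv_file)
    for key in keys:
        driver.get("https://maharerait.mahaonline.gov.in")
        WebDriverWait(driver, 20).until(EC.element_to_be_clickable(
            (By.XPATH, "//div[@class='search-pro-details']"
                       "//a[contains(.,'Search Project Details')]"))).click()
        radio = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.ID, "Promoter")))
        driver.execute_script("arguments[0].click();", radio)
        driver.find_element_by_id("CertiNo").send_keys(key)
        search = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.ID, "btnSearch")))
        driver.execute_script("arguments[0].click();", search)
        # Follow the first link on the results page to the print preview.
        view = [a.get_attribute("href") for a in driver.find_elements_by_tag_name("a")
                if a.get_attribute("href") is not None][0]
        driver.get(view)
        parse_preview(driver.page_source, writer)

My full (single-key) script is below: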

import urllib.request
from bs4 import BeautifulSoup
import csv
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

url = 'https://maharerait.mahaonline.gov.in'
chrome_path = r'C:/Users/User/AppData/Local/Programs/Python/Python36/Scripts/chromedriver.exe'

driver = webdriver.Chrome(executable_path=chrome_path)
driver.get(url)

# Open the "Search Project Details" page and select the registered-project search.
WebDriverWait(driver, 20).until(EC.element_to_be_clickable(
    (By.XPATH, "//div[@class='search-pro-details']//a[contains(.,'Search Project Details')]"))).click()
Registered_Project_radio = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.ID, "Promoter")))
driver.execute_script("arguments[0].click();", Registered_Project_radio)

# Enter the registration certificate number and run the search.
Application = driver.find_element_by_id("CertiNo")
Application.send_keys("P49500000001")  # works properly for P50500000005
Search = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, "btnSearch")))
driver.execute_script("arguments[0].click();", Search)
View = [item.get_attribute('href') for item in 
   driver.find_elements_by_tag_name("a") if
   item.get_attribute('href') is not None]
View = View[0]
request = urllib.request.Request(View)
html = urllib.request.urlopen(request).read()
soup = BeautifulSoup(html, 'html.parser')
with open("file_demo.csv" , "a") as csv_file:
        writer = csv.writer(csv_file)

        divPInfo = soup.find("div", {"id": "DivPInfo"})
        title = divPInfo.find("div", {'class': 'x_panel'}, 
        recursive=False).find("div", {'class': 'x_title'}).find(
              "h2").text.strip()
        print(title)
        csv_file.write(title + "\n")

        x_contentObject = divPInfo.find("div", {'class': 'x_panel'}, recursive=False).find_all("div", {'class': 'col-md-3'})

        my_dict = {x_contentObject[0].text.strip(): x_contentObject[1].text.strip()}

        print(my_dict)
        for key, value in my_dict.items():
              writer.writerow([key, value])

        # code for Organization details:
        divPInfo1 = soup.find("div", {"id": "fldFirm"})
        title1 = divPInfo1.find("div", {'class': 'x_panel'}, recursive=False).find(
            "div", {'class': 'x_title'}).find("h2").text.strip()
        print(title1)
        csv_file.write(title1 + "\n")

        x_contentObject1 = divPInfo1.find("div", {'class': 'x_panel'}, recursive=False).find_all("div", {'class': 'col-md-3'})

        # Labels and values alternate in consecutive col-md-3 divs; pairs 0-11 hold the organization fields.
        my_dict1 = {x_contentObject1[i].text.strip(): x_contentObject1[i + 1].text.strip() for i in range(0, 12, 2)}
        print(my_dict1)
        for key, value in my_dict1.items():
            writer.writerow([key, value])

        # Code for the Address details:
        Address = soup.select_one('.x_title:contains("Address Details")')
        print(Address.text)
        csv_file.write(Address.text + "\n")

        # Pairs 12-33 of the organization panel hold the address fields.
        my_dict7 = {x_contentObject1[i].text.strip(): x_contentObject1[i + 1].text.strip() for i in range(12, 34, 2)}
        print(my_dict7)
        for key, value in my_dict7.items():
            writer.writerow([key, value])
        # code for the Organization Contact Details:
        Organization_Contact_Details = soup.select_one('.x_title:contains("Organization Contact Details")')
        print(Organization_Contact_Details.text)
        csv_file.write(Organization_Contact_Details.text + "\n")

        # Pairs 34-37 hold the organization contact fields.
        my_dict18 = {x_contentObject1[i].text.strip(): x_contentObject1[i + 1].text.strip() for i in range(34, 38, 2)}
        print(my_dict18)
        for key, value in my_dict18.items():
            writer.writerow([key, value])

        # Member Information
        div_mem_info = soup.find("div", {"id": "fldindtxt78"})
        Mem_info_title = div_mem_info.find("div", {'class': 'x_panel'}, recursive=False).find("div", {'class': 'x_title'}).find(
              "h2").text.strip()
        print(Mem_info_title)
        csv_file.write(Mem_info_title + "\n")
        # Load the print preview in the browser so pandas can read its HTML tables.
        driver.get(View)
        table = pd.read_html(driver.page_source)[0]
        print(table)
        table.to_csv(csv_file, sep=',', index=False)

        # code for the Project:
        divPInfo2 = soup.find("div", {"id": "DivProject"})
        Project_title = divPInfo2.find("div", {'class': 'x_panel'}, recursive=False).find(
            "div", {'class': 'x_title'}).find("h2").text.strip()
        print(Project_title)
        csv_file.write(Project_title + "\n")

        x_contentObject1 = divPInfo2.find("div", {'class': 'x_panel'}, recursive=False).find_all("div", {'class': 'col-md-3'})

        # Pairs 0-51 of the project panel hold the general project fields.
        my_dict1 = {x_contentObject1[i].text.strip(): x_contentObject1[i + 1].text.strip() for i in range(0, 52, 2)}
        print(my_dict1)
        for key, value in my_dict1.items():
            writer.writerow([key, value])


        # Code for the FSI Details:
        fsi_content = soup.select_one('.x_title:contains("FSI Details")')
        print(fsi_content.text)
        csv_file.write(fsi_content.text + "\n")

        # Pairs 52-57 hold the FSI fields.
        my_dict27 = {x_contentObject1[i].text.strip(): x_contentObject1[i + 1].text.strip() for i in range(52, 58, 2)}
        print(my_dict27)
        for key, value in my_dict27.items():
            writer.writerow([key, value])

        # Code for the Bank Details:
        Bank_detail = soup.select_one('.x_title:contains("Bank Details")')
        print(Bank_detail.text)
        csv_file.write(Bank_detail.text + "\n")

        # Pairs 58-61 hold the bank fields.
        my_dict30 = {x_contentObject1[i].text.strip(): x_contentObject1[i + 1].text.strip() for i in range(58, 62, 2)}
        print(my_dict30)
        for key, value in my_dict30.items():
            writer.writerow([key, value])


        # code for the Project Details:
        divPInfo3 = soup.find("div", {"id": "DivAmenities"})

        Project_Detail_title = divPInfo3.find("div", {'class': 'x_panel'}, recursive=False).find("div", {'class': 'x_title'}).find(
              "h2").text.strip()

        print(Project_Detail_title)
        csv_file.write(Project_Detail_title + "\n")
        # Table index 1 is hard-coded; the preview page is assumed to list its tables in the same order for every project.
        table = pd.read_html(driver.page_source)[1]
        print(table)
        table.to_csv(csv_file, sep=',', index=False)

        # Code for the Development Work:
        development_Work = soup.select_one('.x_title:contains("Development Work")')
        print(development_Work.text)
        csv_file.write(development_Work.text + "\n")
        table = pd.read_html(driver.page_source)[2]
        print(table)
        table.to_csv(csv_file, sep=',', index=False)

        # Code for the Building Details:
        div_build_det = soup.find("div", {"id": "DivBuilding"})
        building_Details = div_build_det.find("div", {'class': 'x_panel'}, recursive=False).find(
            "div", {'class': 'x_title'}).find("h2").text.strip()
        print(building_Details)
        csv_file.write(building_Details + "\n")
        table = pd.read_html(driver.page_source)[3]
        table = table.drop_duplicates()
        print(table)
        table.to_csv(csv_file, sep=',', index=False)

        # Code for the Project Professional Information:
        project_Professional_Information = soup.select_one('.x_title:contains("Project Professional Information")')
        print(project_Professional_Information.text)
        csv_file.write(project_Professional_Information.text + "\n")
        table = pd.read_html(driver.page_source)[10]
        print(table)
        table.to_csv(csv_file, sep=',', index=False)
        # The "with" block closes the CSV file automatically, so no explicit csv_file.close() is needed.

0 Answers:

No answers yet.