如何在Windows 10上使用Selenium和Python 3.6刮取表

时间:2017-03-31 15:19:17

标签: python selenium python-3.6

我正在努力从网站上抓取数据并将数据写入CSV文件。 然后写入文件更改页面后继续进行到最后一页。 我只从第一行获取数据。我做错了什么。 请纠正我。

import re
import os
import csv
import sys
import time
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.action_chains import ActionChains
driver =webdriver.PhantomJS()
driver.get('http://www.census2011.co.in/district.php')
driver.maximize_window()

select = Select(driver.find_element_by_xpath(".//div[@id='DataTables_Table_0_length']/label/div/select") )
select.select_by_visible_text("100")

for i in range(1,101,1) :
    sNo = driver.find_element_by_xpath("//table[@id='DataTables_Table_0']/tbody/tr['+str(i)+']/td[1]")
    district = driver.find_element_by_xpath("//table[@id='DataTables_Table_0']/tbody/tr['+str(i)+']/td[2]/a")
    districtName = driver.find_element_by_xpath("//table[@id='DataTables_Table_0']/tbody/tr['+str(i)+']/td[2]").text
    state= driver.find_element_by_xpath("//table[@id='DataTables_Table_0']/tbody/tr['+str(i)+']/td[3]").text
    population= driver.find_element_by_xpath("//table[@id='DataTables_Table_0']/tbody/tr['+str(i)+']/td[4]").text
    growth= driver.find_element_by_xpath("//table[@id='DataTables_Table_0']/tbody/tr['+str(i)+']/td[5]").text
    sexRatio= driver.find_element_by_xpath("//table[@id='DataTables_Table_0']/tbody/tr['+str(i)+']/td[6]").text
    districtLink = district.get_attribute("href")
    print(districtName,state,population,growth,sexRatio,districtLink)


with open(r"D:\python36_files\censusDistrictData.csv",'a',newline = "\n", encoding='utf-8') as csvfile:
    fieldnames = ["DistrictName", "State", "Population",  "Growth","SexRatio","DistrictLink"]
    writer = csv.DictWriter(csvfile, fieldnames = ["DistrictName", "State", "Population",  "Growth","SexRatio", "DistrictLink"],restval='none',delimiter = ';')
    writer.writerow({'DistrictName': districtName, 'State': state, 'Population':population,  'Growth':growth, 'SexRatio':sexRatio, 'DistrictLink': districtLink})

print ("OK")
driver.quit()

和另一个问题: 我试过刮掉CSS Selector但是不能在col。之后放一个分隔符。

import re
import os
import csv
import sys
import time
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.action_chains import ActionChains

driver =webdriver.PhantomJS()
driver.get('http://www.census2011.co.in/district.php')
driver.maximize_window()

select = Select(driver.find_element_by_xpath(".//div[@id='DataTables_Table_0_length']/label/div/select") )
select.select_by_visible_text("100")
districtTable = driver.find_elements_by_css_selector("#DataTables_Table_0")
for row in districtTable:
    print(row.text)
with open(r"D:\python36_files\censusDistrictData1.csv",'a',newline = "\n", encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames = ["Row"],restval='none',delimiter = ';')
    writer.writerow({"Row":row.text}) #pl.let me know how to insert ; after a data to a csvfile


print ("OK")
driver.quit()               

2 个答案:

答案 0 :(得分:1)

看起来你刚刚在for循环

下省略了缩进
for i in range(1,101):
    trXpath = "//table[@id='DataTables_Table_0']/tbody/tr[" + str(i) + "]"
    sNo = driver.find_element_by_xpath(trXpath + "/td[1]")
    districtLink = driver.find_element_by_xpath(trXpath + "/td[2]/a").get_attribute("href")
    districtName = driver.find_element_by_xpath(trXpath + "/td[2]").text
    state= driver.find_element_by_xpath(trXpath + "/td[3]").text
    population= driver.find_element_by_xpath(trXpath + "/td[4]").text
    growth= driver.find_element_by_xpath(trXpath + "/td[5]").text
    sexRatio= driver.find_element_by_xpath(trXpath + "/td[6]").text
    print(districtName, state, population, growth, sexRatio, districtLink)

我已经简化了一些代码,以便于阅读和维护。

答案 1 :(得分:0)

这个问题由我解决,但在Python中没有正确的方法,请详细指导我。

import re
import os
import csv
import sys
import time
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from  selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.action_chains import ActionChains
driver = webdriver.PhantomJS()
driver.get('http://www.census2011.co.in/district.php')
driver.maximize_window()
select =     Select(driver.find_element_by_xpath(".//div[@id='DataTables_Table_0_length']/label/div/select") )
select.select_by_visible_text("100")
i = 0
for i in range(0, 100):
    i += 1    
    baseXpath = "//table[@id='DataTables_Table_0']/tbody/tr["
    row = str(i)
    trXpath = baseXpath + row + ']'
    sNo = driver.find_element_by_xpath(trXpath + "/td[1]")
    districtLink = driver.find_element_by_xpath(trXpath + "/td[2]/a").get_attribute("href")
    districtName = driver.find_element_by_xpath(trXpath + "/td[2]").text
    state = driver.find_element_by_xpath(trXpath + "/td[3]").text
    population = driver.find_element_by_xpath(trXpath + "/td[4]").text
    growth = driver.find_element_by_xpath(trXpath + "/td[5]").text
    sexRatio = driver.find_element_by_xpath(trXpath + "/td[6]").text
    print(districtName,';', state,';', population,';', growth,';', sexRatio)
    with open(r"D:\python36_files\censusDistrictData.csv",'a',newline = "\n", encoding='utf-8') as csvfile:
        fieldnames = ["DistrictName", "State", "Population",  "Growth","SexRatio","DistrictLink"]
        writer = csv.DictWriter(csvfile, fieldnames = ["DistrictName", "State", "Population",  "Growth","SexRatio", "DistrictLink"],restval='none',delimiter = ';')
        writer.writerow({'DistrictName': districtName, 'State': state, 'Population':population,  'Growth':growth, 'SexRatio':sexRatio, 'DistrictLink': districtLink})

print ("OK")
driver.quit()