我正在努力从网站上抓取数据并将数据写入CSV文件。 然后写入文件更改页面后继续进行到最后一页。 我只从第一行获取数据。我做错了什么。 请纠正我。
import re
import os
import csv
import sys
import time
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.action_chains import ActionChains
driver =webdriver.PhantomJS()
driver.get('http://www.census2011.co.in/district.php')
driver.maximize_window()
select = Select(driver.find_element_by_xpath(".//div[@id='DataTables_Table_0_length']/label/div/select") )
select.select_by_visible_text("100")
for i in range(1,101,1) :
sNo = driver.find_element_by_xpath("//table[@id='DataTables_Table_0']/tbody/tr['+str(i)+']/td[1]")
district = driver.find_element_by_xpath("//table[@id='DataTables_Table_0']/tbody/tr['+str(i)+']/td[2]/a")
districtName = driver.find_element_by_xpath("//table[@id='DataTables_Table_0']/tbody/tr['+str(i)+']/td[2]").text
state= driver.find_element_by_xpath("//table[@id='DataTables_Table_0']/tbody/tr['+str(i)+']/td[3]").text
population= driver.find_element_by_xpath("//table[@id='DataTables_Table_0']/tbody/tr['+str(i)+']/td[4]").text
growth= driver.find_element_by_xpath("//table[@id='DataTables_Table_0']/tbody/tr['+str(i)+']/td[5]").text
sexRatio= driver.find_element_by_xpath("//table[@id='DataTables_Table_0']/tbody/tr['+str(i)+']/td[6]").text
districtLink = district.get_attribute("href")
print(districtName,state,population,growth,sexRatio,districtLink)
with open(r"D:\python36_files\censusDistrictData.csv",'a',newline = "\n", encoding='utf-8') as csvfile:
fieldnames = ["DistrictName", "State", "Population", "Growth","SexRatio","DistrictLink"]
writer = csv.DictWriter(csvfile, fieldnames = ["DistrictName", "State", "Population", "Growth","SexRatio", "DistrictLink"],restval='none',delimiter = ';')
writer.writerow({'DistrictName': districtName, 'State': state, 'Population':population, 'Growth':growth, 'SexRatio':sexRatio, 'DistrictLink': districtLink})
print ("OK")
driver.quit()
和另一个问题: 我试过刮掉CSS Selector但是不能在col。之后放一个分隔符。
import re
import os
import csv
import sys
import time
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.action_chains import ActionChains
driver =webdriver.PhantomJS()
driver.get('http://www.census2011.co.in/district.php')
driver.maximize_window()
select = Select(driver.find_element_by_xpath(".//div[@id='DataTables_Table_0_length']/label/div/select") )
select.select_by_visible_text("100")
districtTable = driver.find_elements_by_css_selector("#DataTables_Table_0")
for row in districtTable:
print(row.text)
with open(r"D:\python36_files\censusDistrictData1.csv",'a',newline = "\n", encoding='utf-8') as csvfile:
writer = csv.DictWriter(csvfile, fieldnames = ["Row"],restval='none',delimiter = ';')
writer.writerow({"Row":row.text}) #pl.let me know how to insert ; after a data to a csvfile
print ("OK")
driver.quit()
答案 0 :(得分:1)
看起来你刚刚在for
循环
for i in range(1,101):
trXpath = "//table[@id='DataTables_Table_0']/tbody/tr[" + str(i) + "]"
sNo = driver.find_element_by_xpath(trXpath + "/td[1]")
districtLink = driver.find_element_by_xpath(trXpath + "/td[2]/a").get_attribute("href")
districtName = driver.find_element_by_xpath(trXpath + "/td[2]").text
state= driver.find_element_by_xpath(trXpath + "/td[3]").text
population= driver.find_element_by_xpath(trXpath + "/td[4]").text
growth= driver.find_element_by_xpath(trXpath + "/td[5]").text
sexRatio= driver.find_element_by_xpath(trXpath + "/td[6]").text
print(districtName, state, population, growth, sexRatio, districtLink)
我已经简化了一些代码,以便于阅读和维护。
答案 1 :(得分:0)
这个问题由我解决,但在Python中没有正确的方法,请详细指导我。
import re
import os
import csv
import sys
import time
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.action_chains import ActionChains
driver = webdriver.PhantomJS()
driver.get('http://www.census2011.co.in/district.php')
driver.maximize_window()
select = Select(driver.find_element_by_xpath(".//div[@id='DataTables_Table_0_length']/label/div/select") )
select.select_by_visible_text("100")
i = 0
for i in range(0, 100):
i += 1
baseXpath = "//table[@id='DataTables_Table_0']/tbody/tr["
row = str(i)
trXpath = baseXpath + row + ']'
sNo = driver.find_element_by_xpath(trXpath + "/td[1]")
districtLink = driver.find_element_by_xpath(trXpath + "/td[2]/a").get_attribute("href")
districtName = driver.find_element_by_xpath(trXpath + "/td[2]").text
state = driver.find_element_by_xpath(trXpath + "/td[3]").text
population = driver.find_element_by_xpath(trXpath + "/td[4]").text
growth = driver.find_element_by_xpath(trXpath + "/td[5]").text
sexRatio = driver.find_element_by_xpath(trXpath + "/td[6]").text
print(districtName,';', state,';', population,';', growth,';', sexRatio)
with open(r"D:\python36_files\censusDistrictData.csv",'a',newline = "\n", encoding='utf-8') as csvfile:
fieldnames = ["DistrictName", "State", "Population", "Growth","SexRatio","DistrictLink"]
writer = csv.DictWriter(csvfile, fieldnames = ["DistrictName", "State", "Population", "Growth","SexRatio", "DistrictLink"],restval='none',delimiter = ';')
writer.writerow({'DistrictName': districtName, 'State': state, 'Population':population, 'Growth':growth, 'SexRatio':sexRatio, 'DistrictLink': districtLink})
print ("OK")
driver.quit()