import urllib2
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
# Scrape one Justdial "CA in Mumbai" listing and append it to textfile.csv
# as a single comma-separated row.
url = "http://www.justdial.com/Mumbai/CA"
driver = webdriver.Firefox()
try:
    driver.get(url)

    # Dump the text of every listing container so the page content can be
    # inspected on the console.
    elements = driver.find_elements_by_xpath('//div[@class="col-md-12 col-xs-12 padding0"]')
    for e in elements:
        print(e.text)

    url = driver.current_url

    # find_element_by_xpath (singular) yields a single element per call, so
    # exactly one row is collected here.
    # NOTE(review): the class names "contact_info" and "adress_info" look
    # misspelled -- verify them against the page markup.
    company_name = driver.find_element_by_xpath('//span[@class="jcn"]').text
    contact_number = driver.find_element_by_xpath('//p[@class="contact_info"]').text
    address = driver.find_element_by_xpath('//p[@class="adress_info"]').text
    address_info = driver.find_element_by_xpath('//p[@class="address-info adinfoex"]').text
    estd = driver.find_element_by_xpath('//li[@class="fr"]').text
    ratings = driver.find_element_by_xpath('//li[@class="last"]').text
finally:
    # Always close the browser, even when a lookup above raises.
    driver.quit()

tf = 'textfile.csv'
# The original wrote an undefined name `estd_ratings` as the last column;
# `estd` and `ratings` are written as two separate columns instead.
fields = [company_name, contact_number, address, address_info, estd, ratings]
# NOTE: values are joined verbatim without quoting, so a comma inside any
# value will shift the columns of that row.
with open(tf, 'a+') as f2:
    f2.write(','.join([data.encode('utf-8') for data in fields]) + '\n')
答案 0 :(得分:0)
以下内容应该能帮助您入门。重要的是确保您的xpath表达式准确地选中您需要的内容。Python的csv模块
可用于自动将列表转换为逗号分隔的条目,而无需自己添加逗号:
-- Return the one row of uk_postcodes with the smallest computed distance to the
-- fixed point (latitude 53.1852582, longitude -3.0198408999999997), looking only
-- at rows inside a longitude/latitude bounding box.
-- <longitude1>, <longitude2>, <latitude1>, <latitude2> are placeholders that
-- must be substituted before the query can run.
-- NOTE(review): this query looks unrelated to the surrounding Selenium/CSV
-- discussion -- confirm it belongs here.
SELECT
postcode,
-- Spherical-law-of-cosines style distance; 3959 is presumably the Earth's radius
-- in miles, which would make `distance` a value in miles -- TODO confirm.
( 3959 * acos( cos( radians( 53.1852582 ) ) * cos( radians( latitude ) ) * cos( radians( longitude ) - radians(-3.0198408999999997) ) + sin( radians(53.1852582) ) * sin( radians( latitude ) ) ) ) AS distance
FROM
uk_postcodes
WHERE
-- Bounding-box filter applied before ordering by the computed distance.
longitude BETWEEN <longitude1> AND <longitude2>
AND
latitude BETWEEN <latitude1> AND <latitude2>
ORDER BY
distance ASC
-- Keep only the closest match.
LIMIT 1;
这将为您提供类似以下内容的CSV文件:
import csv
import urllib2
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
def get_elements_by_xpath(driver, xpath):
    """Return the text of every element on the current page matching *xpath*.

    Args:
        driver: Object exposing ``find_elements_by_xpath(xpath)``; in this
            script it is the Selenium Firefox driver.
        xpath: XPath expression passed through to the driver unchanged.

    Returns:
        A list with the ``.text`` of each matched element, in the order the
        driver returned them (empty when nothing matches).
    """
    return [entry.text for entry in driver.find_elements_by_xpath(xpath)]
# Scrape every listing field on the Justdial "CA in Mumbai" page and write the
# results to textfile.csv, one row per listing.
url = "http://www.justdial.com/Mumbai/CA"
driver = webdriver.Firefox()
try:
    driver.get(url)

    # (CSV column name, xpath selecting that column's value in each listing)
    search_entries = [
        ("CompanyName", "//span[@class='jcn']"),
        ("ContactNumber", "//p[@class='contact-info']/span/a"),
        ("Address", "//span[@class='desk-add jaddt']"),
        ("AddressInfo", "//p[@class='address-info adinfoex']"),
        ("Estd", "//li[@class='fr']"),
        ("Ratings", "//li[@class='last']/a/span[@class='rt_count']"),
    ]

    # One list of texts per column, in the same order as search_entries.
    entries = [get_elements_by_xpath(driver, xpath) for name, xpath in search_entries]
finally:
    # Always close the browser, even when scraping raises.
    driver.quit()

# The file is opened only after scraping succeeded, so a failed run does not
# truncate an existing textfile.csv. Binary mode is what the Python 2 csv
# module expects.
with open('textfile.csv', 'wb') as f_output:
    csv_output = csv.writer(f_output)
    # Write header
    csv_output.writerow([name for name, xpath in search_entries])
    # zip(*entries) turns the per-column lists into per-listing rows.
    # NOTE: zip stops at the shortest column, so if any xpath matched fewer
    # elements than the others, trailing rows are dropped and values may no
    # longer line up with the right listing.
    # Cells are encoded to UTF-8 because the Python 2 csv writer cannot handle
    # non-ASCII unicode objects.
    csv_output.writerows(
        [cell.encode('utf-8') for cell in row] for row in zip(*entries)
    )
循环依次执行每个xpath搜索,并为每次搜索创建一个数组条目。每次搜索都会返回一个匹配项数组,因此最终会得到一个由数组组成的entries数组。
然后需要将其写入CSV文件。
CompanyName,ContactNumber,Address,AddressInfo,Estd,Ratings
Bansal Investment & Consult...,+(91)-22-38578062,Manpada-thane West.. | more..,"CA, Tax Consultants, more...",Estd.in 2003,27 Ratings
G.Kedia & Associates,+(91)-22-38555914,"Station Road, Thane We.. | more..","CA, Company Registration Consultants, more...",Estd.in 2010,17 Ratings
Tarun Shah & Associates,+(91)-22-38552775,"Mogra Lane, Andheri Ea.. | more..","CA, Income Tax Consultants, more...",Estd.in 2000,12 Ratings
Hemant Shah And Associates LLP,+(91)-22-38588696,"Azad Road, Andheri Eas.. | more..","CA, Company Secretary, more...",Estd.in 1988,65 Ratings
由于entries数组是按列顺序排列的,而CSV文件需要按行顺序写入,
因此使用zip(*entries)将其转换为行顺序。由于整个数组现在的顺序正确,因此只需调用一次writerows()
即可一次性写入整个文件。
使用Python的csv库的另一个好处是,如果任何字段包含逗号,它会自动在该字段两侧添加引号,以确保Excel不会将其解释为另一列。请注意,加载文件时您可能需要更改默认的单元格格式,因为Excel会尝试自行猜测类型。