我想从这个页面的每个下拉菜单中收集列表数据。使用 Selenium(Python 3.6)时,我可以访问各个 'li' 标签并取得其中的 'href' 数据,但问题是无法取得每个列表项的文本。
我的代码如下:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException
from bs4 import BeautifulSoup
from time import sleep
# Open the listing page in a PhantomJS browser session.
link = 'http://www.bobaedream.co.kr/cyber/CyberCar.php?gubun=I'
driver = webdriver.PhantomJS()
driver.set_window_size(1920, 1080)
driver.get(link)
sleep(.75)  # fixed pause after navigation before the page is read
# Parsed copy of the rendered page; not used by the rest of this snippet.
soup = BeautifulSoup(driver.page_source, "html.parser", from_encoding='utf-8')

# Collect a (text, href) pair for every anchor under "#layer_maker ul.list",
# skipping anchors whose visible text is exactly '전체'.
manufacturers = []
for anchor in driver.find_elements_by_css_selector("#layer_maker ul.list li a"):
    if anchor.text == '전체':
        continue
    manufacturers.append((str(anchor.text), str(anchor.get_attribute('href'))))

for manufacturer in manufacturers:
    print(manufacturer)
我的结果如下:
('', "javascript:selChange('maker', '0', '%EC%A0%84%EC%B2%B4');")
('', "javascript:selChange('maker', '1', 'BMW');")
('', "javascript:selChange('maker', '21', '%EB%B2%A4%EC%B8%A0');")
('', "javascript:selChange('maker', '32', '%EC%95%84%EC%9A%B0%EB%94%94');")
('', "javascript:selChange('maker', '44', '%ED%8F%AD%EC%8A%A4%EB%B0%94%EA%B2%90');")
('', "javascript:selChange('maker', '13', '%EB%A0%89%EC%84%9C%EC%8A%A4');")
('', "javascript:selChange('maker', '97', '%EB%AF%B8%EB%8B%88');")
('', "javascript:selChange('maker', '2', 'GM');")
('', "javascript:selChange('maker', '77', 'GMC');")
('', "javascript:selChange('maker', '5', '%EB%8B%9B%EC%82%B0');")
('', "javascript:selChange('maker', '6', '%EB%8B%A4%EC%9D%B4%ED%95%98%EC%93%B0');")
('', "javascript:selChange('maker', '7', '%EB%8B%B7%EC%A7%80');")
('', "javascript:selChange('maker', '9', '%EB%8F%84%EC%9A%94%ED%83%80');")
('', "javascript:selChange('maker', '10', '%EB%9E%80%EC%B9%98%EC%95%84');")
('', "javascript:selChange('maker', '11', '%EB%9E%8C%EB%B3%B4%EB%A5%B4%EA%B8%B0%EB%8B%88');")
('', "javascript:selChange('maker', '12', '%EB%9E%9C%EB%93%9C%EB%A1%9C%EB%B2%84');")
('', "javascript:selChange('maker', '14', '%EB%A1%9C%EB%B2%84');")
('', "javascript:selChange('maker', '15', '%EB%A1%9C%ED%84%B0%EC%8A%A4');")
('', "javascript:selChange('maker', '16', '%EB%A1%A4%EC%8A%A4%EB%A1%9C%EC%9D%B4%EC%8A%A4');")
('', "javascript:selChange('maker', '61', '%EB%A5%B4%EB%85%B8');")
('', "javascript:selChange('maker', '17', '%EB%A7%81%EC%BB%A8');")
('', "javascript:selChange('maker', '18', '%EB%A7%88%EC%84%B8%EB%9D%BC%ED%8B%B0');")
('', "javascript:selChange('maker', '19', '%EB%A7%88%EC%AF%94%EB%8B%A4');")
('', "javascript:selChange('maker', '1003', '%EB%A7%A5%EB%9D%BC%EB%A0%8C');")
('', "javascript:selChange('maker', '60', '%EB%A8%B8%ED%81%90%EB%A6%AC');")
('', "javascript:selChange('maker', '20', '%EB%AF%B8%EC%93%B0%EB%B9%84%EC%8B%9C');")
('', "javascript:selChange('maker', '82', '%EB%AF%B8%EC%AF%94%EC%98%A4%EC%B9%B4');")
('', "javascript:selChange('maker', '22', '%EB%B2%A4%ED%8B%80%EB%A6%AC');")
('', "javascript:selChange('maker', '23', '%EB%B3%BC%EB%B3%B4');")
('', "javascript:selChange('maker', '1009', '%EB%B6%81%EA%B8%B0%EC%9D%80%EC%83%81');")
('', "javascript:selChange('maker', '88', '%EB%B6%80%EA%B0%80%ED%8B%B0');")
('', "javascript:selChange('maker', '24', '%EB%B7%B0%EC%9D%B5');")
('', "javascript:selChange('maker', '99', '%EB%B9%84%EC%9D%B4%EC%8A%A4%EB%A7%8C');")
('', "javascript:selChange('maker', '25', '%EC%82%AC%EB%B8%8C');")
('', "javascript:selChange('maker', '94', '%EC%83%88%ED%84%B4');")
('', "javascript:selChange('maker', '29', '%EC%89%90%EB%B3%B4%EB%A0%88');")
('', "javascript:selChange('maker', '27', '%EC%8A%A4%EB%B0%94%EB%A3%A8');")
('', "javascript:selChange('maker', '28', '%EC%8A%A4%EC%A6%88%ED%82%A4');")
('', "javascript:selChange('maker', '103', '%EC%8A%A4%EC%B9%B4%EB%8B%88%EC%95%84');")
('', "javascript:selChange('maker', '93', '%EC%8A%A4%ED%8C%8C%EC%9D%B4%EC%BB%A4');")
('', "javascript:selChange('maker', '30', '%EC%8B%9C%ED%8A%B8%EB%A1%9C%EC%97%A5');")
('', "javascript:selChange('maker', '33', '%EC%95%8C%ED%8C%8C%EB%A1%9C%EB%A9%94%EC%98%A4');")
('', "javascript:selChange('maker', '62', '%EC%95%A0%EC%8A%A4%ED%84%B4%EB%A7%88%ED%8B%B4');")
('', "javascript:selChange('maker', '95', '%EC%96%B4%ED%81%90%EB%9D%BC');")
('', "javascript:selChange('maker', '34', '%EC%98%A4%ED%8E%A0');")
('', "javascript:selChange('maker', '1011', '%EC%98%A4%EC%8A%A4%ED%8B%B4');")
('', "javascript:selChange('maker', '35', '%EC%98%AC%EC%A6%88%EB%AA%A8%EB%B9%8C');")
('', "javascript:selChange('maker', '83', '%EC%9B%A8%EC%8A%A4%ED%8A%B8%ED%95%84%EB%93%9C');")
('', "javascript:selChange('maker', '36', '%EC%9D%B4%EC%8A%A4%EC%A6%88');")
('', "javascript:selChange('maker', '81', '%EC%9D%B8%ED%94%BC%EB%8B%88%ED%8B%B0');")
('', "javascript:selChange('maker', '37', '%EC%9E%AC%EA%B7%9C%EC%96%B4');")
('', "javascript:selChange('maker', '96', '%EC%A7%80%ED%94%84');")
('', "javascript:selChange('maker', '1006', '%ED%85%8C%EC%8A%AC%EB%9D%BC');")
('', "javascript:selChange('maker', '38', '%EC%BA%90%EB%94%9C%EB%9D%BD');")
('', "javascript:selChange('maker', '89', '%EC%BD%94%EB%8B%89%EC%84%B8%ED%81%AC');")
('', "javascript:selChange('maker', '39', '%ED%81%AC%EB%9D%BC%EC%9D%B4%EC%8A%AC%EB%9F%AC');")
('', "javascript:selChange('maker', '84', '%ED%8C%8C%EA%B0%80%EB%8B%88');")
('', "javascript:selChange('maker', '41', '%ED%8E%98%EB%9D%BC%EB%A6%AC');")
('', "javascript:selChange('maker', '42', '%ED%8F%AC%EB%93%9C');")
('', "javascript:selChange('maker', '43', '%ED%8F%AC%EB%A5%B4%EC%89%90');")
('', "javascript:selChange('maker', '1008', '%ED%8F%AC%ED%86%A4');")
('', "javascript:selChange('maker', '45', '%ED%8F%B0%ED%8B%B0%EC%95%85');")
('', "javascript:selChange('maker', '46', '%ED%91%B8%EC%A1%B0');")
('', "javascript:selChange('maker', '91', '%ED%94%BC%EC%8A%A4%EC%BB%A4');")
('', "javascript:selChange('maker', '47', '%ED%94%BC%EC%95%84%ED%8A%B8');")
('', "javascript:selChange('maker', '48', '%ED%97%88%EB%A8%B8');")
('', "javascript:selChange('maker', '50', '%ED%98%BC%EB%8B%A4');")
('', "javascript:selChange('maker', '76', '%ED%99%80%EB%8D%B4');")
('', "javascript:selChange('maker', '4', '%EA%B8%B0%ED%83%80 %EC%88%98%EC%9E%85%EC%B0%A8');")
我不明白为什么文本部分是空的,而且所有韩文字符都显示成了百分号编码(韩文字符是 javascript:selChange 调用中的第三个参数)。我想要的是取得文本部分,并让韩文字符正确显示。
请帮忙。
答案 0 :(得分:1)
尝试使用以下代码:
from urllib import parse
...
# Build (label, decoded href) pairs for every anchor under
# "#layer_maker ul.list", skipping the entry whose label is '전체'.
manufacturers = []
for o in driver.find_elements_by_css_selector("#layer_maker ul.list li a"):
    # Read the 'text' attribute once and reuse it for both the filter and the
    # result; every get_attribute() call is a separate request to the driver.
    label = o.get_attribute('text')
    if label == '전체':
        continue
    # get_attribute() returns None when the attribute is missing; fall back to
    # an empty string so unquote() does not raise TypeError.
    href = o.get_attribute('href') or ''
    # unquote() turns the percent-encoded sequences in the href back into
    # readable characters (decoded as UTF-8 by default).
    manufacturers.append((label, parse.unquote(href)))
for manufacturer in manufacturers:
    print(manufacturer)
输出:
('BMW', "javascript:selChange('maker', '1', 'BMW');")
('벤츠', "javascript:selChange('maker', '21', '벤츠');")
('아우디', "javascript:selChange('maker', '32', '아우디');")
('폭스바겐', "javascript:selChange('maker', '44', '폭스바겐');")
...