我正在尝试把我的 Vudu 电影列表中的电影名称写入 csv 文件。我还处于早期阶段,无法弄清楚如何用 BeautifulSoup 获取电影名称。我知道它就在网站的 HTML 中。我目前让代码打印该元素的位置,但它总是返回"None"。
到目前为止,我已经包含了我的代码进度以及我需要的网站上的html代码照片。感谢任何帮助过的人!
##Make sure to replace USERNAME and PASSWORD with your own username and password
#Import libraries
from bs4 import BeautifulSoup
from lxml import html
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import csv
import json
import re
import requests
import time
import urllib.request
#Login Information
USERNAME = "example"
PASSWORD = "example"
#URLs
login_url = "https://my.vudu.com/MyLogin.html?type=sign_in&url=https%3A%2F%2Fwww.vudu.com%2F"
url = "https://www.vudu.com/movies/#my_vudu/my_movies"
def main():
    """Log in to Vudu with Selenium, then scrape movie titles from the
    JavaScript-rendered "My Movies" page with BeautifulSoup.

    Side effects: launches a Chrome browser, performs a login, and
    prints the first matching title element (or None if not found).
    """
    chromedriver = 'C:\\chromedriver.exe'
    browser = webdriver.Chrome(chromedriver)
    try:
        browser.get(login_url)
        time.sleep(10)  # crude wait for the login form to render

        browser.find_element_by_name('email').send_keys(USERNAME)
        browser.find_element_by_name('password').send_keys(PASSWORD)
        browser.find_element_by_css_selector('.custom-button').click()
        time.sleep(10)  # wait for the login round-trip to complete

        # BUG FIX: the original used urllib.request.urlopen(url), which
        # fetches the raw server HTML with neither the login session nor
        # any JavaScript execution — so the JS-rendered title <div> was
        # absent and soup.find(...) returned None.  Navigate the
        # logged-in browser instead and parse its rendered DOM.
        browser.get(url)
        time.sleep(10)  # wait for the JS-driven movie list to render

        soup = BeautifulSoup(browser.page_source, 'html.parser')
        name_box = soup.find('div', attrs={'class': 'gwt-Label title'})
        print(name_box)
    finally:
        # Always release the browser process, even if scraping fails.
        browser.quit()


if __name__ == '__main__':
    main()
答案 0(得分:1)
urllib.request.urlopen(url)
(和requests.get(url)
)直接从服务器获取HTML
,这意味着它没有在网络浏览器中通过JavaScript添加元素。而且它还没有登录。
但是当您使用 Selenium 加载页面并运行 JavaScript 之后,您可以通过 browser.page_source 获取包含所有更改的 HTML,并这样使用它:
soup = BeautifulSoup(browser.page_source, 'html.parser')
问题是:既然 Selenium 本身就提供了 find_* 系列函数可以直接在页面上搜索元素,那为什么还要使用 BeautifulSoup 呢。
编辑:下面的示例同时使用了 Selenium 和 BeautifulSoup:
from selenium import webdriver
from bs4 import BeautifulSoup
import time

# Demo: the same '.border .gwt-Image' elements can be located either
# directly through Selenium's finders or by handing the rendered page
# source to BeautifulSoup — both yield identical results.
browser = webdriver.Firefox()
browser.get("https://www.vudu.com/")
time.sleep(1)

print('--- Selenium ---')
selenium_images = browser.find_elements_by_css_selector('.border .gwt-Image')
for img in selenium_images[:5]:  # first five elements
    print('alt:', img.get_attribute('alt'))

print('--- BeautifulSoup ---')
soup = BeautifulSoup(browser.page_source, 'html.parser')
for tag in soup.select('.border .gwt-Image')[:5]:  # first five elements
    print('alt:', tag['alt'])
--- Selenium ---
alt: It (2017)
alt: American Made
alt: Dunkirk
alt: mother!
alt: The LEGO NINJAGO Movie
--- BeautifulSoup ---
alt: It (2017)
alt: American Made
alt: Dunkirk
alt: mother!
alt: The LEGO NINJAGO Movie
结果:
// NOTE(review): fragment of a boolean-returning method — the enclosing
// signature is outside this view, so only the body is shown here.
// Fetches a page with Jsoup and prints the text of every <tr> row;
// returns false if the fetch fails, true otherwise.
Document document;
try {
    // timeout(10000): 10-second connect/read timeout;
    // maxBodySize(0): lift Jsoup's default 1 MB response-body cap.
    document = Jsoup.connect("https://www.website.com").timeout(10000).maxBodySize(0).get();
} catch (Exception e) {
    // NOTE(review): catching bare Exception silently discards the
    // failure cause — consider catching IOException and logging it.
    return false;
}
// Select every table row in the document and print its visible text.
Elements elements = document.select("tr");
for (Element e : elements) {
    System.out.println(e.text());
}
return true;