我有下面的代码。它使用 Selenium 和 BeautifulSoup 登录我的 Yahoo 邮箱,然后进入特定文件夹,并获取该文件夹中所有邮件的发件人电子邮件地址。我知道该文件夹中大约有 1000 封邮件,但脚本只返回了 100 个地址。我是 Selenium 和 BeautifulSoup 的新手,有人知道下面的代码是否只会返回页面上当前显示的那些邮件的地址吗?如果是这样,有人能建议我如何获取文件夹中其余邮件的地址吗?还是说我遗漏了别的什么?
代码:
import pandas as pd
import numpy as np
import os
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
# Configure Chrome and start the WebDriver session used by the rest of the script.
# A single Options object carries the headless flag, so the flag is actually
# applied to the browser that gets launched.
options = Options()
options.add_argument('headless')
# Alias kept so that any code referring to `chrome_options` sees the same settings.
chrome_options = options
# The chromedriver binary must already be downloaded to the path below.
# NOTE(review): `executable_path=` / `chrome_options=` are the older Selenium
# keyword names; newer releases expect `service=` / `options=` -- confirm
# against the installed Selenium version.
driver = webdriver.Chrome(
    executable_path=os.path.abspath("/Users/username/Desktop/Stuff/Continuing_Education/Recommenders/chromedriver"),
    chrome_options=chrome_options,
)
#re allows for matching text with regular expressions (including through BeautifulSoup)
#dateutil.parser provides .parse() to convert plain text dates in a variety of formats into datetime objects
import re, dateutil.parser
#BeautifulSoup provide a model for the source HTML
from bs4 import BeautifulSoup
#Webdriver is the interface to the selected browser (Chrome in this script)
from selenium import webdriver
#Ability to select values in HTML <select> tags
from selenium.webdriver.support import select
import time
from selenium.webdriver.common import action_chains, keys
#system
#JSON for language agnostic output
# Prefer simplejson when it is installed; otherwise fall back to the
# standard-library json module under the same name.
try:
    import simplejson as json
except ImportError:
    import json
# Element ids and URLs used by the Yahoo sign-in steps below.
#id for email <input>
EMAIL_ID = 'login-signin'
#id for password <input>
PASSWORD_ID = 'login-passwd'
#id for login button
LOGIN_ID = 'LoginButton'
# Sign-in entry point; the `done` parameter points back to Yahoo Mail.
login_url='https://login.yahoo.com/?.src=ym&.lang=en-US&.intl=us&authMechanism=primary&done=https%3A%2F%2Fmail.yahoo.com%2Fd&eid=100&add=1'
# Placeholder credentials. NOTE: real credentials should not be hard-coded in
# source; load them from the environment or a secrets store instead.
email='madeup@yahoo.com'
# Plain ASCII quotes are required here; typographic quotes are a syntax error.
password='fake1'
# provide login email
from selenium.webdriver.support.ui import WebDriverWait
#driver = webdriver.Firefox()
# --- Step 1: open the sign-in page and submit the account email ---
driver.get(login_url)
# Explicit check rather than `assert`, which is stripped when Python runs with -O.
if 'Next' not in driver.page_source:
    raise RuntimeError("Yahoo sign-in page did not load as expected: 'Next' not found in page source")
action = action_chains.ActionChains(driver)
# Type the email address followed by ENTER; no target element is given, so the
# keystrokes go to whichever element currently has focus.
action.send_keys(email+keys.Keys.ENTER)
action.perform()

# --- Step 2: open the password challenge page and submit the password ---
# NOTE(review): this URL embeds what look like session-specific values
# (yid, sessionIndex, acrumb) -- confirm it is still valid for a fresh session.
psswd_url='https://login.yahoo.com/account/challenge/password?.src=ym&.lang=en-US&.intl=us&authMechanism=primary&done=https%3A%2F%2Fmail.yahoo.com%2Fd&add=1&display=login&yid=username&sessionIndex=Qg--&acrumb=8eRX3U0n'
driver.get(psswd_url)
# The password <input> id must be present before any keystrokes are sent.
if PASSWORD_ID not in driver.page_source:
    raise RuntimeError("Yahoo password page did not load as expected: %r not found in page source" % PASSWORD_ID)
action = action_chains.ActionChains(driver)
# Type the password followed by ENTER to submit the form.
action.send_keys(password+keys.Keys.ENTER)
action.perform()
# Collect the sender address of every message currently rendered in the folder.
recruit_url='https://mail.yahoo.com/d/folders/86'
# Load the folder page so that page_source below reflects this folder.
driver.get(recruit_url)
# Name the parser explicitly so bs4 does not pick one based on what is installed.
tstsoup = BeautifulSoup(driver.page_source, 'html.parser')
# NOTE(review): page_source only contains rows that are in the DOM right now.
# If the message list loads further rows as the user scrolls (presumably why
# about 100 of ~1000 messages are returned), the list would have to be
# scrolled until the row count stops growing before parsing -- confirm
# against the live page.
# Query the sender cells once instead of re-running find_all on every iteration.
sender_cells = tstsoup.find_all('div',{'data-test-id':'senders'})
# The first <span> inside each sender cell carries the address in its `title` attribute.
rec_emails = [cell.find_all('span')[0]['title'] for cell in sender_cells]
# print() so the count is also shown when run as a script, not only in a REPL/notebook.
print(len(rec_emails))
输出:
100