我正在尝试获取 Google“快速回答框”中的文本。看一下屏幕截图,就能清楚“快速回答框”指的是什么:
如果您输入搜索并且Google知道答案,Google会显示此框。因此,您无需打开下面显示的链接之一。如果您输入以下查询,则会显示该框:
https://google.de/search?q=definition%20calcium
现在我想通过 Python 脚本读取这段文本。我写了一个方法,使用 requests 和 BeautifulSoup 来实现这一目标:
def execute(self):
    """Fetch the Google result page for "definition calcium" and return
    every ``<ol>`` element carrying the ``lr_dct_sf_sens`` class.

    ``self._proxy`` is handed to ``requests.get`` as its ``proxies`` argument.
    """
    search_url = 'https://google.de/search?q=definition%20calcium'
    page = requests.get(url=search_url, proxies=self._proxy)
    parsed = BeautifulSoup(page.content, 'html.parser')
    return parsed.find_all("ol", class_="lr_dct_sf_sens")
该方法始终返回 [],即一个空列表。但是,如果我使用 Chrome 控制台,就可以找到这个元素:
所以我无法理解为什么在脚本里找不到它。为了测试,我将 requests.get
返回的全部内容写入了一个文件:
# Dump the raw response body to disk so it can be searched by hand.
# The context manager closes the file even if the write fails, and the
# explicit UTF-8 encoding avoids UnicodeEncodeError on Windows, where the
# default codec may not be able to represent every character in the page.
with open('C:\\Users\\me\\Desktop\\test.txt', 'w', encoding='utf-8') as file:
    file.write(response.text)
然后尝试用记事本在该文件中搜索,但在那里也找不到这个模式。不确定 response.text
是否会丢掉一些内容。
有没有人可以向我解释一下?我怎样才能得到这段文本?
答案 0 :(得分:2)
如果您在加载该网页时密切关注网络请求,则会看到Google会启动另一个包含您数据的链接。
请尝试在浏览器中访问此内容:
https://www.google.com/search?q=definition:+calcium&bav=on.2,or.r_cp.&cad=b&fp=1&biw=1920&bih=984&dpr=1&tch=1&ech=1&psi=1489578048971.3
它会下载一个包含快速回答框数据的文件。您可以在该文件中搜索 the chemical element of atomic number
来验证这一点。
您需要清理该文件,并从中抓取所需的数据。
答案 1 :(得分:0)
SerpApi完全支持Google直接答案框中的字典结果。例如:
"C:\Program Files\CMake\bin\cmake.exe" -SD:\git\MyProject\src -BD:\git\MyProject\bin --check-build-system CMakeFiles\Makefile.cmake 0
"C:\Program Files\CMake\bin\cmake.exe" -E cmake_progress_start D:\git\MyProject\bin\CMakeFiles D:\git\MyProject\bin\CMakeFiles\progress.marks
C:/MinGW/bin/mingw32-make.exe -f CMakeFiles\Makefile2 all
mingw32-make.exe[1]: Entering directory 'D:/git/MyProject/bin'
mingw32-make.exe[1]: Nothing to be done for 'all'.
mingw32-make.exe[1]: Leaving directory 'D:/git/MyProject/bin'
"C:\Program Files\CMake\bin\cmake.exe" -E cmake_progress_start D:\git\MyProject\bin\CMakeFiles 0 ```
一些有关字典结果的文档在这里:https://serpapi.com/direct-answer-box-api
答案 2 :(得分:0)
在我看来,最简单的方法是使用 SelectorGadget Chrome 扩展与
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from time import sleep
from datetime import datetime
import json
# Calendar page of the Teams web client; all navigation in this script goes here.
TEAMS_URL = 'https://teams.microsoft.com/_#/calendarv2'

# Participant bookkeeping shared by the meeting helpers through ``global``.
curParticipants = 0
minParticipants = 10

# Polling knobs for the wait_and_find_* helpers.
# Increase both if you have a slow internet connection.
sleepDelay = 50  # passed to sleep() before and between lookup attempts
timeOutDelay = 50  # default number of lookup attempts

opt = Options()
opt.add_argument("--disable-infobars")
opt.add_argument("start-maximized")
opt.add_argument("--disable-extensions")
# Content-setting values: 1 allows, 2 blocks.
opt.add_experimental_option(
    "prefs",
    {
        "profile.default_content_setting_values.media_stream_mic": 1,
        "profile.default_content_setting_values.media_stream_camera": 1,
        "profile.default_content_setting_values.notifications": 1,
    },
)

# Start Chrome with the driver binary fetched by webdriver_manager.
browser = webdriver.Chrome(ChromeDriverManager().install(), options=opt)
def wait_and_find_ele_by_id(html_id, timeout=timeOutDelay):
    """Poll the page until an element with the given HTML id can be located.

    Sleeps ``sleepDelay`` once up front, then makes up to ``timeout`` lookup
    attempts, sleeping ``sleepDelay`` again after every failed attempt.

    Args:
        html_id: Value of the ``id`` attribute to look for.
        timeout: Maximum number of lookup attempts (a count, not seconds).

    Returns:
        The located element, or ``None`` if every attempt failed.
    """
    # Local import keeps this helper self-contained.
    from selenium.webdriver.common.by import By

    sleep(sleepDelay)
    for _ in range(timeout):
        try:
            # find_element(By.ID, ...) replaces find_element_by_id, which was
            # removed in Selenium 4.3.
            return browser.find_element(By.ID, html_id)
        except Exception:
            # Unlike a bare ``except:``, this lets KeyboardInterrupt and
            # SystemExit propagate so the script can still be stopped.
            sleep(sleepDelay)
    return None
def wait_and_find_ele_by_link_text(text, timeout=timeOutDelay):
    """Poll the page until a link with exactly the given text can be located.

    Sleeps ``sleepDelay`` once up front, then makes up to ``timeout`` lookup
    attempts, sleeping ``sleepDelay`` again after every failed attempt.

    Args:
        text: Visible link text to look for.
        timeout: Maximum number of lookup attempts (a count, not seconds).

    Returns:
        The located element, or ``None`` if every attempt failed.
    """
    # Local import keeps this helper self-contained.
    from selenium.webdriver.common.by import By

    sleep(sleepDelay)
    for _ in range(timeout):
        try:
            # find_element(By.LINK_TEXT, ...) replaces find_element_by_link_text,
            # which was removed in Selenium 4.3.
            return browser.find_element(By.LINK_TEXT, text)
        except Exception:
            # Unlike a bare ``except:``, this lets KeyboardInterrupt and
            # SystemExit propagate so the script can still be stopped.
            sleep(sleepDelay)
    return None
def wait_and_find_element_by_xpath(xpath, timeout=timeOutDelay):
    """Poll the page until a single element matching ``xpath`` can be located.

    Sleeps ``sleepDelay`` once up front, then makes up to ``timeout`` lookup
    attempts, sleeping ``sleepDelay`` again after every failed attempt.

    Args:
        xpath: XPath expression identifying the element.
        timeout: Maximum number of lookup attempts (a count, not seconds).

    Returns:
        The located element, or ``None`` if every attempt failed. Callers
        rely on the ``None`` result to detect an absent element.
    """
    # Local import keeps this helper self-contained.
    from selenium.webdriver.common.by import By

    sleep(sleepDelay)
    for _ in range(timeout):
        try:
            # find_element(By.XPATH, ...) replaces find_element_by_xpath,
            # which was removed in Selenium 4.3.
            return browser.find_element(By.XPATH, xpath)
        except Exception:
            # Unlike a bare ``except:``, this lets KeyboardInterrupt and
            # SystemExit propagate so the script can still be stopped.
            sleep(sleepDelay)
    return None
def wait_and_find_elements_by_xpath(xpath, timeout=timeOutDelay):
    """Look up all elements matching ``xpath``, retrying while the lookup raises.

    Sleeps ``sleepDelay`` once up front, then makes up to ``timeout`` lookup
    attempts, sleeping ``sleepDelay`` again after every attempt that raised.
    The first lookup that does not raise is returned as-is.

    Args:
        xpath: XPath expression identifying the elements.
        timeout: Maximum number of lookup attempts (a count, not seconds).

    Returns:
        Whatever ``find_elements`` returned on the first successful attempt,
        or ``None`` if every attempt raised.
    """
    # Local import keeps this helper self-contained.
    from selenium.webdriver.common.by import By

    sleep(sleepDelay)
    for _ in range(timeout):
        try:
            # find_elements(By.XPATH, ...) replaces find_elements_by_xpath,
            # which was removed in Selenium 4.3.
            return browser.find_elements(By.XPATH, xpath)
        except Exception:
            # Unlike a bare ``except:``, this lets KeyboardInterrupt and
            # SystemExit propagate so the script can still be stopped.
            sleep(sleepDelay)
    return None
def check_and_join_meeting():
    """Join the last listed meeting with camera and microphone off.

    Looks for "Join" buttons on the current page and returns immediately if
    there are none. Otherwise it clicks the last one, switches off any
    pre-join toggle that is currently pressed, clicks "Join now", waits five
    minutes, opens the roster and stores the displayed participant count in
    the global ``curParticipants``.
    """
    global curParticipants

    join_buttons = wait_and_find_elements_by_xpath('//button[.="Join"]', 3)
    if len(join_buttons) == 0:  # no meeting scheduled
        return
    # Join the latest meeting scheduled, i.e. if join buttons for 9 and 10 A.M.
    # are available, the 10 A.M. one is used.
    join_buttons[-1].click()

    camera_xpath = (
        '//*[@id="page-content-wrapper"]/div[1]/div/calling-pre-join-screen/div/div/div[2]/div[1]/div['
        '2]/div/div/section/div[2]/toggle-button[1]/div/button')
    microphone_xpath = '//*[@id="preJoinAudioButton"]/div/button'
    # Camera first, then microphone: click a toggle only if it is pressed (on).
    for toggle_xpath in (camera_xpath, microphone_xpath):
        toggle = wait_and_find_element_by_xpath(toggle_xpath)
        if toggle.get_attribute('aria-pressed') == 'true':
            toggle.click()

    wait_and_find_element_by_xpath('//button[.="Join now"]').click()  # join meeting
    print('Joined the meeting at {}'.format(datetime.now()))

    sleep(60 * 5)
    browser.execute_script("document.getElementById('roster-button').click()")
    sleep(sleepDelay)

    counters = wait_and_find_elements_by_xpath(
        '//span[@class="toggle-number"][@ng-if="::ctrl.enableRosterParticipantsLimit"]')
    if len(counters) < 2:
        return

    # NOTE(review): the source's indentation was lost; the else branch below is
    # assumed to pair with the empty-text check rather than the length check.
    # The [1:-1] slice drops the first and last character of the counter text.
    count_text = counters[1].text[1:-1]
    if count_text != '':
        curParticipants = int(count_text)
    else:
        browser.execute_script("document.getElementById('roster-button').click()")
def check_and_end_or_leave_or_join_meeting():
    """Leave the current meeting when it has thinned out, or try to join one.

    If no hang-up button is found, the participant count is reset, the
    calendar page is reloaded and ``check_and_join_meeting`` is called.
    If a hang-up button is found, the participant count is refreshed from the
    roster and the meeting is left once the count is non-zero and no greater
    than ``minParticipants``.
    """
    global curParticipants, minParticipants

    hangup_btn = wait_and_find_element_by_xpath('//button[@id="hangup-button"]', 2)
    if hangup_btn is None:
        # Not in a meeting: go back to the calendar and look for one to join.
        curParticipants = 0
        browser.get(TEAMS_URL)
        browser.refresh()
        sleep(5)
        check_and_join_meeting()
        return

    # Currently in a meeting: refresh the participant count from the roster.
    counters = wait_and_find_elements_by_xpath(
        '//span[@class="toggle-number"][@ng-if="::ctrl.enableRosterParticipantsLimit"]')
    if len(counters) >= 2:
        # NOTE(review): the source's indentation was lost; the else branch is
        # assumed to pair with the empty-text check rather than the length check.
        count_text = counters[1].text[1:-1]
        if count_text != '':
            curParticipants = int(count_text)
        else:
            browser.execute_script("document.getElementById('roster-button').click()")

    # Stay in the meeting unless the leave condition holds.
    if not (curParticipants <= minParticipants and curParticipants != 0):
        return

    browser.execute_script("document.getElementById('hangup-button').click()")
    print('Left meeting at {}'.format(datetime.now()))
    browser.get(TEAMS_URL)  # open calendar tab
    browser.refresh()
    sleep(5)
def init():
    """Sign in to Teams, switch the calendar to day view and try to join a meeting.

    Reads ``config.json`` for the ``minimumParticipants``, ``username`` and
    ``password`` keys, stores the first in the global ``minParticipants`` and
    types the other two into the sign-in form.
    """
    global minParticipants

    browser.get(TEAMS_URL)  # open calendar tab in teams
    sleep(sleepDelay)

    with open('config.json') as config_file:
        config = json.load(config_file)
    minParticipants = config['minimumParticipants']

    # Sign-in form: username -> next -> password -> next -> stay signed in.
    wait_and_find_ele_by_id('i0116').send_keys(config['username'])
    wait_and_find_ele_by_id('idSIButton9').click()
    wait_and_find_ele_by_id('i0118').send_keys(config['password'])
    wait_and_find_ele_by_id('idSIButton9').click()
    wait_and_find_ele_by_id('idSIButton9').click()

    # The "use the web app" link is clicked only if it was found.
    web_app_link = wait_and_find_ele_by_link_text('Use the web app instead', 5)
    if web_app_link is not None:
        web_app_link.click()

    view_switch_xpath = '//button[@title="Switch your calendar view"]'
    # Wait for the calendar tab to load completely.
    while wait_and_find_element_by_xpath(view_switch_xpath) is None:
        sleep(5)
    # Change the calendar view to day view: open the view switcher and pick
    # "Day" until the switcher's name attribute reads "Day".
    while wait_and_find_element_by_xpath(view_switch_xpath).get_attribute('name') != "Day":
        wait_and_find_element_by_xpath(view_switch_xpath).click()
        wait_and_find_element_by_xpath('//button[@name="Day"]').click()

    print('Initialized Successfully at {}'.format(datetime.now()))
    check_and_join_meeting()
def main():
    """Sign in, then supervise meetings forever.

    ``init`` is retried until it succeeds. After that,
    ``check_and_end_or_leave_or_join_meeting`` is called in an endless loop:
    a successful pass is followed by a 10 second pause, a failed pass by a
    reload of the calendar page.
    """
    # Retry sign-in with a loop instead of recursion, so repeated failures
    # cannot exhaust the interpreter's recursion limit.
    while True:
        try:
            init()
        except Exception:
            # ``Exception`` rather than a bare ``except:`` so that Ctrl+C
            # (KeyboardInterrupt) and SystemExit stop the script instead of
            # triggering another retry.
            print('init failed, trying again')
        else:
            break

    while True:
        try:
            check_and_end_or_leave_or_join_meeting()
        except Exception:
            print('join meeting failed, trying again')
            browser.get(TEAMS_URL)  # open calendar tab in teams
        else:
            sleep(10)


if __name__ == "__main__":
    main()
select() 或 select_one() 等 BeautifulSoup 方法一起使用。
另外,问题可能是您没有指定 user-agent。
user-agent 用于伪造真实用户访问,因此 Google(或其他网站)不会阻止请求。
User-agent
或者,您可以使用 SerpApi 的 Google Direct Answer Box API 做同样的事情,只不过您不必弄清楚如何获取某些 HTML 元素。这是一个付费 API,可免费试用 5,000 次搜索。
要集成的代码:
from bs4 import BeautifulSoup
import requests, lxml
# Send a browser-like User-agent so the request looks like a real user visit.
headers = {
    'User-agent':
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}

# Without a timeout, requests.get can block forever if the server never answers.
html = requests.get('https://www.google.de/search?q=definition%20calcium', headers=headers, timeout=30)
# Fail loudly on an HTTP error status instead of parsing an error page.
html.raise_for_status()
soup = BeautifulSoup(html.text, 'lxml')

# NOTE(review): these class names come from Google's markup at the time of
# writing; select_one returns None when a selector no longer matches, in
# which case the ``.text`` access below raises AttributeError.
syllables = soup.select_one('.frCXef span').text
phonetic = soup.select_one('.g30o5d span span').text
noun = soup.select_one('.h3TRxf span').text
print(f'{syllables}\n{phonetic}\n{noun}')
# Output:
'''
cal·ci·um
ˈkalsēəm
the chemical element of atomic number 20, a soft gray metal.
'''
> 免责声明:我为 SerpApi 工作。