我正在尝试抓取一个动态加载的网页。页面的动态加载本身工作正常,但是当我用 BeautifulSoup 解析页面源代码时,源代码的长度似乎并没有随着每次迭代而变大。在加载了页面的第二部分之后,我预期源代码的长度大约是原来的两倍。有人能提示一下我哪里做错了吗?while 循环中的 BeautifulSoup 调用和 print 语句当然只是用于测试,目的是帮助我了解实际发生了什么。
def getlinks(self, hashtag, url):
browser = webdriver.Safari()
browser.get(url)
last_height = browser.execute_script("return document.body.scrollHeight")
rows_list = []
screen_scroll_count = 0
scrollNumber = 3
while True:
# scroll down to the bottom
browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
screen_scroll_count += 1
time.sleep(2)
# Calculate new scroll height
new_height = browser.execute_script("return document.body.scrollHeight")
if new_height == last_height:
break
last_height = new_height
if screen_scroll_count >= scrollNumber:
break
source = browser.page_source
soup = bs(source, 'html.parser')
body = soup.find('body')
script = body.find('script')
print('source length = ' + str(len(str(source))))
print('soup length = ' + str(len(str(soup))))
print('body length = ' + str(len(str(body))))
print('script length = ' + str(len(str(script))))
source = browser.page_source
browser.quit()
soup = bs(source, 'html.parser')
body = soup.find('body')
script = body.find('script')
page_json = script.text.split(' = ', 1)[1].rstrip(';')
data = json.loads(page_json)
print('source length = ' + str(len(str(source))))
print('soup length = ' + str(len(str(soup))))
print('body length = ' + str(len(str(body))))
print('script length = ' + str(len(str(script))))
for i, post in enumerate(data['entry_data']['TagPage'][0]['graphql']['hashtag']['edge_hashtag_to_media']['edges']):
meta = {}
try:
meta['src'] = post['node']['thumbnail_resources'][1]['src'].replace(';', '')