import requests
import selenium
from selenium import webdriver
from bs4 import BeautifulSoup
# Scrape the comments shown on the first page of a Megabox movie detail page
# and print them, numbered from 1.

# Drive a real Firefox instance and work from the page source it reports.
browser = webdriver.Firefox()
try:
    browser.get('http://www.megabox.co.kr/?show=detail&rtnShowMovieCode=013491')
    html = browser.page_source
finally:
    # Always shut the browser down, even if loading the page fails;
    # otherwise the Firefox process is left running.
    browser.quit()

soup = BeautifulSoup(html, 'html.parser')

# Collect every element whose class is "comment", whatever its tag name.
comment = soup.find_all(attrs={'class': 'comment'})

# Print each comment's text with a 1-based, right-aligned index.
for i, t in enumerate(comment, 1):
    print('%2d: %s' % (i, t.text))
http://www.megabox.co.kr/?show=detail&rtnShowMovieCode=013491
我想抓取第1页、第2页、第3页……等所有分页中的全部评论,但我不知道该怎么做。你能解释一下吗?
答案 0 :(得分:0)
我对 Python 还很陌生,所以代码中可能存在错误,但这是我的尝试。我已尽可能在代码注释中添加了解释。
import requests
from selenium import webdriver
from bs4 import BeautifulSoup
# Scrape the comments from every page of a Megabox movie detail page by
# clicking through the pager, then print them all, numbered from 1.

browser = webdriver.Chrome()
try:
    browser.get('http://www.megabox.co.kr/?show=detail&rtnShowMovieCode=013491')

    # First page: parse the page source and collect every element whose
    # class is "comment", whatever its tag name.
    soup = BeautifulSoup(browser.page_source, 'html.parser')
    comment = soup.find_all(attrs={'class': 'comment'})

    # The "last page" pager link; the digits inside its onclick attribute
    # are taken to be the number of the last page.
    lastpage = soup.find('a', {'class': 'img_btn customer last'})
    onclick = lastpage.get('onclick') if lastpage is not None else None
    digits = ''.join(filter(str.isdigit, onclick or ''))
    # No pager link (or no digits in it): assume only the first page exists
    # instead of crashing on None.
    last = int(digits) if digits else 1

    # Page 1 is already scraped, so start at 2. The upper bound is last + 1
    # because range() excludes its end value and the last page must be
    # visited too.
    for i in range(2, last + 1):
        try:
            # Click the pager element whose id equals the page number.
            # find_element('xpath', ...) is used because the
            # find_element_by_xpath helper was removed in Selenium 4.
            browser.find_element('xpath', '//*[@id=' + str(i) + ']').click()
        except Exception:
            # The numbered button is not available (presumably because the
            # page number is outside the currently shown group of ten), so
            # fall back to the "next 10 pages" button. If this click fails
            # as well, the error propagates and stops the scrape.
            browser.find_element('xpath', '//*[@title="다음 10페이지 보기"]').click()

        # NOTE(review): page_source is read immediately after click(); if the
        # comment list is refreshed asynchronously an explicit wait may be
        # needed here -- confirm against the live site.
        soup = BeautifulSoup(browser.page_source, 'html.parser')
        comment.extend(soup.find_all(attrs={'class': 'comment'}))

        # Lightweight progress indicator; only reached when the page was
        # actually scraped.
        print('Page ' + str(i) + ' scraped!')
finally:
    # Always shut the browser down so the Chrome process is not left running.
    browser.quit()

# Print each collected comment's text with a 1-based, right-aligned index.
for i, t in enumerate(comment, 1):
    print('%2d: %s' % (i, t.text))
我真诚地希望这对你有所帮助。