我在抓Twitter网站。 (API不用于检索历史数据。)。较小的范围会更快出现,但如果你的范围更大,你将不得不花费大量的时间。
此外,如果连接在很长一段时间后停止,则会出现错误。
所以我的疑问是,有没有办法加快网络抓取与Selenium?或者我需要修改我的代码吗?
#python3
"""Scrape tweet texts from a Twitter search-results page with Selenium + PhantomJS.

Scrolls the page many times to trigger infinite-scroll loading, then writes
each tweet's text (as a one-key dict) to 140416.txt, one per line.
"""
import requests
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup

# Raw string so the Windows path cannot be mangled by backslash escapes
# (the original mixed '\' and '/'; the runtime path is unchanged).
browser = webdriver.PhantomJS(r'C:\phantomjs-2.1.1-windows/bin/phantomjs')
# Search query: keyword %EC%84%B8%EC%9B%94%ED%98%B8, 2014-04-16..2014-04-17, Korean UI.
url = u'https://twitter.com/search?f=tweets&vertical=default&q=%EC%84%B8%EC%9B%94%ED%98%B8%20since%3A2014-04-16%20until%3A2014-04-17&src=typd&lang=ko'
browser.get(url)
time.sleep(1)

# Scroll to the bottom repeatedly so Twitter's infinite scroll keeps
# loading older tweets. NOTE(review): a fixed iteration count neither
# guarantees every tweet loads nor stops early once loading has ended.
for _ in range(10000):
    browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(0.1)

tweets = browser.find_elements_by_class_name('tweet-text')

# 'with' guarantees the file is closed even if reading a tweet raises
# (the original left the handle open on any error). A fresh dict is built
# per tweet instead of mutating one shared dict; the dead manual counter
# (i = 1 / i += 1) that enumerate already superseded is removed.
with open("140416.txt", mode='w', encoding='utf8') as wfile:
    for i, tweet in enumerate(tweets):
        data = {'text': tweet.text}
        print(i, ":", data)
        wfile.write(str(data) + '\n')
答案 0（得分：0）
1。有很多数据。根据我的脚本,有8509条推文。所以它需要~425次向下滚动。每次向下滚动可能需要0.8-5秒。因此,向下滚动大约需要14-20分钟。
2。重复向下滚动10,000次并不能保证它会加载所有推文。向下滚动后你最好计算元素,检查它是否增加。
"""Scrape tweet texts with Selenium + PhantomJS, scrolling until no new tweets load.

Instead of a fixed number of scrolls, this keeps scrolling while the on-page
tweet count grows, and stops after 5 consecutive scrolls produce no new
tweets. Results go to 140416.txt, one dict-per-line; timing stats are printed.
"""
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.common.exceptions import TimeoutException
import time

# Spoof a desktop browser user agent so the site serves the full page to PhantomJS.
dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap["phantomjs.page.settings.userAgent"] = (
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53 "
    "(KHTML, like Gecko) Chrome/15.0.87"
)
browser = webdriver.PhantomJS(desired_capabilities=dcap, executable_path='path')
url = u'https://twitter.com/search?f=tweets&vertical=default&q=%EC%84%B8%EC%9B%94%ED%98%B8%20since%3A2014-04-16%20until%3A2014-04-17&src=typd&lang=ko'
browser.get(url)
time.sleep(3)

# Wait until at least one tweet is rendered before counting. Catch only
# the expected timeout (the original bare `except:` swallowed everything,
# including KeyboardInterrupt).
try:
    WebDriverWait(browser, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, ".tweet-text"))
    )
except TimeoutException:
    print("Can't find tweet text.")

# Hoist the duplicated counting script; it was inlined twice in the original.
COUNT_JS = "return document.querySelectorAll('.tweet-text').length"

tweetCount = int(browser.execute_script(COUNT_JS))
retryCount = 0
startTime = time.time()
sleepTime = 1

# Scroll, then re-count. Stop after 5 consecutive scrolls that load no new
# tweets (a plain `break` replaces the `ended` flag; the unused timeX/timeY
# locals and the dead post-loop `retryCount = 0` are removed).
while True:
    browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(sleepTime)
    newCount = int(browser.execute_script(COUNT_JS))
    if newCount > tweetCount:
        tweetCount = newCount
        retryCount = 0
        print("Count: " + str(tweetCount))
    else:
        retryCount += 1
        if retryCount >= 5:
            break
endScrollDownTime = time.time()

tweets = browser.find_elements_by_css_selector('.tweet-text')

# 'with' closes the file even on error; build a fresh dict per tweet rather
# than mutating one shared dict, and drop the dead i = 1 / i += 1 counter.
with open("140416.txt", mode='w', encoding='utf8') as wfile:
    for tweet in tweets:
        wfile.write(str({'text': tweet.text}) + '\n')
endGetTextTime = time.time()

# Timing summary: scroll phase, text-extraction phase, and total.
time1 = endScrollDownTime - startTime
time2 = endGetTextTime - endScrollDownTime
overAll = endGetTextTime - startTime
print('\n\nLength: ' + str(len(tweets)) + '\nScrolldown Time:' + str(time1) + '\nGetText Time: ' + str(time2) + '\nOverall: ' + str(overAll))
browser.quit()