我之前就有这个疑问，所以决定进行测试。
我正在从网站 https://www.bettingtips1x2.com/ 抓取一行比赛数据。
我的两种方法的脚本如下
def GetSoupData(nodo):
    """Parse one match row with BeautifulSoup and print league, time, match and odds.

    Grabs the row's outerHTML once from the WebDriver, then does all cell
    classification locally in the parsed soup.

    Args:
        nodo: Selenium WebElement wrapping a single match row (a <tr>).

    Prints:
        liga (league), hora (kick-off time), partido (match name), odds (list of str).
    """
    # NOTE(review): `boton` is never used below; this extra WebDriver round trip
    # only slows the function down in the benchmark — confirm it can be dropped.
    boton = nodo.find_element_by_xpath('.//a[contains(@href,"betting-tips")]')
    source = nodo.get_attribute('outerHTML')
    soup = BeautifulSoup(source, "lxml")
    liga, partido, hora = None, None, None
    odds = []
    for row in soup.find_all('td'):
        if not row.attrs:
            # A bare <td> holds either the match link or the kick-off time.
            # Bug fix: the original wrapped `partido = row.get_text()` in a
            # try/except whose raising probe was commented out, so the except
            # branch was dead and `hora` was never assigned. Discriminate
            # explicitly by the presence of an <a> child instead.
            if row.find('a') is not None:
                partido = row.get_text()
            else:
                hora = row.get_text()
        elif 'class' in row.attrs and 'align' not in row.attrs:
            # League cell: has a class attribute but no align attribute.
            liga = row.get_text()
        elif 'align' in row.attrs and 'class' in row.attrs:
            # Odds cells: carry both align and class attributes.
            # get_text() cannot raise here, so no try/except is needed.
            odds.append(row.get_text())
    print(liga, hora, partido, odds)
def GetScrap(nodo):
    """Scrape league, time, match and odds via direct WebDriver lookups and print them.

    Each value is fetched with its own XPath query against *nodo* (a Selenium
    WebElement for one match row), so every field costs a WebDriver round trip.
    """
    match_link = nodo.find_element_by_xpath('.//a[contains(@href,"betting-tips")]')
    partido = match_link.text
    # League: the <td> that has a class attribute but no align attribute.
    liga = nodo.find_element_by_xpath('.//td[@class and not(@align)]').text
    # Kick-off time: the <td> with no attributes and no child elements.
    hora = nodo.find_element_by_xpath('.//td[not(@*) and not(.//*)]').text
    # Odds: every <td> carrying both align and class attributes.
    odds = [cell.text for cell in nodo.find_elements_by_xpath('.//td[@align and @class]')]
    print(liga, hora, partido, odds)
主要脚本如下
import Log
import time
from Inicializador import IniciaWeb
from NextNode import GetNext
from Data import GetSoupData, GetScrap
# Benchmark: time the node lookup, then each of the two scraping strategies
# on the same row, and report the three deltas.
Log.init()
browser = IniciaWeb()

t_start = time.time()
nodo = GetNext(browser)          # locate the next match row
t_after_next = time.time()
GetSoupData(nodo)                # strategy 1: one outerHTML fetch + BeautifulSoup
t_after_soup = time.time()
GetScrap(nodo)                   # strategy 2: per-field WebDriver XPath lookups
t_after_scrap = time.time()

browser.quit()

print('b-a', t_after_next - t_start)
print('c-b', t_after_soup - t_after_next)
print('d-c', t_after_scrap - t_after_soup)
我得到的是
>b-a 0.014959096908569336
>c-b 0.017953872680664062
>d-c 0.12560749053955078
这意味着使用 BeautifulSoup 大约比直接用 Selenium 逐项抓取节点快 10 倍。实际上，这让我感到惊讶，因为我原以为中间环节越少越好。我在测试中遗漏了什么吗？可以进一步优化吗？