我想抓取桑坦德网站。但是我使用的代码无法正常工作,因为当我运行两次时,我得到的金额会有所不同。
我当前的代码运行不正常。
抓取本身似乎可以运行,只是得到的结果不正确。而且,当我连续两次运行代码时,结果还会发生变化。
我的代码:
def hw_santander_scrap(Amount, Duration):
    """Scrape Santander's public loan simulator for amount/duration pairs.

    For every amount inside [3.000, 90.000] and every duration inside
    [12, 96] the simulator form is filled in and the monthly repayment,
    TIN, TAE and total repayment shown on the page are recorded. The rows
    are written to ``Santander_<day>_<month>_<year>.csv`` in the current
    working directory.

    Args:
        Amount: Sequence of amounts written in thousands, e.g. ``13.000``
            for 13000. Each value is typed into the form as ``"13.000"``
            and stored in the output as ``13000.0``.
        Duration: Sequence of loan terms; each one is converted with
            ``int()`` before use. Values outside [12, 96] are skipped.

    Returns:
        The ``pandas.DataFrame`` that was written to the CSV file.
    """
    # Function-scope imports: the snippet uses these names without
    # importing them at file level.
    from datetime import datetime as DT

    import pandas as pd
    from selenium import webdriver
    from selenium.webdriver.common.action_chains import ActionChains
    from selenium.webdriver.common.by import By
    from selenium.webdriver.common.keys import Keys
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import WebDriverWait

    url = 'https://simuladores.bancosantander.es/SantanderES/loansimulatorweb.aspx?por=webpublica&prv=publico&m=300&cta=1&ls=0#/t0'
    columns = ['Project', 'Period', 'Monthly repayment', 'TIN', 'TAE',
               'Total repayment', 'Initial amount', 'Duration']
    Max_amount = 90.000
    Min_amount = 3.000
    Max_duration = 96
    Min_duration = 12

    def euros_to_float(text):
        """Turn a Spanish-formatted amount such as '1.135,34 €' into a float."""
        return float(text.replace(' €', '').replace('.', '').replace(',', '.'))

    def ajax_idle(drv):
        """Return True once the page reports no pending jQuery requests."""
        return drv.execute_script('return jQuery.active') == 0

    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument('--start-maximized')
    chrome_options.add_argument('window-size=10000x5000')
    # `options=` is accepted by Selenium 3.8+ and 4.x; chromedriver is
    # looked up on PATH, as the original 'chromedriver' argument did.
    driver = webdriver.Chrome(options=chrome_options)

    maintenant = DT.now()
    period = str(maintenant.day) + '_' + str(maintenant.month) + '_' + str(maintenant.year)
    period_label = str(maintenant.day) + '/' + str(maintenant.month) + '/' + str(maintenant.year)
    print('Start Scraping')

    # Rows are collected in a plain list; DataFrame.append was removed in
    # pandas 2.0 and the reused placeholder frame made stale rows possible.
    rows = []
    try:
        driver.get(url)
        for Simulated_amount in Amount:
            if Simulated_amount > Max_amount or Simulated_amount < Min_amount:
                continue
            amount = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "#amount")))
            amount.clear()
            amount.send_keys("{:.3f}".format(Simulated_amount))
            WebDriverWait(driver, 30).until(ajax_idle)

            for raw_duration in Duration:
                Simulated_duration = int(raw_duration)
                if Simulated_duration > Max_duration or Simulated_duration < Min_duration:
                    continue
                term = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, "#term")))
                term.clear()
                term.send_keys("{}".format(Simulated_duration))
                term.send_keys(Keys.TAB)
                # Clicking the slider handle moves focus off the term field.
                perform_term = driver.find_element(
                    By.XPATH,
                    "//span[@class='ui-slider-handle ui-state-default ui-corner-all']")
                ActionChains(driver).click(perform_term).perform()
                # Read the figures only after pending requests have finished.
                WebDriverWait(driver, 10).until(ajax_idle)
                driver.save_screenshot('screenshot_santander.png')

                # TIN and TAE are sliced out of the same '.r3 span' text.
                rates = driver.find_element(By.CSS_SELECTOR, '.r3 span').text
                rows.append({
                    'Project': "reforma vivienda",
                    'Period': period_label,
                    # Parsed like 'Total repayment' so that a value such as
                    # '1.135,34 €' becomes 1135.34 instead of '1.135.34'.
                    'Monthly repayment': euros_to_float(
                        driver.find_element(By.CSS_SELECTOR, '.r1 span').text),
                    'TIN': float(rates[6:10].replace(',', '.')),
                    'TAE': float(rates[13:17].replace(',', '.')),
                    'Total repayment': euros_to_float(
                        driver.find_element(By.CSS_SELECTOR, '.r7 span').text),
                    'Initial amount': float("{:.3f}".format(Simulated_amount).replace('.', '')),
                    'Duration': Simulated_duration,
                })
    finally:
        # Always release the headless Chrome process, even on failure.
        driver.quit()

    Santander = pd.DataFrame(rows, columns=columns)
    # Kept from the original: rows whose TIN is 0 are never written out.
    Santander = Santander.loc[Santander.TIN != 0, :]
    Santander.to_csv('Santander_{}.csv'.format(period), index=False)
    print('End Scraping')
    return Santander
To run the code:
# Amounts are written in thousands: hw_santander_scrap formats each value
# with three decimals (13.000 -> "13.000") before typing it into the form.
Amount = [13.000, 14.000, 15.000, 30.000, 45.000, 60.000]
# Loan terms to simulate; the function only keeps values between 12 and 96.
Duration = [12, 15, 24, 36, 48, 60, 72, 84, 96]
hw_santander_scrap(Amount, Duration)
答案 0 :(得分:4)
以下是使用 requests 的建议:
import requests
from bs4 import BeautifulSoup
import re
import json
import pandas as pd
# Collect the auth values that the simulator page embeds in its
# Afi.AfiAuth.Init(...) call; they are sent back as request headers.
r = requests.Session()
response = r.get("https://simuladores.bancosantander.es/SantanderES/loansimulatorweb.aspx?por=webpublica&prv=publico&m=300&cta=1&ls=0#/t0")
# Fail early with the HTTP error instead of a later IndexError.
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html')
key = soup.find_all('script', text=re.compile('Afi.AfiAuth.Init'))
# Dots are escaped so only the literal call name matches.
pattern = r"Afi\.AfiAuth\.Init\((.*?)\)"
match = re.search(pattern, key[0].text) if key else None
if match is None:
    raise RuntimeError("Afi.AfiAuth.Init(...) call not found in the simulator page")
# Split the argument list once; drop the quotes and any padding around
# each argument so the values are valid header contents.
init_args = [arg.replace('\'', '').strip() for arg in match.group(1).split(',')]
WSSignature = init_args[-1]
WSDateTime = init_args[1]
headers = {
    'Origin': 'https://simuladores.bancosantander.es',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
    'Content-Type': 'application/json;charset=UTF-8',
    'Accept': 'application/json, text/plain, */*',
    'WSSignature': WSSignature,
    'Referer': 'https://simuladores.bancosantander.es/SantanderES/loansimulatorweb.aspx?por=webpublica&prv=publico&m=300&cta=1&ls=0',
    'WSDateTime': WSDateTime,
    'WSClientCode': 'SantanderES',
}
# Default payload for one Calculate request ('Renueva tu hogar', product
# p300). scrap() sets 'capitalOrInstallment' and 'monthsTerm' per call.
# NOTE(review): 'loanDate' and 'birthDate' are fixed dates from when this
# was written; confirm whether the service expects current values.
params = {
    'wsInputs': {
        'finality': 'Renueva tu hogar',
        'productCode': 'p300',
        'capitalOrInstallment': 12000,
        'monthsTerm': 96,
        'mothsInitialTerm': 0,
        'openingCommission': 1.5,
        'minOpeningCommission': 0,
        'financeOpeningCommission': True,
        'interestRate': 0,
        'interestRateReferenceIndex': 0,
        'interestRateSecondaryReferenceIndex': 0,
        'interestRateSecondaryWithoutVinculation': 5.95,
        'interestRateSecondaryWithAllVinculation': 0,
        'interestRateSecondary': 5.95,
        'loanDate': '2019-06-12',
        'birthDate': '2001-06-12',
        'financeLoanProtectionInsurance': True,
        'percentageNotaryCosts': 0.003,
        'loanCalculationMethod': 0,
        'calculationBase': 4,
        'frecuencyAmortization': 12,
        'frecuencyInterestPay': 12,
        'calendarConvention': 0,
        'taeCalculationBaseType': 4,
        'lackMode': 0,
        'amortizationCarencyMonths': 0,
        'typeAmortization': 1,
        'insuranceCostSinglePremium': 0,
        'with123': False,
        'electricVehicle': False,
    },
}
# The scraping function
def scrap(amount, duration, params):
    """Request one loan simulation and return the service's result.

    Args:
        amount: Value sent as ``wsInputs.capitalOrInstallment``.
        duration: Value sent as ``wsInputs.monthsTerm``.
        params: Payload template with a ``'wsInputs'`` dict. It is copied,
            so the caller's dictionary is left untouched.

    Returns:
        The ``'d'`` member of the JSON response.

    Raises:
        requests.HTTPError: If the service answers with an error status.
    """
    # Work on copies: the same template is shared by every call, and
    # mutating it in place would leak one call's values into the next.
    ws_inputs = dict(params['wsInputs'])
    ws_inputs['capitalOrInstallment'] = amount
    ws_inputs['monthsTerm'] = duration
    payload = dict(params)
    payload['wsInputs'] = ws_inputs
    response = r.post('https://simuladores.bancosantander.es/WS/WSSantanderTotalLoan.asmx/Calculate', headers=headers, data=json.dumps(payload))
    response.raise_for_status()
    return json.loads(response.content)['d']
# Simulate every amount/duration combination and tabulate the answers.
Amounts = [13000, 14000, 15000, 30000, 45000, 60000]
Durations = [12, 15, 24, 36, 48, 60, 72, 84, 96]

results = []
for amount in Amounts:
    for duration in Durations:
        result = scrap(amount, duration, params)
        # Tag each answer with the inputs that produced it.
        result.update({'Amount': amount, 'Duration': duration})
        results.append(result)

df = pd.DataFrame(results)
以下是您可以直接获取的信息示例:
{ ....
'TotalCosts': 0,
'CapitalOrInstallment': 1135.3433231340491,
'Disclaimer': '',
'Capital': 13195,
'ThereIsLackAtFirstStep': False,
'ThereIsLackAtSecondStep': False,
'InstallmentFirstStepLackPeriod': 1135.3433231340491,
'InstallmentSecondStepLackPeriod': 1135.3433231340491,
'Installment': 1135.3433231340491,
'InstallmentFirstStep': 1135.3433231340491,
'InstallmentSecondStep': 1135.3433231340491,
'CommissionOpeningCosts': 195.00000000000003,
'TAE': 9.1,
'TAEWithoutVinculation': 9.1,
....
}
编辑1:
添加了获取两个重要变量 WSSignature 和 WSDateTime 并将其插入请求标头的代码。
编辑2: 参数可以适应不同类型的贷款:
# Vehicle loans ('Vehículo con hasta 36 meses de antigüedad', product p100).
params_coaches = {
    'wsInputs': {
        'finality': 'Vehículo con hasta 36 meses de antigüedad',
        'productCode': 'p100',
        'capitalOrInstallment': 5000,
        'monthsTerm': 96,
        'mothsInitialTerm': 12,
        'openingCommission': 1.5,
        'minOpeningCommission': 60,
        'financeOpeningCommission': True,
        'interestRate': 5.5,
        'interestRateReferenceIndex': 0,
        'interestRateSecondaryReferenceIndex': 0,
        'interestRateSecondaryWithoutVinculation': 6.5,
        'interestRateSecondaryWithAllVinculation': 0,
        'interestRateSecondary': 6.5,
        'loanDate': '2019-06-13',
        'birthDate': '2001-06-13',
        'financeLoanProtectionInsurance': True,
        'percentageNotaryCosts': 0.003,
        'loanCalculationMethod': 0,
        'calculationBase': 4,
        'frecuencyAmortization': 12,
        'frecuencyInterestPay': 12,
        'calendarConvention': 0,
        'taeCalculationBaseType': 4,
        'lackMode': 0,
        'amortizationCarencyMonths': 0,
        'typeAmortization': 1,
        'insuranceCostSinglePremium': 0,
        'with123': False,
        'electricVehicle': False,
    },
}
# Personal-project loans ('Tus proyectos', product p200).
params_proyectos = {
    'wsInputs': {
        'finality': 'Tus proyectos',
        'productCode': 'p200',
        'capitalOrInstallment': 6000,
        'monthsTerm': 96,
        'mothsInitialTerm': 0,
        'openingCommission': 1.5,
        'minOpeningCommission': 60,
        'financeOpeningCommission': True,
        'interestRate': 0,
        'interestRateReferenceIndex': 0,
        'interestRateSecondaryReferenceIndex': 0,
        'interestRateSecondaryWithoutVinculation': 5.95,
        'interestRateSecondaryWithAllVinculation': 0,
        'interestRateSecondary': 5.95,
        'loanDate': '2019-06-13',
        'birthDate': '2001-06-13',
        'financeLoanProtectionInsurance': True,
        'percentageNotaryCosts': 0.003,
        'loanCalculationMethod': 0,
        'calculationBase': 4,
        'frecuencyAmortization': 12,
        'frecuencyInterestPay': 12,
        'calendarConvention': 0,
        'taeCalculationBaseType': 4,
        'lackMode': 0,
        'amortizationCarencyMonths': 0,
        'typeAmortization': 1,
        'insuranceCostSinglePremium': 0,
        'with123': False,
        'electricVehicle': False,
    },
}
调用该函数时,只需传入对应的参数字典即可。之前的调用方式是下面第一行,也可以换成另外两种:
# Home-renovation payload ('Renueva tu hogar'):
result = scrap(amount, duration, params)
# or the vehicle-loan payload:
result = scrap(amount, duration, params_coaches)
# or the personal-project payload ('Tus proyectos'):
result = scrap(amount, duration, params_proyectos)