在Python中使用Selenium进行爬取

时间:2019-06-12 08:46:53

标签: python selenium selenium-webdriver web-scraping

我想抓取桑坦德网站。但是我使用的代码无法正常工作,因为当我运行两次时,我得到的金额会有所不同。

我当前的代码运行不正常。

除我得到错误的结果外,抓取似乎可行。而且,当我连续两次运行代码时,结果就会改变。

该站点的链接如下:https://simuladores.bancosantander.es/SantanderES/loansimulatorweb.aspx?por=webpublica&prv=publico&m=300&cta=1&ls=0#/t0

我的代码:

def _parse_euro_amount(text):
    """Convert a euro amount formatted like '1.135,34 €' into a float."""
    # The scraped text uses '.' for thousands and ',' for decimals.
    return float(text.replace(' €', '').replace('.', '').replace(',', '.'))


def hw_santander_scrap(Amount, Duration):
    """Scrape loan simulations from the Santander simulator into a CSV file.

    Every amount/duration pair inside the accepted ranges is typed into the
    simulator form, and the displayed monthly repayment, TIN, TAE and total
    repayment are recorded.  The rows are written to
    ``Santander_<day>_<month>_<year>.csv`` in the current working directory.

    Args:
        Amount: sequence of amounts written in thousands of euros
            (``13.000`` stands for 13,000 EUR and is typed as ``"13.000"``).
            Values outside 3.000-90.000 are skipped.
        Duration: sequence of loan durations (presumably months, as entered
            in the ``#term`` field).  Values outside 12-96 are skipped.

    Returns:
        None.  Side effects: the CSV file, ``screenshot_santander.png`` and
        two progress messages on stdout.
    """
    # Local imports keep the snippet self-contained; the original relied on
    # By, Keys, ActionChains, DT and pd being defined somewhere else.
    from datetime import datetime as DT

    import pandas as pd
    from selenium import webdriver
    from selenium.webdriver.common.action_chains import ActionChains
    from selenium.webdriver.common.by import By
    from selenium.webdriver.common.keys import Keys
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import WebDriverWait

    url = 'https://simuladores.bancosantander.es/SantanderES/loansimulatorweb.aspx?por=webpublica&prv=publico&m=300&cta=1&ls=0#/t0'
    columns = ['Project', 'Period', 'Monthly repayment', 'TIN', 'TAE',
               'Total repayment', 'Initial amount', 'Duration']

    # Amount limits use the same "thousands" notation as the Amount argument.
    max_amount = 90.000
    min_amount = 3.000
    max_duration = 96
    min_duration = 12

    now = DT.now()
    period = '{}_{}_{}'.format(now.day, now.month, now.year)      # file name
    row_period = '{}/{}/{}'.format(now.day, now.month, now.year)  # CSV column

    chrome_options = webdriver.ChromeOptions()
    for argument in ('--headless', '--no-sandbox', '--disable-dev-shm-usage',
                     '--start-maximized', 'window-size=10000x5000'):
        chrome_options.add_argument(argument)
    # Named "driver" so the selenium "webdriver" module is not shadowed.
    driver = webdriver.Chrome(options=chrome_options)

    def wait_for_ajax(timeout):
        # The page recomputes through jQuery AJAX calls; wait until none is
        # pending before touching or reading the form again.
        WebDriverWait(driver, timeout).until(
            lambda d: d.execute_script('return jQuery.active') == 0)

    print('Start Scraping')
    rows = []
    try:
        driver.get(url)
        for simulated_amount in Amount:
            if simulated_amount > max_amount or simulated_amount < min_amount:
                continue
            amount_field = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "#amount")))
            amount_field.clear()
            amount_field.send_keys("{:.3f}".format(simulated_amount))
            wait_for_ajax(30)

            for raw_duration in Duration:
                simulated_duration = int(raw_duration)
                # Out-of-range durations are skipped entirely; the original
                # indentation still processed them with a stale field.
                if (simulated_duration > max_duration
                        or simulated_duration < min_duration):
                    continue
                term_field = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, "#term")))
                term_field.clear()
                term_field.send_keys("{}".format(simulated_duration))
                term_field.send_keys(Keys.TAB)
                slider_handle = driver.find_element(
                    By.XPATH,
                    "//span[@class='ui-slider-handle ui-state-default ui-corner-all']")
                ActionChains(driver).click(slider_handle).perform()

                # Read the results only after the recalculation has finished.
                wait_for_ajax(10)
                driver.save_screenshot('screenshot_santander.png')

                rates_text = driver.find_element(By.CSS_SELECTOR, '.r3 span').text
                rows.append({
                    'Project': "reforma vivienda",
                    'Period': row_period,
                    'Monthly repayment': _parse_euro_amount(
                        driver.find_element(By.CSS_SELECTOR, '.r1 span').text),
                    # Fixed character positions inside the '.r3 span' text,
                    # kept exactly as in the original code.
                    'TIN': float(rates_text[6:10].replace(',', '.')),
                    'TAE': float(rates_text[13:17].replace(',', '.')),
                    'Total repayment': _parse_euro_amount(
                        driver.find_element(By.CSS_SELECTOR, '.r7 span').text),
                    # 13.0 -> "13.000" -> "13000" -> 13000.0
                    'Initial amount': float(
                        "{:.3f}".format(simulated_amount).replace('.', '')),
                    'Duration': simulated_duration,
                })
    finally:
        # Always release the browser, even when a wait times out.
        driver.quit()

    santander = pd.DataFrame(rows, columns=columns)
    santander = santander.loc[santander['TIN'] != 0, :]
    santander.to_csv('Santander_{}.csv'.format(period), index=False)
    print('End Scraping')

To run the code:

    # Amounts are written in thousands of euros (13.000 is typed into the
    # form as "13.000"); durations are presumably months.
    Amount = [
        13.000, 14.000, 15.000,
        30.000, 45.000, 60.000,
    ]
    Duration = [
        12, 15, 24,
        36, 48, 60,
        72, 84, 96,
    ]
    hw_santander_scrap(Amount=Amount, Duration=Duration)

1 个答案:

答案 0 :(得分:4)

以下是使用requests的建议:

import requests
from bs4 import BeautifulSoup
import re
import json
import pandas as pd

# Collect the auth values that the simulator page embeds in an inline
# "Afi.AfiAuth.Init(...)" script call; they are sent back as request headers.
r = requests.Session()
response = r.get("https://simuladores.bancosantander.es/SantanderES/loansimulatorweb.aspx?por=webpublica&prv=publico&m=300&cta=1&ls=0#/t0")
response.raise_for_status()
# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed.
soup = BeautifulSoup(response.content, 'html.parser')
pattern = r"Afi\.AfiAuth\.Init\((.*?)\)"
key = soup.find_all('script', string=re.compile(pattern))
if not key:
    raise RuntimeError("Afi.AfiAuth.Init(...) call not found in the simulator page")

# Split the call's argument list once: the second argument is the timestamp
# and the last one the signature.
auth_args = re.findall(pattern, key[0].string)[0].split(',')
WSSignature = auth_args[-1].replace('\'', '')
WSDateTime = auth_args[1].replace('\'', '')

# HTTP headers sent with the web-service calls.  WSSignature and WSDateTime
# are the values extracted from the simulator page earlier in this script.
headers = {
    "Origin": "https://simuladores.bancosantander.es",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
    "Content-Type": "application/json;charset=UTF-8",
    "Accept": "application/json, text/plain, */*",
    "WSSignature": WSSignature,
    "Referer": "https://simuladores.bancosantander.es/SantanderES/loansimulatorweb.aspx?por=webpublica&prv=publico&m=300&cta=1&ls=0",
    "WSDateTime": WSDateTime,
    "WSClientCode": "SantanderES",
}

# Default request payload for the "Renueva tu hogar" loan (product p300).
# 'capitalOrInstallment' and 'monthsTerm' are overwritten per request by
# scrap().  Key spellings such as 'mothsInitialTerm' and 'frecuency...' are
# presumably what the web service expects, so they are kept verbatim.
params = {
    "wsInputs": {
        "finality": "Renueva tu hogar",
        "productCode": "p300",
        "capitalOrInstallment": 12000,
        "monthsTerm": 96,
        "mothsInitialTerm": 0,
        "openingCommission": 1.5,
        "minOpeningCommission": 0,
        "financeOpeningCommission": True,
        "interestRate": 0,
        "interestRateReferenceIndex": 0,
        "interestRateSecondaryReferenceIndex": 0,
        "interestRateSecondaryWithoutVinculation": 5.95,
        "interestRateSecondaryWithAllVinculation": 0,
        "interestRateSecondary": 5.95,
        "loanDate": "2019-06-12",
        "birthDate": "2001-06-12",
        "financeLoanProtectionInsurance": True,
        "percentageNotaryCosts": 0.003,
        "loanCalculationMethod": 0,
        "calculationBase": 4,
        "frecuencyAmortization": 12,
        "frecuencyInterestPay": 12,
        "calendarConvention": 0,
        "taeCalculationBaseType": 4,
        "lackMode": 0,
        "amortizationCarencyMonths": 0,
        "typeAmortization": 1,
        "insuranceCostSinglePremium": 0,
        "with123": False,
        "electricVehicle": False,
    },
}

# The scraping function
def scrap(amount, duration, params):
    """Request one loan simulation from the Santander calculation service.

    Args:
        amount: value sent as ``capitalOrInstallment``.
        duration: value sent as ``monthsTerm``.
        params: request template shaped like ``{'wsInputs': {...}}``.  It is
            copied before being filled in, so the caller's dict (shared by
            every call in a loop) is left untouched.

    Returns:
        The ``'d'`` member of the decoded JSON response.

    Raises:
        requests.HTTPError: if the service answers with an HTTP error status.
    """
    # Copy both levels that get modified; existing keys keep their position,
    # so the serialized JSON is the same as with in-place assignment.
    payload = dict(params)
    payload['wsInputs'] = dict(
        params['wsInputs'],
        capitalOrInstallment=amount,
        monthsTerm=duration,
    )
    response = r.post(
        'https://simuladores.bancosantander.es/WS/WSSantanderTotalLoan.asmx/Calculate',
        headers=headers,
        data=json.dumps(payload),
    )
    # Fail with a clear HTTP error instead of a confusing JSON/KeyError below.
    response.raise_for_status()
    return json.loads(response.content)['d']


# Run one simulation for every (amount, duration) combination and collect
# the answers, tagged with their inputs, into a DataFrame.
Amounts = [13000, 14000, 15000, 30000, 45000, 60000]
Durations = [12, 15, 24, 36, 48, 60, 72, 84, 96]
results = []
pairs = [(a, d) for a in Amounts for d in Durations]
for amount, duration in pairs:
    result = scrap(amount, duration, params)
    result.update({'Amount': amount, 'Duration': duration})
    results.append(result)

df = pd.DataFrame(results)

以下是您可以直接获取的信息示例:

{   ....

    'TotalCosts': 0,
     'CapitalOrInstallment': 1135.3433231340491,
     'Disclaimer': '',
     'Capital': 13195,
     'ThereIsLackAtFirstStep': False,
     'ThereIsLackAtSecondStep': False,
     'InstallmentFirstStepLackPeriod': 1135.3433231340491,
     'InstallmentSecondStepLackPeriod': 1135.3433231340491,
     'Installment': 1135.3433231340491,
     'InstallmentFirstStep': 1135.3433231340491,
     'InstallmentSecondStep': 1135.3433231340491,
     'CommissionOpeningCosts': 195.00000000000003,
     'TAE': 9.1,
     'TAEWithoutVinculation': 9.1,  
     ....
}

编辑1: 添加了获取两个重要变量 WSSignature 和 WSDateTime 并将其插入请求头的代码

编辑2: 参数可以适应不同类型的贷款:

# Request payload for the vehicle loan (product p100, finality
# "Vehículo con hasta 36 meses de antigüedad").
params_coaches = {
    "wsInputs": {
        "finality": "Vehículo con hasta 36 meses de antigüedad",
        "productCode": "p100",
        "capitalOrInstallment": 5000,
        "monthsTerm": 96,
        "mothsInitialTerm": 12,
        "openingCommission": 1.5,
        "minOpeningCommission": 60,
        "financeOpeningCommission": True,
        "interestRate": 5.5,
        "interestRateReferenceIndex": 0,
        "interestRateSecondaryReferenceIndex": 0,
        "interestRateSecondaryWithoutVinculation": 6.5,
        "interestRateSecondaryWithAllVinculation": 0,
        "interestRateSecondary": 6.5,
        "loanDate": "2019-06-13",
        "birthDate": "2001-06-13",
        "financeLoanProtectionInsurance": True,
        "percentageNotaryCosts": 0.003,
        "loanCalculationMethod": 0,
        "calculationBase": 4,
        "frecuencyAmortization": 12,
        "frecuencyInterestPay": 12,
        "calendarConvention": 0,
        "taeCalculationBaseType": 4,
        "lackMode": 0,
        "amortizationCarencyMonths": 0,
        "typeAmortization": 1,
        "insuranceCostSinglePremium": 0,
        "with123": False,
        "electricVehicle": False,
    },
}

# Request payload for the "Tus proyectos" loan (product p200).
params_proyectos = {
    "wsInputs": {
        "finality": "Tus proyectos",
        "productCode": "p200",
        "capitalOrInstallment": 6000,
        "monthsTerm": 96,
        "mothsInitialTerm": 0,
        "openingCommission": 1.5,
        "minOpeningCommission": 60,
        "financeOpeningCommission": True,
        "interestRate": 0,
        "interestRateReferenceIndex": 0,
        "interestRateSecondaryReferenceIndex": 0,
        "interestRateSecondaryWithoutVinculation": 5.95,
        "interestRateSecondaryWithAllVinculation": 0,
        "interestRateSecondary": 5.95,
        "loanDate": "2019-06-13",
        "birthDate": "2001-06-13",
        "financeLoanProtectionInsurance": True,
        "percentageNotaryCosts": 0.003,
        "loanCalculationMethod": 0,
        "calculationBase": 4,
        "frecuencyAmortization": 12,
        "frecuencyInterestPay": 12,
        "calendarConvention": 0,
        "taeCalculationBaseType": 4,
        "lackMode": 0,
        "amortizationCarencyMonths": 0,
        "typeAmortization": 1,
        "insuranceCostSinglePremium": 0,
        "with123": False,
        "electricVehicle": False,
    },
}

调用该函数时,只需选择正确的参数集即可。例如:

# Pick ONE of the calls below; they are alternatives, not a sequence.
# Home-renovation loan ('Renueva tu hogar', product p300):
result = scrap(amount, duration, params)
# or the vehicle loan (product p100):
result = scrap(amount, duration, params_coaches)
# or the 'Tus proyectos' loan (product p200):
result = scrap(amount, duration, params_proyectos)