import requests
from requests import get
from selenium import webdriver
from bs4 import BeautifulSoup
from lxml import html
import pandas as pd
#import chromedriver_binary # Adds chromedriver binary to path
options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--incognito')
options.add_argument('--headless')
driver = webdriver.Chrome(executable_path=r"C:\Users\mmanenica\Documents\chromedriver.exe")
#click the search button on Austenders to return all Awarded Contracts
import time
#define the starting point: Austenders Awarded Contracts search page
driver.get('https://www.tenders.gov.au/cn/search')
#Find the Search Button and return all search results
Search_Results = driver.find_element_by_name("SearchButton")
if 'inactive' in Search_Results.get_attribute('name'):
print("Search Button not found")
exit;
print('Search Button found')
Search_Results.click()
#Pause code to prevent blocking by website
time.sleep(1)
i = 0
Awarded = []
#Move to the next search page by finding the Next button at the bottom of the page
#This code will need to be refined as the last search will be skipped currently.
while True:
    Next_Page = driver.find_element_by_class_name('next')
    if 'inactive' in Next_Page.get_attribute('class'):
        print("End of Search Results")
        exit;
    i = i + 1
    time.sleep(2)
    #Loop through all the Detail links on the current Search Results Page
    print("Checking search results page " + str(i))
    print(driver.current_url)
    soup = BeautifulSoup(driver.current_url, features='lxml')
    #Find all Contract detail links in the current search results page
    Details = soup.findAll('div', {'class': 'list-desc-inner'})
    for each_Contract in Details:
        #Loop through each Contract details link and scrape all the detailed
        #Contract information page
        Details_Page = each_Contract.find('a', {'class': 'detail'}).get('href')
        driver.get(Details_Page)
        #Scrape all the data in the Awarded Contract page
        #r = requests.get(driver.current_url)
        soup = BeautifulSoup(driver.current_url, features='lxml')
        #find a list of all the Contract Info (contained in the 'Contact-Heading'
        #class of the span element)
        Contract = soup.find_all('span', {'class': 'Contact-Heading'})
        Contract_Info = [span.get_text() for span in Contract]
        #find a list of all the Summary Contract info which is in the text of
        #the 'list_desc_inner' class
        Sub = soup.find_all('div', {'class': 'list_desc_inner'})
        Sub_Info = [div.get_text() for div in Sub]
        #Combine the lists into a unified list and append to the Awarded table
        Combined = [Contract_Info, Sub_Info]
        Awarded.append[Combined]
        #Go back to the Search Results page (from the Detailed Contract page)
        driver.back()
    #Go to the next Search Page by clicking on the Next button at the bottom of the page
    Next_Page.click()
    time.sleep(3)

print(Awarded.Shape)
Answer 0 (score: 0)
As mentioned above, you aren't actually feeding the HTML source into BeautifulSoup. So the first thing to change is from soup = BeautifulSoup(driver.current_url, features='lxml') to soup = BeautifulSoup(driver.page_source, features='lxml').
Second problem: some elements have no <a> tag with class=detail, so you can't take .get('href') from the NoneType that find() returns. I added a try/except to skip those cases (though I'm not sure it gives the result you want). You could also drop the class and just write Details_Page = each_Contract.find('a').get('href').
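If you'd rather not catch the exception, an explicit None check does the same job; a rough sketch of that variant:

link = each_Contract.find('a', {'class': 'detail'})
if link is None:
    continue  # this row has no detail link, so skip it
Details_Page = link.get('href')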
Next, that href is only the tail end of the URL; you need to prepend the site root, so: driver.get('https://www.tenders.gov.au' + Details_Page).
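If you don't want to hard-code the root, urljoin from the standard library will resolve the relative link for you (a small sketch):

from urllib.parse import urljoin

# Resolve the relative href against the site root before navigating to it.
driver.get(urljoin('https://www.tenders.gov.au/', Details_Page))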
I also don't see anything on the page you could be referring to with class=Contact-Heading. And you refer to 'class': 'list-desc-inner' in one place and 'class': 'list_desc_inner' in another; again, I don't see any class=list_desc_inner.
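When the class names are in doubt, you can dump every class the page actually uses straight from the soup (rough sketch):

# Collect the distinct classes carried by span and div elements on the page.
classes = {c for tag in soup.find_all(['span', 'div']) for c in tag.get('class', [])}
print(sorted(classes))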
Next: to append a list to a list you need Awarded.append(Combined), not Awarded.append[Combined]. I also added .strip() in there to clean up some of the whitespace in the text.
In any case, you have plenty to fix and clean up here, and I don't know what your expected output should be, but hopefully this gets you started. Also, as mentioned in the comments, you can get the results directly by clicking the download button, but maybe you're practising the hard way...
import requests
from requests import get
from selenium import webdriver
from bs4 import BeautifulSoup
from lxml import html
import pandas as pd
#import chromedriver_binary # Adds chromedriver binary to path
options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--incognito')
options.add_argument('--headless')
driver = webdriver.Chrome(executable_path=r"C:\chromedriver.exe")
#click the search button on Austenders to return all Awarded Contracts
import time
#define the starting point: Austenders Awarded Contracts search page
driver.get('https://www.tenders.gov.au/cn/search')
#Find the Search Button and return all search results
Search_Results = driver.find_element_by_name("SearchButton")
if 'inactive' in Search_Results.get_attribute('name'):
print("Search Button not found")
exit;
print('Search Button found')
Search_Results.click()
#Pause code to prevent blocking by website
time.sleep(1)
i = 0
Awarded = []
#Move to the next search page by finding the Next button at the bottom of the page
#This code will need to be refined as the last search will be skipped currently.
while True:
    Next_Page = driver.find_element_by_class_name('next')
    if 'inactive' in Next_Page.get_attribute('class'):
        print("End of Search Results")
        break  # leave the loop; a bare "exit;" would never stop it
    i = i + 1
    time.sleep(2)
    #Loop through all the Detail links on the current Search Results Page
    print("Checking search results page " + str(i))
    print(driver.current_url)
    soup = BeautifulSoup(driver.page_source, features='lxml')
    #Find all Contract detail links in the current search results page
    Details = soup.findAll('div', {'class': 'list-desc-inner'})
    for each_Contract in Details:
        #Loop through each Contract details link and scrape all the detailed
        #Contract information page
        try:
            Details_Page = each_Contract.find('a', {'class': 'detail'}).get('href')
            driver.get('https://www.tenders.gov.au' + Details_Page)
            #Scrape all the data in the Awarded Contract page
            #r = requests.get(driver.current_url)
            soup = BeautifulSoup(driver.page_source, features='lxml')
            #find a list of all the Contract Info (contained in the 'Contact-Heading'
            #class of the span element)
            Contract = soup.find_all('span', {'class': 'Contact-Heading'})
            Contract_Info = [span.text.strip() for span in Contract]
            #find a list of all the Summary Contract info which is in the text of
            #the 'list-desc-inner' class
            Sub = soup.find_all('div', {'class': 'list-desc-inner'})
            Sub_Info = [div.text.strip() for div in Sub]
            #Combine the lists into a unified list and append to the Awarded table
            Combined = [Contract_Info, Sub_Info]
            Awarded.append(Combined)
            #Go back to the Search Results page (from the Detailed Contract page)
            driver.back()
        except:
            continue
    #Go to the next Search Page by clicking on the Next button at the bottom of the page
    Next_Page.click()
    time.sleep(3)

driver.close()
print(len(Awarded))  # Awarded is a plain list, so it has no .Shape attribute
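Since pandas is already imported, one way to finish with something table-shaped (a sketch, assuming a two-column layout is what you're after; the output file name is just illustrative) is:

# Each entry in Awarded is a [Contract_Info, Sub_Info] pair, so it maps
# straight onto a two-column DataFrame.
df = pd.DataFrame(Awarded, columns=['Contract_Info', 'Sub_Info'])
print(df.shape)
df.to_csv('awarded_contracts.csv', index=False)  # illustrative file name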