嗨,我正在尝试使用 BeautifulSoup 从一个表格中抓取数据,但该表格位于 script 标记内,并且每个 td 都带有 data-bind 绑定。我尝试并搜索了很多次都失败了。我是新手,请帮助我解决这个问题。
<script type="text/html" id="searchResultTemplate">
<table class="searchResultTable">
<thead>
<tr>
<td data-bind="click: function (data, event) { setSortInfo(SortInfoArray()[0]) }">Given Name</td>
<td data-bind="click: function (data, event) { setSortInfo(SortInfoArray()[1]) }">Family Name</td>
<td data-bind="click: function (data, event) { setSortInfo(SortInfoArray()[2]) }">Business</td>
<td data-bind="click: function (data, event) { setSortInfo(SortInfoArray()[3]) }">Suburb</td>
<td data-bind="click: function (data, event) { setSortInfo(SortInfoArray()[4]) }">State</td>
<td data-bind="click: function (data, event) { setSortInfo(SortInfoArray()[5]) }">Country</td>
<td>Map</td>
</tr>
</thead>
<tbody data-bind="template: { name: 'agentItemTemplate', foreach: Result }">
</tbody>
</table> </script>
我运行了以下代码以抓取上方的表格数据。
import bs4
from urllib.request import urlopen as uRequest
from bs4 import BeautifulSoup as soup

my_url = 'https://www.mara.gov.au/search-the-register-of-migration-agents/'

# Open the connection and download the page.
uClient = uRequest(my_url)
# Read the whole HTML of the page into a variable.
page_html = uClient.read()
# Close the connection with the page.
uClient.close()

# BUG FIX: parse the downloaded HTML, not the URL string itself.
# The original code passed `my_url` to BeautifulSoup, so it was parsing
# the literal URL text and never the page content.
page_soup = soup(page_html, "html.parser")

# BUG FIX: findAll returns a ResultSet (a list of tags), which has no
# findAll method of its own — iterate the matched tables and search
# for rows inside each one.
# NOTE(review): on this site the table lives inside a
# <script type="text/html"> Knockout template and its rows are filled
# in by JavaScript, so the static HTML contains no data rows — hence
# the API-based approach in the answer below.
tables = page_soup.findAll("table", {"class": "searchResultTable"})
for table in tables:
    for row in table.findAll('tr'):
        print(row)
答案 0(得分:0)
网站使用api检索数据。
import requests , csv , os
def SaveAsCsv(list_of_rows):
    """Append a single row to data.csv in the current directory.

    list_of_rows: sequence of cell values written as one CSV row.
    Prints a confirmation on success; if the file is locked (e.g. open
    in Excel), catches PermissionError and asks the user to close it.
    """
    try:
        with open('data.csv', mode='a', newline='', encoding='utf-8') as outfile:
            csv.writer(outfile).writerow(list_of_rows)
        # BUG FIX: corrected the typo "successully" in the user-facing message.
        print(" saved successfully\n")
    except PermissionError:
        print("Please make sure data.csv is closed\n")
def Search(location):
    """Query the MARA agent-search API for one page of results.

    location: value of the 'Location' query parameter (e.g. 'AUSTRALIA').
    Returns the decoded JSON payload (a dict) on HTTP 200, or None on
    any other status (kept for backward compatibility with callers that
    check for a falsy result).
    """
    url = 'https://www.mara.gov.au/api/agentsearch'
    # Headers mirror the browser's XHR so the endpoint accepts the request.
    head = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'en,en-US;q=0.9,ar;q=0.8',
        'Content-Type': 'application/json; charset=utf-8',
        'Host': 'www.mara.gov.au',
        'Referer': 'https://www.mara.gov.au/search-the-register-of-migration-agents/',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest'
    }
    # Query parameters copied from the site's own search request; only
    # 'Location' varies. PageIndex/PageSize fetch the first 20 results.
    params = {
        'DelimitedStartWithLetterFilter[FieldName]': 'DisplayBusiness.Name',
        'DelimitedStartWithLetterFilter[LetterString]': '',
        'DelimitedStartWithLetterFilter[Label]': 'Show+All',
        'DelimitedStartWithLetterFilter[IsSelected]': 'false',
        'DelimitedStartWithLetterFilter[ContainsData]': 'true',
        'Keyword': '',
        'Location': location,
        'BusinessName': '',
        'IsNoFee': '',
        'IsPractitioner': '',
        'AgentFamilyName': '',
        'AgentGivenName': '',
        'AgentName': '',
        'AgentMARN': '',
        'SortInfo[SortField]': '',
        'SortInfo[IsAscending]': 'false',
        'PagingInfo[PageIndex]': 0,
        'PagingInfo[PageSize]': 20
    }
    res = requests.get(url, headers=head, params=params)
    if res.status_code == 200:
        return res.json()
    # Explicit None instead of the original's implicit fall-through, so
    # the non-200 contract is visible to readers and callers.
    return None
def Extract(location='AUSTRALIA'):
    """Fetch one page of agent results and append each record to data.csv.

    location: search location passed to Search(); the default preserves
    the original hard-coded 'AUSTRALIA' behavior, so existing calls to
    Extract() are unaffected.
    """
    data = Search(location)  # Pass in the location word
    if not data:
        # Search() returns None on a non-200 response; the original code
        # would crash on data['Result'] here — bail out gracefully instead.
        return
    for row in data['Result']:
        first_name = row['Name']['GivenName']
        family_name = row['Name']['FamilyName']
        business_name = row['PrimaryBusiness']['Name']
        Suburb = row['PrimaryBusiness']['Address']['Suburb']
        State = row['PrimaryBusiness']['Address']['State']
        country = row['DisplayBusiness']['Address']['Country']
        full_address = row['DisplayBusiness']['Address']['FullAddress']
        SaveAsCsv([first_name, family_name, business_name, Suburb, State, country, full_address])
# Write the CSV header only on the very first run, i.e. when data.csv
# does not exist (or is unreadable); then kick off the extraction.
header_already_written = os.path.isfile('data.csv') and os.access('data.csv', os.R_OK)
if header_already_written:
    print("File data.csv Already exists \n")
else:
    SaveAsCsv([ 'Given Name','Family Name','Business Name','Suburb','State','country','full_address'])
Extract()