I am trying to scrape the table at https://bgp.he.net/report/world. I want to follow each HTML link through to its country page, grab the data there, and then move on to the next item in the list. I am using Beautiful Soup and can already get the data I need, but I am not quite sure how to iterate over the HTML links.
from bs4 import BeautifulSoup
import requests
import json

headers = {'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:56.0) Gecko/20100101 Firefox/56.0'}

url = "https://bgp.he.net/country/LC"
html = requests.get(url, headers=headers)
country_ID = url[-2:]
print("\n")

soup = BeautifulSoup(html.text, 'html.parser')
#print(soup)

data = []
for row in soup.find_all("tr")[1:]:  # start from second row
    cells = row.find_all('td')
    data.append({
        'ASN': cells[0].text,
        'Country': country_ID,
        "Name": cells[1].text,
        "Routes V4": cells[3].text,
        "Routes V6": cells[5].text
    })

with open('table_attempt.txt', 'w') as r:
    for item in data:
        r.write(str(item))
        r.write("\n")

print(data)
I would like to be able to collect each country's data into a single written file.
Answer 0 (score: 0)
I only tested this with the first 3 links (I ran into a UnicodeEncodeError along the way, but fixed it and noted the spot in the code).
from bs4 import BeautifulSoup
import requests

# First get the list of country urls
headers = {'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:56.0) Gecko/20100101 Firefox/56.0'}

url = "https://bgp.he.net/report/world"
html = requests.get(url, headers=headers)
soup = BeautifulSoup(html.text, 'html.parser')

table = soup.find('table', {'id':'table_countries'})
rows = table.find_all('tr')
country_urls = []

# Go through each row and grab the link. If there's no link, continue to the next row
for row in rows:
    try:
        link = row.select('a')[0]['href']
        country_urls.append(link)
    except IndexError:
        continue

# Now iterate through that list.
# I added encoding="utf-8" because of a UnicodeEncodeError.
# The file is opened once, outside the loop, so every country ends up in the same file.
with open('table_attempt.txt', 'w', encoding="utf-8") as r:
    for link in country_urls:
        url = "https://bgp.he.net" + link
        html = requests.get(url, headers=headers)
        country_ID = url[-2:]
        soup = BeautifulSoup(html.text, 'html.parser')

        data = []
        for row in soup.find_all("tr")[1:]:  # start from second row
            cells = row.find_all('td')
            data.append({
                'ASN': cells[0].text,
                'Country': country_ID,
                "Name": cells[1].text,
                "Routes V4": cells[3].text,
                "Routes V6": cells[5].text
            })

        print('Writing from %s' % url)
        for item in data:
            r.write(str(item))
            r.write("\n")
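To reproduce the "tested with the first 3 links" setup mentioned above, one option is simply to slice the list before the loop, e.g. for link in country_urls[:3]: instead of looping over the full list.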
Answer 1 (score: 0)
You can iterate over the main table and send a request to scrape each country's report listing:
import requests, re
from bs4 import BeautifulSoup as soup
headers = {'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:56.0) Gecko/20100101 Firefox/56.0'}
def scrape_report(_id):
    # Fetch the per-country page and pair each row's cells with the table headers
    _d = soup(requests.get(f'https://bgp.he.net/country/{_id}', headers=headers).text, 'html.parser')
    _headers = [i.text for i in _d.find_all('th')]
    _, *data = [[i.text for i in b.find_all('td')] for b in _d.find_all('tr')]
    return [dict(zip(_headers, i)) for i in data]

# Scrape the main world report table, then request each country's own report
d = soup(requests.get('https://bgp.he.net/report/world', headers=headers).text, 'html.parser')
_, *_listings = [[re.sub('[\t\n]+', '', i.text) for i in b.find_all('td')] for b in d.find_all('tr')]
final_result = [{**dict(zip(['Name', 'Country', 'ASN'], [a, b, c])), 'data': scrape_report(b)} for a, b, c, *_ in _listings]
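If you also want this written to a single file, as the question asks, a minimal sketch is to dump the whole structure as JSON (the filename world_report.json is just an example, not part of the original answer):

import json

with open('world_report.json', 'w', encoding='utf-8') as f:
    json.dump(final_result, f, ensure_ascii=False, indent=2)  # one JSON document covering all countries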
Answer 2 (score: 0)
import requests
import json
from bs4 import BeautifulSoup

headers = {'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:56.0) Gecko/20100101 Firefox/56.0'}

url = "https://bgp.he.net/report/world"
html = requests.get(url, headers=headers)
soup = BeautifulSoup(html.text, 'html.parser')

# Sorting through the table
table = soup.find('table', {'id':'table_countries'})
rows = table.find_all('tr')
country_urls = []

# Grabbing urls from the table
for row in rows:
    try:
        link = row.select('a')[0]['href']
        country_urls.append(link)
    except IndexError:
        continue

Total_URLs = len(country_urls)
print(Total_URLs, "countries to pull data from")
print("\n")

# Creating (truncating) the output file so repeated runs start from scratch
with open('ASN_Info.txt', 'w', encoding="utf-8"):
    pass

# Looping through the country url list
for link in country_urls:
    url = "https://bgp.he.net" + link
    html = requests.get(url, headers=headers)
    # Taking the country identifier from the url
    country_ID = url[-2:]
    soup = BeautifulSoup(html.text, 'html.parser')
    Total_URLs -= 1

    # Appending this country's rows to the file, one JSON object per line
    with open('ASN_Info.txt', 'a', encoding="utf-8") as r:
        for row in soup.find_all("tr")[1:]:  # start from second row
            cells = row.find_all('td')
            entry = {
                'ASN': cells[0].text,
                'Country': country_ID,
                "Name": cells[1].text,
                "Routes V4": cells[3].text,
                "Routes V6": cells[5].text
            }
            json.dump(entry, r)
            r.write("\n")

    print('Currently writing data from %s. %s countries left to pull data from.' % (country_ID, Total_URLs))
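Because each line of ASN_Info.txt holds one JSON object (JSON Lines style), the data can be loaded back later with something like the sketch below (assuming the file was produced by the loop above):

import json

records = []
with open('ASN_Info.txt', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if line:  # skip blank lines
            records.append(json.loads(line))

print(len(records), "ASN records loaded")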