我正在尝试使用python并抓取网站https://ipinfo.io/countries/来获取以下格式的数据。这不过是ASN数据。
下面这条具体信息可以在 https://ipinfo.io/AS3320 页面中找到:
{
3320: {
"country": "DE",
"name": "Deutsche Telekom AG",
"registry": "ripe",
"num_ip_addresses": 34756096
},
我编写了一个程序,该程序可以获取所有国家/地区,然后将其映射到对应的ASN ID(例如 https://ipinfo.io/AS3320)。但是,我很难使用BeautifulSoup解析出页面中的全部数据。
import urllib.request
import bs4
import re
import json
url = 'https://ipinfo.io/countries'
SITE = 'https://ipinfo.io'
def url_to_soup(url):
    """Fetch *url* and return its content parsed as a BeautifulSoup tree.

    Args:
        url: Address of the page to download.

    Returns:
        A ``bs4.BeautifulSoup`` object built with the ``html.parser`` backend.
    """
    req = urllib.request.Request(url)
    opener = urllib.request.build_opener()
    # Parse while the response is still open, then let the ``with`` block
    # close it; the original left the connection open until garbage collection.
    with opener.open(req) as response:
        return bs4.BeautifulSoup(response, "html.parser")
def find_pages(page):
    """Collect the ``href`` of every element on *page* that links to a country page.

    An element qualifies when its ``href`` attribute matches the pattern
    ``/countries/``. The hrefs are returned in the order ``find_all``
    yields the elements.
    """
    country_href = re.compile('/countries/')
    return [anchor.get('href') for anchor in page.find_all(href=country_href)]
def _asn_detail_field(page_html, label):
    """Return the stripped text of the paragraph that follows *label*, or None.

    NOTE(review): the ``pb-md-1`` class anchor reflects the ASN detail page
    markup as it was when this scraper was written -- confirm it still
    matches the live site.
    """
    # re.S lets '.' cross newlines, because the label and its value sit on
    # different lines of the page source.
    match = re.search(re.escape(label) + r'.*?pb-md-1">(.*?)</p>', page_html, re.S)
    return match.group(1).strip() if match else None


def scrape_pages(links):
    """Build an ASN mapping from the given country page links.

    For every table row on each country page, the ASN number is read from the
    first cell and the name from the second; the registry and the number of
    IP addresses are then read from that ASN's own detail page.

    Args:
        links: Iterable of site-relative country links such as
            ``/countries/de``; the third ``/``-separated component is used
            as the country code.

    Returns:
        A dict keyed by the ASN number (``int``) whose values are dicts with
        the keys ``country``, ``name``, ``registry`` and ``num_ip_addresses``.
        A field that cannot be found on the page is stored as ``None``.
    """
    mappings = {}
    print("Scraping Pages for ASN Data...")
    for link in links:
        country_page = url_to_soup(SITE + link)
        current_country = link.split('/')[2]
        for row in country_page.find_all('tr'):
            columns = row.find_all('td')
            if not columns:
                # Rows without <td> cells carry no ASN data.
                continue
            # get_text() never returns None, unlike .string, which is None
            # whenever the cell holds more than one child node.
            asn_digits = re.findall(r'\d+', columns[0].get_text())
            if not asn_digits:
                continue
            current_asn = asn_digits[0]
            name = columns[1].get_text(strip=True) if len(columns) > 1 else None

            detail_html = str(url_to_soup(SITE + '/AS' + current_asn))
            # presumably the count is rendered with thousands separators
            # (e.g. "34,756,096"); keep only the digits -- TODO confirm.
            ip_digits = re.sub(r'\D', '', _asn_detail_field(detail_html, 'IP Addresses') or '')

            mappings[int(current_asn)] = {
                'country': current_country.upper(),
                'name': name,
                'registry': _asn_detail_field(detail_html, 'Registry'),
                'num_ip_addresses': int(ip_digits) if ip_digits else None,
            }
    return mappings
# Script entry point: list the country pages, scrape the ASN data behind each
# one, and print the collected mapping as JSON so the run has a visible result.
if __name__ == "__main__":
    main_page = url_to_soup(url)
    country_links = find_pages(main_page)
    asn_mappings = scrape_pages(country_links)
    print(json.dumps(asn_mappings, indent=2, ensure_ascii=False))
结果是包含所有信息的HTML页面的大量转储,但是我无法根据需要对其进行过滤。
#编辑1:
#import urllib.request
import bs4
import re
url ='https://ipinfo.io/AS7018'
def url_to_soup(url):
    """Fetch *url* and return its content parsed as a BeautifulSoup tree.

    Args:
        url: Address of the page to download.

    Returns:
        A ``bs4.BeautifulSoup`` object built with the ``html.parser`` backend.
    """
    # Imported here because this snippet's own ``import urllib.request`` line
    # is commented out.
    import urllib.request

    req = urllib.request.Request(url)
    opener = urllib.request.build_opener()
    # Parse while the response is still open, then let the ``with`` block
    # close it; the original left the connection open until garbage collection.
    with opener.open(req) as response:
        return bs4.BeautifulSoup(response, "html.parser")
def _named_group(pattern, text, group, flags=0, strip=False):
    """Return *group* from the first match of *pattern* in *text*, or None.

    Returning None instead of calling ``.group()`` on a failed search lets the
    script report the remaining fields when one piece of markup is not found.
    """
    match = re.search(pattern, text, flags)
    if match is None:
        return None
    value = match.group(group)
    return value.strip() if strip and value is not None else value


# Download the ASN detail page once and pull each field out of its HTML text.
s = str(url_to_soup(url))

# The name is matched with a lazy ``.*?``: the earlier ``[\w.\s]+`` character
# class rejected names that contain '&', ',' or '-', so the search found nothing.
header = re.search(r'<h3 class="font-semibold m-0 t-xs-24">(?P<ASN_CODE>AS\d+) (?P<NAME>.*?)</h3>', s)
asn_code, name = header.groups() if header else (None, None)
print(asn_code)
country = _named_group(r'.*href="/countries.*">(?P<COUNTRY>.*)?</a>', s, "COUNTRY")
print(country)
# re.S makes '.' match newlines too, because the label and its value are on
# different lines of the page source.
registry = _named_group(r'Registry.*?pb-md-1">(?P<REGISTRY>.*?)</p>', s, "REGISTRY", re.S, strip=True)
print(registry)
ip = _named_group(r'IP Addresses.*?pb-md-1">(?P<IP>.*?)</p>', s, "IP", re.S, strip=True)
print(ip)
答案 0 :(得分:0)
1. https://ipinfo.io/countries/ 列出所有国家/地区;
2. https://ipinfo.io/countries/us 列出美国(us)的所有ASN;
3. https://ipinfo.io/AS26611 显示某个ASN的详细数据。
您的代码中似乎缺少第3步。如果您已经获得了ASN详细数据页面的页面源码,则可以使用以下正则表达式作为参考:
def parse_asn_detail(s):
    """Extract the ASN fields from the HTML text of an ASN detail page.

    Args:
        s: HTML source of the detail page, as a string.

    Returns:
        A tuple ``(asn_code, name, country, registry, ip)``. Each element is
        the matched text, or ``None`` when its markup is not present in *s*,
        so a page that does not match no longer raises ``AttributeError``.
    """
    def _group(pattern, group, flags=0, strip=False):
        # Guard against a failed search before reading the named group.
        match = re.search(pattern, s, flags)
        if match is None:
            return None
        value = match.group(group)
        return value.strip() if strip and value is not None else value

    header = re.search(r'<h3 class="font-semibold m-0 t-xs-24">(?P<ASN_CODE>AS\d+) (?P<NAME>.*?)</h3>', s)
    asn_code, name = header.groups() if header else (None, None)
    country = _group(r'.*href="/countries.*">(?P<COUNTRY>.*)?</a>', "COUNTRY")
    # re.S makes '.' match newlines too, because the label and its value are
    # on different lines of the page source.
    registry = _group(r'Registry.*?pb-md-1">(?P<REGISTRY>.*?)</p>', "REGISTRY", re.S, strip=True)
    ip = _group(r'IP Addresses.*?pb-md-1">(?P<IP>.*?)</p>', "IP", re.S, strip=True)
    return asn_code, name, country, registry, ip


s = """ your ASN detail data page """
asn_code, name, country, registry, ip = parse_asn_detail(s)
另外,建议使用 requests-html 模块来高效地请求页面并解析HTML(它提供的CSS选择器和XPath选择器比正则表达式更好用)。希望对您有帮助。