我正在尝试创建一个刮板,以返回欧洲各机场之间每日航班的数据,以获取欧洲航空公司的列表。对于荷航,可通过单击地图上的点在以下网站上找到数据(数据显示在地图下的表格中): https://www.flightradar24.com/data/airlines/kl-klm/routes
我目前有以下代码:
import requests
import json
import datetime
import pandas as pd
myProxy = {"http" : "http://10.120.118.49:8080", "https" : "https://10.120.118.49:8080"}
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:52.0) Gecko/20100101 Firefox/52.0"}
eu_airports = ['AAL', 'AAR', 'ABZ', 'ACE', 'ADA', 'ADB', 'AER', 'AES', 'AGP', 'AHO', 'AJA', 'ALA', 'ALC', 'AMS',
'ANR', 'AOI', 'ARN', 'ATH', 'AYT', 'BCN', 'BDS', 'BEG', 'BER', 'BES', 'BFS', 'BGO', 'BGY', 'BHD',
'BHX', 'BIA', 'BIO', 'BIQ', 'BJV', 'BLL', 'BLQ', 'BMA', 'BOD', 'BOJ', 'BOO', 'BRE', 'BRI', 'BRN',
'BRQ', 'BRS', 'BRU', 'BTS', 'BUD', 'BVA', 'CAG', 'CDG', 'CFU', 'CGN', 'CHQ', 'CIA', 'CIY', 'CLJ',
'CPH', 'CRL', 'CTA', 'CWL', 'DBV', 'DEB', 'DLM', 'DME', 'DRS', 'DTM', 'DUB', 'DUS', 'EDI', 'EGC',
'EIN', 'EMA', 'ESB', 'EVN', 'FAO', 'FCO', 'FDH', 'FKB', 'FLR', 'FMM', 'FMO', 'FNC', 'FRA', 'FSC',
'FUE', 'GDN', 'GLA', 'GOA', 'GOT', 'GRO', 'GRQ', 'GRZ', 'GVA', 'GYD', 'HAJ', 'HAM', 'HAU', 'HEL',
'HER', 'HHN', 'HUY', 'IAS', 'IBZ', 'IEV', 'INI', 'INN', 'IST', 'JER', 'JMK', 'JTR', 'KBP', 'KEF',
'KGS', 'KIR', 'KIV', 'KLU', 'KRK', 'KRS', 'KTW', 'KUN', 'LBA', 'LCA', 'LCY', 'LED', 'LEI', 'LEJ',
'LGG', 'LGW', 'LHR', 'LIL', 'LIN', 'LIS', 'LJU', 'LNZ', 'LPA', 'LPL', 'LTN', 'LUG', 'LUX', 'LYS',
'MAD', 'MAH', 'MAN', 'MJV', 'MLA', 'MMX', 'MPL', 'MRS', 'MSQ', 'MST', 'MUC', 'MXP', 'NAP', 'NCE',
'NCL', 'NOC', 'NRN', 'NTE', 'NUE', 'NYO', 'ODS', 'OLB', 'OPO', 'ORK', 'ORY', 'OSL', 'OST', 'OTP',
'OUL', 'PAD', 'PDL', 'PEG', 'PFO', 'PIK', 'PMI', 'PMO', 'POZ', 'PRG', 'PRN', 'PSA', 'PSR', 'PUY',
'REU', 'RHO', 'RIX', 'RTM', 'RVN', 'SAW', 'SCQ', 'SDR', 'SEN', 'SJJ', 'SKG', 'SKP', 'SNN', 'SOF',
'SOU', 'SPU', 'STN', 'STR', 'SUF', 'SVG', 'SVO', 'SVQ', 'SVX', 'SXB', 'SXF', 'SZG', 'TBS', 'TFN',
'TFS', 'TGD', 'TIA', 'TIV', 'TKU', 'TLL', 'TLN', 'TLS', 'TMP', 'TOS', 'TPS', 'TRD', 'TRF', 'TRN',
'TSE', 'TSF', 'TSR', 'TXL', 'TZL', 'TZX', 'VAA', 'VAR', 'VCE', 'VIE', 'VKO', 'VLC', 'VNO', 'VRN',
'VST', 'WAW', 'WMI', 'WRO', 'XRY', 'ZAD', 'ZAG', 'ZAZ', 'ZRH', 'ZTH']
eu_countries = ['Albania', 'Armenia', 'Austria', 'Azerbaijan', 'Belarus', 'Belgium', 'Bosnia And Herzegovina',
'Bulgaria', 'Croatia', 'Cyprus', 'Czech Republic', 'Denmark', 'Estonia', 'Faroe Islands', 'Finland',
'France', 'Georgia', 'Germany', 'Gibraltar', 'Greece', 'Hungary', 'Iceland', 'Ireland', 'Italy',
'Kosovo', 'Latvia', 'Lithuania', 'Luxembourg', 'Macedonia', 'Malta', 'Moldova', 'Monaco', 'Montenegro',
'Netherlands', 'Norway', 'Poland', 'Portugal', 'Romania', 'Russia', 'Serbia', 'Slovakia', 'Slovenia',
'Spain', 'Sweden', 'Switzerland', 'Ukraine', 'United Kingdom']
"""
eu_airlines_names = ['Aegean Airlines', 'Aer Lingus', 'Aeroflot', 'Air Baltic', 'Air Europa', 'Air France', 'Alitalia',
'Austrian Airlines', 'Blue Air', 'BRA', 'British Airways', 'Brussels Airlines', 'Condor', 'EasyJet',
'Eurowings', 'Finnair', 'Flybe', 'Germania', 'HOP!', 'Iberia', 'Icelandair', 'Jet2', 'KLM', 'LOT',
'Lufthansa', 'Norwegian', 'Ryanair', 'S7 Airlines', 'SAS', 'Swiftair', 'Swiss', 'TAP Portugal',
'Thomas Cook Airlines', 'Transavia', 'Travel Service', 'TUI fly', 'Ukraine Int. Airlines', 'Ural Airlines',
'Virgin Atlantic', 'Volotea', 'Vueling', 'Wideroe', 'Wizz Air']
eu_airlines_iata = ['a3-aee', 'ei-ein', 'su-afl', 'bt-bti', 'ux-aea', 'af-afr', 'az-aza', 'os-aua', '0b-bms', 'tf-brx',
'ba-baw', 'sn-bel', 'de-cfg', 'u2-ezy', 'ew-ewg', 'ay-fin', 'be-bee', 'st-gmi', 'a5-hop', 'ib-ibe',
'fi-ice', 'ls-exs', 'kl-klm', 'lo-lot', 'lh-dlh', 'dy-nax', 'fr-ryr', 's7-sbi', 'sk-sas', 'wt-swt',
'lx-swr', 'tp-tap', 'mt-tcx', 'hv-tra', 'qs-tvs', 'x3-tui', 'ps-aui', 'u6-svr', 'vs-vir', 'v7-voe',
'vy-vlg', 'wf-wif', 'w6-wzz']
"""
eu_airlines_names = ['KLM']
eu_airlines_iata = ['kl-klm']
for airline in eu_airlines_iata:
s = requests.session()
r = s.get('https://www.flightradar24.com/data/airlines/' + airline + '/routes', proxies = myProxy, headers = headers)
my_json = json.loads(r.text.split('arrRoutes=')[-1].split(', arrDates=')[0])
iata_list = [element[item]['iata'] for element in my_json for item in element]
iata_list2 = []
iata_list1 = set(iata_list)
for i in iata_list1:
if i not in eu_airports:
pass
else:
iata_list2.append(i)
print(len(iata_list2))
today = datetime.datetime.today()
tomorrow1 = datetime.datetime.today() + datetime.timedelta(1)
tomorrow2 = datetime.datetime.today() + datetime.timedelta(2)
tomorrow3 = datetime.datetime.today() + datetime.timedelta(3)
tomorrow4 = datetime.datetime.today() + datetime.timedelta(4)
tomorrow5 = datetime.datetime.today() + datetime.timedelta(5)
tomorrow6 = datetime.datetime.today() + datetime.timedelta(6)
date = datetime.datetime.strftime(today, "%Y-%m-%d")
date1 = datetime.datetime.strftime(tomorrow1, "%Y-%m-%d")
date2 = datetime.datetime.strftime(tomorrow2, "%Y-%m-%d")
date3 = datetime.datetime.strftime(tomorrow3, "%Y-%m-%d")
date4 = datetime.datetime.strftime(tomorrow4, "%Y-%m-%d")
date5 = datetime.datetime.strftime(tomorrow5, "%Y-%m-%d")
date6 = datetime.datetime.strftime(tomorrow6, "%Y-%m-%d")
countries = []
airports_departure = []
airports_arrival = []
dailyflights = []
distances = []
flights = []
aircrafts = []
airlines = []
for airline, name in zip(eu_airlines_iata, eu_airlines_names):
url = 'https://www.flightradar24.com/data/airlines/' + airline + '/routes?get-airport-arr-dep={}'
print(url)
for abbr in iata_list2:
try:
cookie = r.cookies.get_dict()
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:52.0) Gecko/20100101 Firefox/52.0", "Content-Type": "application/json", "x-fetch": "true"}
response = s.get(url.format(abbr), cookies=cookie, headers=headers, proxies = myProxy).json()
for country in response['arrivals']:
if country in eu_countries:
countries.append(country)
daily = response['arrivals'][country]['number']['flights']
if abbr not in airports_departure and abbr not in airports_arrival:
for iata in response['arrivals'][country]['airports']:
if iata in eu_airports and abbr not in airports_departure:
airports_arrival.append(iata)
dist = response['arrivals'][country]['airports'][iata]['distance']
distances.append(int(round(dist/1000)))
for flight in response['arrivals'][country]['airports'][iata]['flights']:
aircr = response['arrivals'][country]['airports'][iata]["flights"][flight]["utc"][date]["aircraft"]
print('Scraping data...')
if abbr not in airports_departure:
airports_departure.append(abbr)
aircrafts.append(aircr)
airlines.append(name)
dailyflights.append(daily)
else:
pass
except (IndexError, KeyError, TypeError, ValueError):
try:
if abbr not in airports_departure:
aircr = response['arrivals'][country]['airports'][iata]["flights"][flight]["utc"][date2]["aircraft"]
aircrafts.append(aircr)
airlines.append(name)
airports_departure.append(abbr)
dailyflights.append(daily)
except (IndexError, KeyError, TypeError, ValueError):
try:
if abbr not in airports_departure:
aircr = response['arrivals'][country]['airports'][iata]["flights"][flight]["utc"][date3]["aircraft"]
aircrafts.append(aircr)
airlines.append(name)
airports_departure.append(abbr)
dailyflights.append(daily)
except (IndexError, KeyError, TypeError, ValueError):
try:
if abbr not in airports_departure:
aircr = response['arrivals'][country]['airports'][iata]["flights"][flight]["utc"][date4]["aircraft"]
aircrafts.append(aircr)
airlines.append(name)
airports_departure.append(abbr)
dailyflights.append(daily)
except (IndexError, KeyError, TypeError, ValueError):
try:
if abbr not in airports_departure:
aircr = response['arrivals'][country]['airports'][iata]["flights"][flight]["utc"][date5]["aircraft"]
aircrafts.append(aircr)
airlines.append(name)
airports_departure.append(abbr)
dailyflights.append(daily)
except (IndexError, KeyError, TypeError, ValueError):
try:
if abbr not in airports_departure:
aircr = response['arrivals'][country]['airports'][iata]["flights"][flight]["utc"][date6]["aircraft"]
aircrafts.append(aircr)
airlines.append(name)
airports_departure.append(abbr)
dailyflights.append(daily)
except (IndexError, KeyError, TypeError, ValueError):
if abbr not in airports_departure:
aircrafts.append('')
airlines.append('')
airports_departure.append('')
dailyflights.append(0)
print('Airline: ' + str(airlines))
print('Departure: ' + str(airports_departure))
print('Arrival: ' + str(airports_arrival))
print('Aircraft types: ' + str(aircrafts))
print('Distance (km): ' + str(distances))
print('Daily flights: ' + str(dailyflights))
print('Airline: ' + str(len(airlines)))
print('Departure: ' + str(len(airports_departure)))
print('Arrival: ' + str(len(airports_arrival)))
print('Aircrafts: ' + str(len(aircrafts)))
print('Distance: ' + str(len(distances)))
print('Daily flights: ' + str(len(dailyflights)))
print('Sum daily flights: ' + str(sum(dailyflights)))
df = pd.DataFrame({'Airline': airlines,
'Departure': airports_departure,
'Arrivals': airports_arrival,
'Aircraft': aircrafts,
'Distance': distances,
'Daily flights': dailyflights})
print(df)
这对于荷航很有效,因为它的所有航班都只有一个机场枢纽(史基浦机场)。但是,当我尝试为像Ryanair这样的航空公司收集数据时遇到问题,该航空公司在欧洲拥有多个枢纽。在代码中,这可以通过将列表eu_airlines_names和eu_airlines_iata中的元素从“ KLM”和“ kl-klm”更改为“ Ryanair”和“ fr-ryr”来完成。
如何调整刮板以解决此问题?另外,是否可以遍历列表eu_airlines_iata中的几个元素,而不是一个一个地做呢?另外,现在代码仅会刮擦随机的飞机型号,但是有没有办法代替刮擦整个星期最常用的飞机呢?
理想的输出应该是包含以下内容的单独列表:
对于我在代码中的文档字符串中放置的eu_airlines_iata列表中的每个航空公司。