从Flightradar24抓取数据

时间:2018-08-15 15:09:48

标签: python python-3.x web-scraping python-requests web-crawler

我正在尝试编写一个爬虫,针对一组欧洲航空公司,获取欧洲各机场之间每日航班的数据。以荷航(KLM)为例,在以下网站上点击地图上的点即可看到相应数据(数据显示在地图下方的表格中): https://www.flightradar24.com/data/airlines/kl-klm/routes

我目前有以下代码:

import requests
import json
import datetime
import pandas as pd

# Proxy endpoints, keyed by URL scheme, passed as `proxies=` to the requests
# made by this script.
myProxy = {
    "http": "http://10.120.118.49:8080",
    "https": "https://10.120.118.49:8080",
}

# Browser-like User-Agent sent with the route-page requests.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:52.0) Gecko/20100101 Firefox/52.0",
}

# IATA codes of the airports this script accepts as European. Airports scraped
# from the site are kept only if their code appears here.
eu_airports = """
    AAL AAR ABZ ACE ADA ADB AER AES AGP AHO AJA ALA ALC AMS
    ANR AOI ARN ATH AYT BCN BDS BEG BER BES BFS BGO BGY BHD
    BHX BIA BIO BIQ BJV BLL BLQ BMA BOD BOJ BOO BRE BRI BRN
    BRQ BRS BRU BTS BUD BVA CAG CDG CFU CGN CHQ CIA CIY CLJ
    CPH CRL CTA CWL DBV DEB DLM DME DRS DTM DUB DUS EDI EGC
    EIN EMA ESB EVN FAO FCO FDH FKB FLR FMM FMO FNC FRA FSC
    FUE GDN GLA GOA GOT GRO GRQ GRZ GVA GYD HAJ HAM HAU HEL
    HER HHN HUY IAS IBZ IEV INI INN IST JER JMK JTR KBP KEF
    KGS KIR KIV KLU KRK KRS KTW KUN LBA LCA LCY LED LEI LEJ
    LGG LGW LHR LIL LIN LIS LJU LNZ LPA LPL LTN LUG LUX LYS
    MAD MAH MAN MJV MLA MMX MPL MRS MSQ MST MUC MXP NAP NCE
    NCL NOC NRN NTE NUE NYO ODS OLB OPO ORK ORY OSL OST OTP
    OUL PAD PDL PEG PFO PIK PMI PMO POZ PRG PRN PSA PSR PUY
    REU RHO RIX RTM RVN SAW SCQ SDR SEN SJJ SKG SKP SNN SOF
    SOU SPU STN STR SUF SVG SVO SVQ SVX SXB SXF SZG TBS TFN
    TFS TGD TIA TIV TKU TLL TLN TLS TMP TOS TPS TRD TRF TRN
    TSE TSF TSR TXL TZL TZX VAA VAR VCE VIE VKO VLC VNO VRN
    VST WAW WMI WRO XRY ZAD ZAG ZAZ ZRH ZTH
""".split()  # split() with no argument drops the surrounding whitespace/newlines

# Country names this script accepts as European. They are compared against the
# country keys found under 'arrivals' in the site's JSON, so spelling and
# capitalisation must match those keys exactly.
eu_countries = [
    'Albania',
    'Armenia',
    'Austria',
    'Azerbaijan',
    'Belarus',
    'Belgium',
    'Bosnia And Herzegovina',
    'Bulgaria',
    'Croatia',
    'Cyprus',
    'Czech Republic',
    'Denmark',
    'Estonia',
    'Faroe Islands',
    'Finland',
    'France',
    'Georgia',
    'Germany',
    'Gibraltar',
    'Greece',
    'Hungary',
    'Iceland',
    'Ireland',
    'Italy',
    'Kosovo',
    'Latvia',
    'Lithuania',
    'Luxembourg',
    'Macedonia',
    'Malta',
    'Moldova',
    'Monaco',
    'Montenegro',
    'Netherlands',
    'Norway',
    'Poland',
    'Portugal',
    'Romania',
    'Russia',
    'Serbia',
    'Slovakia',
    'Slovenia',
    'Spain',
    'Sweden',
    'Switzerland',
    'Ukraine',
    'United Kingdom',
]

# The bare triple-quoted string below is not assigned to anything, so it has no
# effect at run time. It holds the full airline name / route-slug lists in
# disabled form; the active configuration follows it.
"""
eu_airlines_names = ['Aegean Airlines', 'Aer Lingus', 'Aeroflot', 'Air Baltic', 'Air Europa', 'Air France', 'Alitalia', 
                 'Austrian Airlines', 'Blue Air', 'BRA', 'British Airways', 'Brussels Airlines', 'Condor', 'EasyJet', 
                 'Eurowings', 'Finnair', 'Flybe', 'Germania', 'HOP!', 'Iberia', 'Icelandair', 'Jet2', 'KLM', 'LOT', 
                 'Lufthansa', 'Norwegian', 'Ryanair', 'S7 Airlines', 'SAS', 'Swiftair', 'Swiss', 'TAP Portugal', 
                 'Thomas Cook Airlines', 'Transavia', 'Travel Service', 'TUI fly', 'Ukraine Int. Airlines', 'Ural Airlines', 
                 'Virgin Atlantic', 'Volotea', 'Vueling', 'Wideroe', 'Wizz Air']

eu_airlines_iata = ['a3-aee', 'ei-ein', 'su-afl', 'bt-bti', 'ux-aea', 'af-afr', 'az-aza', 'os-aua', '0b-bms', 'tf-brx', 
                'ba-baw', 'sn-bel', 'de-cfg', 'u2-ezy', 'ew-ewg', 'ay-fin', 'be-bee', 'st-gmi', 'a5-hop', 'ib-ibe', 
                'fi-ice', 'ls-exs', 'kl-klm', 'lo-lot', 'lh-dlh', 'dy-nax', 'fr-ryr', 's7-sbi', 'sk-sas', 'wt-swt', 
                'lx-swr', 'tp-tap', 'mt-tcx', 'hv-tra', 'qs-tvs', 'x3-tui', 'ps-aui', 'u6-svr', 'vs-vir', 'v7-voe', 
                'vy-vlg', 'wf-wif', 'w6-wzz']
"""
# Active configuration: only KLM is scraped. The two lists are consumed
# pairwise with zip() by the scraping loop, so entry i of one must describe the
# same airline as entry i of the other.
# Display name stored in the 'Airline' result column.
eu_airlines_names = ['KLM']
# Slug inserted into the flightradar24.com/data/airlines/<slug>/routes URL.
eu_airlines_iata = ['kl-klm']

# Collect the IATA code of every airport that appears in the route table of any
# configured airline. The list is created before the loop and extended per
# airline, so earlier airlines' airports are not discarded when more than one
# airline is configured, and the name exists even if no airline is configured.
iata_list = []

for airline in eu_airlines_iata:
    # `s` (session) and `r` (last route-page response) intentionally stay bound
    # after the loop: the scraping step further down reuses the session and
    # replays the cookies from `r`.
    s = requests.Session()
    r = s.get('https://www.flightradar24.com/data/airlines/' + airline + '/routes', proxies=myProxy, headers=headers)
    # The route table is embedded in the page source between the markers
    # 'arrRoutes=' and ', arrDates='; cut that span out and parse it as JSON.
    my_json = json.loads(r.text.split('arrRoutes=')[-1].split(', arrDates=')[0])
    # Each element maps keys to airport records; keep every record's 'iata'.
    iata_list.extend(element[item]['iata'] for element in my_json for item in element)

# Deduplicate the scraped airport codes, keep only those on the European
# whitelist, and report how many remain.
iata_list1 = set(iata_list)
iata_list2 = [code for code in iata_list1 if code in eu_airports]

print(len(iata_list2))

# Take the clock reading exactly once and derive every other day from it.
# Reading the clock separately for each offset would make the gaps differ from
# whole days by a few microseconds and, when run across midnight, could yield
# a duplicated or skipped calendar date.
today = datetime.datetime.today()
tomorrow1, tomorrow2, tomorrow3, tomorrow4, tomorrow5, tomorrow6 = (
    today + datetime.timedelta(days=offset) for offset in range(1, 7)
)

# ISO-style 'YYYY-MM-DD' strings for today and the following six days; the
# scraping loop uses them as keys into the per-flight "utc" schedule.
date, date1, date2, date3, date4, date5, date6 = (
    day.strftime("%Y-%m-%d")
    for day in (today, tomorrow1, tomorrow2, tomorrow3, tomorrow4, tomorrow5, tomorrow6)
)

# Accumulators filled while scraping; each name gets its own empty list.
# Six of them (airlines, airports_departure, airports_arrival, aircrafts,
# distances, dailyflights) become the DataFrame columns at the end of the script.
(countries, airports_departure, airports_arrival, dailyflights,
 distances, flights, aircrafts, airlines) = ([] for _ in range(8))

# Main scraping pass. For each configured airline, query the per-airport
# arrivals/departures endpoint for every candidate airport in iata_list2 and
# append to the parallel result lists (airlines, airports_departure,
# airports_arrival, aircrafts, distances, dailyflights).
# NOTE(review): the `abbr not in airports_departure` guards mean at most one
# arrival airport is recorded per departure airport, and because the result
# lists are shared across airlines, an airport recorded for one airline is
# skipped for every later airline.
for airline, name in zip(eu_airlines_iata, eu_airlines_names):
    # `{}` is filled with the departure airport code for each request.
    url = 'https://www.flightradar24.com/data/airlines/' + airline + '/routes?get-airport-arr-dep={}'
    print(url)

    for abbr in iata_list2:
        try:
            # `r` and `s` are the response and session left bound by the earlier
            # route-page loop; the cookies of `r` are replayed on every request.
            cookie = r.cookies.get_dict()
            # NOTE: this rebinds the module-level `headers`, adding the
            # Content-Type and x-fetch entries to the User-Agent.
            headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:52.0) Gecko/20100101 Firefox/52.0", "Content-Type": "application/json", "x-fetch": "true"}
            # NOTE(review): a non-JSON body presumably surfaces as ValueError and
            # lands in the fallback chain below - confirm against requests docs.
            response = s.get(url.format(abbr), cookies=cookie, headers=headers, proxies = myProxy).json()

            # Iterating response['arrivals'] yields the country keys that are
            # used to index it below.
            for country in response['arrivals']:
                if country in eu_countries:
                    countries.append(country)
                    # Flight count reported for this whole country, not per airport.
                    daily = response['arrivals'][country]['number']['flights']

                # This check sits at the same level as the eu_countries check, so
                # it runs for every country, European or not.
                # NOTE(review): for a non-European country `daily` keeps whatever
                # value it had before (or is unbound on the very first pass).
                if abbr not in airports_departure and abbr not in airports_arrival:
                    for iata in response['arrivals'][country]['airports']:
                        if iata in eu_airports and abbr not in airports_departure:
                            airports_arrival.append(iata)
                            dist = response['arrivals'][country]['airports'][iata]['distance']
                            # Divided by 1000 and rounded; the report labels this
                            # column "Distance (km)", so `dist` is presumably in metres.
                            distances.append(int(round(dist/1000)))
                            # `aircr` is overwritten on every pass, so only the last
                            # flight's aircraft for today's date survives. A flight
                            # without an entry for `date` raises KeyError, which
                            # jumps to the fallback chain after airports_arrival and
                            # distances have already been extended.
                            for flight in response['arrivals'][country]['airports'][iata]['flights']:
                                aircr = response['arrivals'][country]['airports'][iata]["flights"][flight]["utc"][date]["aircraft"]

                            print('Scraping data...')

                            # Record the departure side of the row; once abbr is in
                            # airports_departure the guards above skip every
                            # remaining airport and country for this abbr.
                            if abbr not in airports_departure:
                                airports_departure.append(abbr)
                                aircrafts.append(aircr)
                                airlines.append(name)
                                dailyflights.append(daily)

                else:
                    pass                    

        # Fallback chain: retry the aircraft lookup with later dates, one nesting
        # level per date. It tries date2 through date6; date1 is never tried.
        # NOTE(review): `response`, `country`, `iata`, `flight` and `daily` are
        # whatever they were when the exception was raised, so if the request
        # itself failed they still refer to the previous airport's data. Any
        # exception also ends processing of the remaining countries for this abbr.
        except (IndexError, KeyError, TypeError, ValueError):
            try:
                if abbr not in airports_departure:
                    aircr = response['arrivals'][country]['airports'][iata]["flights"][flight]["utc"][date2]["aircraft"]
                    aircrafts.append(aircr)  
                    airlines.append(name)
                    airports_departure.append(abbr)
                    dailyflights.append(daily)
            except (IndexError, KeyError, TypeError, ValueError):
                try:
                    if abbr not in airports_departure:
                        aircr = response['arrivals'][country]['airports'][iata]["flights"][flight]["utc"][date3]["aircraft"]
                        aircrafts.append(aircr)  
                        airlines.append(name)
                        airports_departure.append(abbr)
                        dailyflights.append(daily)
                except (IndexError, KeyError, TypeError, ValueError):
                    try:
                        if abbr not in airports_departure:
                            aircr = response['arrivals'][country]['airports'][iata]["flights"][flight]["utc"][date4]["aircraft"]
                            aircrafts.append(aircr)  
                            airlines.append(name)
                            airports_departure.append(abbr)
                            dailyflights.append(daily)
                    except (IndexError, KeyError, TypeError, ValueError):
                        try:
                            if abbr not in airports_departure:
                                aircr = response['arrivals'][country]['airports'][iata]["flights"][flight]["utc"][date5]["aircraft"]
                                aircrafts.append(aircr)  
                                airlines.append(name)
                                airports_departure.append(abbr)
                                dailyflights.append(daily)
                        except (IndexError, KeyError, TypeError, ValueError):
                            try:
                                if abbr not in airports_departure:
                                    aircr = response['arrivals'][country]['airports'][iata]["flights"][flight]["utc"][date6]["aircraft"]
                                    aircrafts.append(aircr)  
                                    airlines.append(name)
                                    airports_departure.append(abbr)
                                    dailyflights.append(daily)
                            except (IndexError, KeyError, TypeError, ValueError):
                                # Every date failed: append placeholder values.
                                # NOTE(review): the placeholder departure is '' rather
                                # than abbr, and airports_arrival / distances are not
                                # padded here, so the result lists can end up with
                                # different lengths.
                                if abbr not in airports_departure:
                                    aircrafts.append('')
                                    airlines.append('')
                                    airports_departure.append('')
                                    dailyflights.append(0)


# Dump every result column in full.
print('Airline: ', airlines, sep='')
print('Departure: ', airports_departure, sep='')
print('Arrival: ', airports_arrival, sep='')
print('Aircraft types: ', aircrafts, sep='')
print('Distance (km): ', distances, sep='')
print('Daily flights: ', dailyflights, sep='')

# Column lengths side by side (a quick check that the columns line up),
# followed by the total number of daily flights.
print('Airline:           ', len(airlines), sep='')
print('Departure:         ', len(airports_departure), sep='')
print('Arrival:           ', len(airports_arrival), sep='')
print('Aircrafts:         ', len(aircrafts), sep='')
print('Distance:          ', len(distances), sep='')
print('Daily flights:     ', len(dailyflights), sep='')
print('Sum daily flights: ', sum(dailyflights), sep='')


# Assemble the table; zip keeps the header order and the column order in step.
df = pd.DataFrame(dict(zip(
    ['Airline', 'Departure', 'Arrivals', 'Aircraft', 'Distance', 'Daily flights'],
    [airlines, airports_departure, airports_arrival, aircrafts, distances, dailyflights],
)))
print(df)

这段代码对荷航有效,因为荷航的所有航班都只围绕一个枢纽机场(史基浦机场)。但是,当我尝试为瑞安航空(Ryanair)这类在欧洲拥有多个枢纽的航空公司收集数据时,就遇到了问题。要重现这个问题,只需把代码中列表 eu_airlines_names 和 eu_airlines_iata 的元素从 "KLM" 和 "kl-klm" 改为 "Ryanair" 和 "fr-ryr"。

如何调整爬虫以解决这个问题?另外,能否一次遍历列表 eu_airlines_iata 中的多个元素,而不是每次手动改成一家航空公司?此外,目前代码抓取到的机型实际上是随机的(取决于最后遍历到的航班),有没有办法改为抓取一整周内最常用的机型?

理想的输出应该是包含以下内容的单独列表:

  • 航空公司名称
  • 出发机场
  • 到达机场
  • 最常用的飞机类型
  • 距离
  • 每日航班数

以上输出需要针对代码中三引号字符串(文档字符串)里列出的 eu_airlines_iata 列表中的每一家航空公司分别生成。

0 个答案:

没有答案