Web scraping HTML gas stations

Date: 2018-03-29 18:16:03

Tags: python web-scraping beautifulsoup

Given this link http://www.nj.gov/treasury/administration/statewide-support/motor-fuel-locations.shtml, I want to scrape every gas station and its information.

import requests
from bs4 import BeautifulSoup

page = requests.get("http://www.nj.gov/treasury/administration/statewide-support/motor-fuel-locations.shtml")
soup = BeautifulSoup(page.content, 'html.parser')
for x in soup.find_all('p'):
    print(x)

What is the next step to extract the correct data?

2 Answers:

Answer 0 (score: 3):

Updated: here is the final code, building on @Dan-Dev's answer. It's a bit messy... forgive me, I didn't have time to write something shorter.

import re
import requests
from bs4 import BeautifulSoup
from pprint import pprint

def is_phone_number(txt):
    r = re.compile(r"(\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4})")
    return r.match(txt)

def is_gas_type(txt):
    return any(term in txt.lower() for term in ['lead', 'diesel'])

def is_lat_lon(txt):
    return any(term in txt.lower() for term in ['lat', 'lon'])

def is_hour_of_operation(txt):
    return any(term in txt.lower() for term in ['24 hrs', ' am ', ' pm ', 'm-f'])

def str_colon_list_to_str_float_dict(rlist):
    """["a:1.0", "b:2.0"] => {"a":1.0, "b":2.0}"""
    intermediate_dict = dict(map(lambda s: s.split(':'), rlist))
    return dict((k, float(v)) for k, v in intermediate_dict.items())

page = requests.get("http://www.nj.gov/treasury/administration/statewide-support/motor-fuel-locations.shtml")
soup = BeautifulSoup(page.content, 'html.parser')
table = soup.find_all('table')[5]

gas_stations = []
for x in table.find_all('td', {'valign': 'top', 'colspan': None, 'width': None}):
    gas_station = []
    # split text on line breaks and then remove whitespace
    for y in x.text.splitlines():
        line = ' '.join(y.split())
        gas_station.append(line)
    # skip lines consisting of only empty strings
    if not ('' in set(gas_station) and len(set(gas_station)) == 1):
        gas_stations.append(gas_station)

gas_stations_dict = {}
for gas_station in gas_stations:
    gas_station_dict = {}
    address_list = []
    lat_long_list = []
    for i, g in enumerate(gas_station):
        if i == 0:
            gas_station_dict['Name'] = g
        elif is_phone_number(g):
            gas_station_dict['Phone Number'] = g
        elif is_lat_lon(g):
            lat_long_list.append(g)
        elif is_gas_type(g):
            gas_station_dict['Gas Type'] = g
        elif is_hour_of_operation(g):
            gas_station_dict['Hours of Operation'] = g
        else:
            address_list.append(g)
    gas_station_dict['Coordinates'] = str_colon_list_to_str_float_dict(lat_long_list)
    gas_station_dict['Address'] = ' '.join(address_list)

    gas_stations_dict[gas_station_dict['Name']] = gas_station_dict

pprint(gas_stations_dict)

Result:

    {'Bayside Facility': {'Address': '4294 Rt. 47 Leesburg',
                           'Coordinates': {'Latitude': 39.23339997,
                                           'Longitude': -74.96568202},
                           'Gas Type': 'Unleaded / Diesel',
                           'Hours of Operation': 'Open 24 Hrs',
                           'Name': 'Bayside Facility',
                           'Phone Number': '856-785-0040 X-5429'},
     'Bedminster DOT': {'Address': '455 Rt. 202/206 South Pluckemin',
                         'Coordinates': {'Latitude': 40.65123677,
                                         'Longitude': -74.64499021},
                         'Gas Type': 'Unleaded / Diesel',
                         'Hours of Operation': 'Open 24 Hrs',
                         'Name': 'Bedminster DOT',
                         'Phone Number': '908-234-2130'},
        ...
        }

My old answer:
I tried using Selector Gadget, as I mentioned in the comments, but I couldn't find any consistent pattern in the HTML that would pick up all the station names. I scrape a lot, and I run into the same problem with many government sites; I don't know whether it's incompetence or deliberate anti-scraping... Anyway, here is some code that prints out some of the information:

import requests
from bs4 import BeautifulSoup, NavigableString

page = requests.get("http://www.nj.gov/treasury/administration/statewide-support/motor-fuel-locations.shtml")
soup = BeautifulSoup(page.content, 'html.parser')
for x in soup.find_all('p'):
    for y in x:
        if isinstance(y, NavigableString):
            print(y)
        else:
            for z in y:
                if isinstance(z, NavigableString):
                    print(z)

From there you can modify it based on the information you need. Looking at the output, it appears that the last line of each station group is the "Longitude" line, so that line can be used to split the text into one record per station (see the sketch below).
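A rough sketch of that splitting idea, building on the same p-tag walk as above. The only assumption is that a line beginning with "Longitude" really does close each station's record:

import requests
from bs4 import BeautifulSoup, NavigableString

page = requests.get("http://www.nj.gov/treasury/administration/statewide-support/motor-fuel-locations.shtml")
soup = BeautifulSoup(page.content, 'html.parser')

# Collect the visible text lines in document order, as in the snippet above.
lines = []
for p in soup.find_all('p'):
    for child in p:
        if isinstance(child, NavigableString):
            lines.append(str(child).strip())
        else:
            lines.extend(str(s).strip() for s in child if isinstance(s, NavigableString))

# Close out a record whenever a line starts with "Longitude".
stations, current = [], []
for line in lines:
    if not line:
        continue
    current.append(line)
    if line.lower().startswith('longitude'):
        stations.append(current)
        current = []

print(len(stations))    # number of records found
if stations:
    print(stations[0])  # first station's raw lines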

Finally, once you're done, I would double-check the results to make sure you have all the information you need. For example, Folsom DOT is not picked up when you feed "p" tags to find_all.
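One way to make that final check concrete is a small sanity test. The station names below are only examples (taken from the output further down and from the Folsom DOT remark above); substitute whatever list of names your scrape actually produced:

# Hypothetical list of names from your scrape -- replace with your real results.
scraped_names = ['Bayside Facility', 'Bedminster DOT', 'Berlin DOT']

# Stations you know are on the page and therefore expect to see.
expected = ['Bayside Facility', 'Bedminster DOT', 'Folsom DOT']

for name in expected:
    print(name, 'OK' if name in scraped_names else 'MISSING')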

Answer 1 (score: 1):

Given the HTML you have few choices, so you can deselect what you don't want: you want the 6th table, you don't want td elements with a "colspan" attribute or any "width" attribute, and the td elements must have valign="top".



import requests
from bs4 import BeautifulSoup

page = requests.get("http://www.nj.gov/treasury/administration/statewide-support/motor-fuel-locations.shtml")
soup = BeautifulSoup(page.content, 'html.parser')
table = soup.find_all('table')[5]
for x in table.find_all('td', {'valign': 'top', 'colspan': None, 'width': None}):
    print(x.text)
    print('#############')

Updated in response to a comment, here is code that collects the data into a dictionary:

import requests
from bs4 import BeautifulSoup
import pprint
import re

page = requests.get("http://www.nj.gov/treasury/administration/statewide-support/motor-fuel-locations.shtml")
soup = BeautifulSoup(page.content, 'html.parser')
table = soup.find_all('table')[5]

tmps = [i.text for i in table.find_all('td', {'valign': 'top', 'colspan': None, 'width': None})]
my_dict = dict()
for tmp in tmps:
    if len(tmp.strip()) != 0:
        # Use the first line (the station name) as the key; collapse whitespace in key and value.
        my_dict[re.sub(r'\s+', ' ', tmp.split('\n', 1)[0]).strip()] = re.sub(r'\s+', ' ', tmp).strip()

pp = pprint.PrettyPrinter(indent=4)
pp.pprint(my_dict)

Output (note that it misses two gas stations):

{   'Bayside Facility': 'Bayside Facility 4294 Rt. 47 Leesburg Open 24 Hrs '
                        'Unleaded / Diesel 856-785-0040 X-5429 Latitude: '
                        '39.23339997 Longitude: -74.96568202',
    'Bedminster DOT': 'Bedminster DOT 455 Rt. 202/206 South Pluckemin Open 24 '
                      'Hrs Unleaded / Diesel 908-234-2130 Latitude: '
                      '40.65123677 Longitude: -74.64499021',
    'Berlin DOT': 'Berlin DOT 36 Walker Ave. Berlin Open 24 Hrs Unleaded / '
                  'Diesel 856-767-7717 Latitude: 39.80369329 Longitude: '
                  '-74.93442722',
    'Bloomsbury DOT': 'Bloomsbury DOT 1000 Rt. 173 Bloomsbury Open 24 Hrs '
                      'Unleaded / Diesel 908-479-4851 Latitude: 40.66078600 '
                      'Longitude: -75.06664165',
    'Bordentown DOT': 'Bordentown DOT Dunns Mill Rd. -off Rt. 130 Bordentown '
                      'Unleaded -Open 24 Hrs Diesel – 7:30 am -3:45 pm M-F '
                      '609-298-2980 Latitude: 40.13178135 Longitude: '
                      '-74.71658907',
...

Since those two entries have a width attribute, they cannot be selected in the HTML with this filter either.
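If you also need those two stations, one possible workaround (a sketch, not tested against the live page) is to drop the colspan/width filter and instead keep any valign="top" cell that describes exactly one station, i.e. whose text mentions "Latitude" exactly once:

import re
import requests
from bs4 import BeautifulSoup

page = requests.get("http://www.nj.gov/treasury/administration/statewide-support/motor-fuel-locations.shtml")
soup = BeautifulSoup(page.content, 'html.parser')
table = soup.find_all('table')[5]

# A cell describing a single station mentions "Latitude" exactly once.
# Container cells that wrap many stations mention it many times, and
# layout-only cells not at all, so both get skipped without relying on
# the colspan/width attributes.
for td in table.find_all('td', {'valign': 'top'}):
    if td.text.lower().count('latitude') == 1:
        print(re.sub(r'\s+', ' ', td.text).strip())
        print('#############')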