使用 Python 根据特定文本抓取网页中的 HTML 表格

时间:2018-09-14 16:37:06

标签: python html web-scraping html-table

我正在尝试使用 Python 抓取网页中的 HTML 表格。HTML 页面中有很多表格,但是我只想抓取其中的某一个。我使用 Beautiful Soup 进行网页抓取。

我的代码如下:

# Download the page for 10 September 2018 and parse it with the 'html.parser' backend.
# NOTE(review): `get` and `BeautifulSoup` are not imported in this snippet;
# presumably `from requests import get` and `from bs4 import BeautifulSoup` — confirm.
page = get("http://uobgoldprice.com/history/2018/September/10/")
html = BeautifulSoup(page.content, 'html.parser')

# Visit every <tr> in the document (rows of all tables, not just one table).
for p in html.select('tr'):
    # p.text is the text of the whole row, so this prints only rows whose
    # complete text equals the string exactly.
    # NOTE(review): a row with additional cells or surrounding whitespace
    # would not match — likely why nothing useful is printed; confirm.
    if p.text == "ARGOR CAST BAR":
        print (p.text)

我只需要显示标题为“Rate as at Monday, 10 September 2018”(截至2018年9月10日星期一的价格)的那个表格。

我该怎么做?

2 个答案:

答案 0 :(得分:1)

您需要先找到包含该文本的元素,然后再找到它所在的父级表格(table):

import re
import requests
from bs4 import BeautifulSoup

# Fetch the page; give up after 30 seconds and fail loudly on HTTP errors
# instead of parsing an error page.
page = requests.get("http://uobgoldprice.com/history/2018/September/10/", timeout=30)
page.raise_for_status()
html = BeautifulSoup(page.content, 'html.parser')

# Locate the text node holding the heading, then climb to the <table> that
# encloses it. `string=` and `find_parent` are the current names for the
# legacy `text=` and `findParent` spellings.
element = html.find(string=re.compile('Rate as at Monday, 10 September 2018'))
if element is None:
    # find() returns None when nothing matches; report it rather than
    # crashing with AttributeError on the next call.
    print('Heading not found on the page')
else:
    print(element.find_parent('table'))

答案 1 :(得分:1)

from collections import defaultdict

import requests
from bs4 import BeautifulSoup


def get_page_html(url, timeout=30):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. It is
            passed straight to ``requests.get``; without a timeout the
            request can block indefinitely.

    Returns:
        The decoded body of the response (``Response.text``).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within *timeout*.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.text


def parse_last_table(html):
    """Group the rows of the last ``<table>`` in *html* by description.

    The first two ``<tr>`` rows of that table are skipped. Each remaining
    row with exactly five ``<td>`` cells is read as
    ``description, currency, unit, bank_sells, bank_buys`` (whitespace
    stripped); rows with any other number of cells are ignored. A row whose
    description cell is empty is filed under the description of the
    previous accepted row.

    Args:
        html: HTML markup of the page.

    Returns:
        A ``defaultdict(list)`` mapping each description to a list of dicts
        with the keys ``currency``, ``unit``, ``bank_sells`` and
        ``bank_buys``. The mapping is empty when *html* contains no table.
    """
    result = defaultdict(list)
    soup = BeautifulSoup(html, 'lxml')

    tables = soup.find_all('table')
    if not tables:
        # Nothing to parse; avoid IndexError on tables[-1].
        return result

    prev_key = None
    for row in tables[-1].find_all('tr')[2:]:
        cells = [col.text.strip() for col in row.find_all('td')]
        if len(cells) != 5:
            continue  # blank/empty row, or a row with a different layout

        description, currency, unit, bank_sells, bank_buys = cells

        # Continuation rows leave the description cell empty; reuse the
        # last description seen so they land in the same group.
        description = description or prev_key
        result[description].append({
            'currency': currency,
            'unit': unit,
            'bank_sells': bank_sells,
            'bank_buys': bank_buys
        })
        prev_key = description
    return result

输出:

>>> url = 'http://uobgoldprice.com/history/2018/September/10/'
>>> page_html = get_page_html(url)
>>> result = parse_last_table(page_html)
>>> import json; print(json.dumps(result, indent=2))
{
  "ARGOR CAST BAR": [
    {
      "currency": "SGD",
      "unit": "100 GM",
      "bank_sells": "5,369.00 (+4.00)",
      "bank_buys": "5,291.00 (+3.00)"
    }
  ],
  "CAST BARS": [
    {
      "currency": "SGD",
      "unit": "1 KILOBAR",
      "bank_sells": "53,201.00 (+36.00)",
      "bank_buys": "52,933.00 (+36.00)"
    }
  ],
  "GOLD CERTIFICATE": [
    {
      "currency": "SGD",
      "unit": "1 KILOCERT",
      "bank_sells": "53,201.00 (+36.00)",
      "bank_buys": "52,933.00 (+36.00)"
    }
  ],
  "GOLD SAVINGS A/C": [
    {
      "currency": "SGD",
      "unit": "1 GM",
      "bank_sells": "53.20 (+0.04)",
      "bank_buys": "52.94 (+0.04)"
    }
  ],
  "GOLD BULLION COINS": [
    {
      "currency": "SGD",
      "unit": "1/20 OZ(GNC,SLC &GML)",
      "bank_sells": "131.00",
      "bank_buys": "81.00"
    },
    {
      "currency": "SGD",
      "unit": "1/10 OZ",
      "bank_sells": "211.00 (+1.00)",
      "bank_buys": "163.00"
    },
    {
      "currency": "SGD",
      "unit": "1/4 OZ",
      "bank_sells": "465.00",
      "bank_buys": "410.00"
    },
    {
      "currency": "SGD",
      "unit": "1/2 OZ",
      "bank_sells": "904.00 (+1.00)",
      "bank_buys": "822.00 (+1.00)"
    },
    {
      "currency": "SGD",
      "unit": "1 OZ",
      "bank_sells": "1,726.00 (+1.00)",
      "bank_buys": "1,645.00 (+1.00)"
    }
  ],
  "PAMP GOLD BARS": [
    {
      "currency": "SGD",
      "unit": "1/2 OZ",
      "bank_sells": "876.00",
      "bank_buys": "821.00 (+1.00)"
    },
    {
      "currency": "SGD",
      "unit": "1 GM",
      "bank_sells": "82.00",
      "bank_buys": "50.00"
    },
    {
      "currency": "SGD",
      "unit": "1 OZ",
      "bank_sells": "1,711.00 (+1.00)",
      "bank_buys": "1,644.00 (+1.00)"
    },
    {
      "currency": "SGD",
      "unit": "2.5 GM",
      "bank_sells": "182.00",
      "bank_buys": "130.00"
    },
    {
      "currency": "SGD",
      "unit": "5 GM",
      "bank_sells": "322.00",
      "bank_buys": "262.00"
    },
    {
      "currency": "SGD",
      "unit": "10 GM",
      "bank_sells": "597.00 (+1.00)",
      "bank_buys": "527.00 (+1.00)"
    },
    {
      "currency": "SGD",
      "unit": "20 GM",
      "bank_sells": "1,132.00 (+1.00)",
      "bank_buys": "1,056.00 (+1.00)"
    },
    {
      "currency": "SGD",
      "unit": "50 GM",
      "bank_sells": "2,746.00 (+2.00)",
      "bank_buys": "2,644.00 (+2.00)"
    },
    {
      "currency": "SGD",
      "unit": "100 GM",
      "bank_sells": "5,414.00 (+3.00)",
      "bank_buys": "5,291.00 (+3.00)"
    }
  ],
  "SILVER PASSBOOK ACCOUNT": [
    {
      "currency": "SGD",
      "unit": "1 OZ",
      "bank_sells": "19.86 (+0.09)",
      "bank_buys": "19.30 (+0.09)"
    }
  ]
}