Beautiful Soup:如何使用 replace_with() 将标签更改为其他标签

时间:2018-12-06 09:40:53

标签: python-3.x web-scraping beautifulsoup

问这个问题我很惭愧。我在使用 replace_with() 时遇到了困难,只是想了解如何更改标签。

这里有个例子:

'use strict'
const async = require('async')

/**
 * Stand-in task: waits 1000 ms on a timer, logs its completion message and
 * then reports success through a Node-style callback.
 *
 * @param {function(?Error, string): void} callback - Invoked as
 *   `callback(null, 'api_hit')` once the timer fires.
 */
function api_hit(callback) {
    const finish = () => {
        console.log('Completed api_hit');
        callback(null, 'api_hit');
    };

    setTimeout(finish, 1000);
}

/**
 * Stand-in task: waits 100 ms on a timer, logs its completion message and
 * then reports success through a Node-style callback.
 *
 * @param {function(?Error, string): void} callback - Invoked as
 *   `callback(null, 'delay')` once the timer fires.
 */
function delay(callback) {
    const finish = () => {
        console.log('Completed delay');
        callback(null, 'delay');
    };

    setTimeout(finish, 100);
}

/**
 * Stand-in task: waits 500 ms on a timer, logs its completion message and
 * then reports success through a Node-style callback.
 *
 * @param {function(?Error, string): void} callback - Invoked as
 *   `callback(null, 'mysql_check')` once the timer fires.
 */
function mysql_check(callback) {
    const finish = () => {
        console.log('Completed mysql_check');
        callback(null, 'mysql_check');
    };

    setTimeout(finish, 500);
}

// Hand the three tasks to async.series (skipped when the list is empty) and
// log the error argument followed by the results argument it passes to the
// final callback.
const tasklist = [api_hit, delay, mysql_check];
if (tasklist.length > 0) {
    async.series(tasklist, (err, response) => {
        console.log(err);
        console.log(response);
    });
}

修正了一些低级错误之后,我得到的仍然只是原始文本,标签并没有被更改:

from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

# Fetch a listing page, follow the href of its first ".list" element to a
# second page, print that page's heading, then attempt to turn the tables
# under ".info" into <div> elements.
my_url = "http://example.com/blabla/blublu/tata"
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
page_soup = soup(page_html, "html.parser")  
# First element matching the CSS selector; the `if page:` guard below covers
# the no-match case.
page = page_soup.select_one(".list")
print(page)
if page:
    # Build the second URL from the element's href attribute.
    url = page.get("href")  
    newUrl = "http://example.com{}".format(url)
    Client = uReq(newUrl)
    pageHtml = Client.read()
    Client.close()      
    pSoup = soup(pageHtml, "html.parser")
    # <h3> elements that are direct children of ".view_details".
    spot = pSoup.select(".view_details > h3")
    # NOTE(review): spot[0] raises IndexError if the selector matched nothing.
    spots = spot[0].text
    print(spots)
    # <table> elements that are direct children of ".info".
    contain = pSoup.select(".info > table")

    #HERE I AM TRYING TO MAKE THE MAGIC

    for table in contain:
        # NOTE(review): the result of find_all() is discarded, so this line
        # has no effect on the document.
        table.find_all("table")
        # NOTE(review): per the Beautiful Soup docs, replace_with("div")
        # swaps the table for the plain text "div" in the tree; it does not
        # rename the tag. The Tag object still held in `contain` is detached
        # from the tree but otherwise unchanged.
        table.replace_with("div")
    # NOTE(review): contain[0] is still the original <table> object, which is
    # presumably why the printed markup looks unchanged. It also raises
    # IndexError when no table matched.
    contains = contain[0]
    print(contains)

如果有人可以在这黑暗的时刻给我一些光明,那将是一件令人高兴的事。

1 个答案:

答案 0 :(得分:0)

可以尝试以下面的代码为基础,它会将标签名 table 更改为 div:

# Sample document containing a single <table> element.
markup = '''<table>
    <tr valign="top">
        <td>
            <div>Lorem ipsum...</div>
        </td>
    </tr>
</table>'''
soup = BeautifulSoup(markup, 'html.parser')

# Rename the element in place: assigning to .name turns <table> into <div>
# without touching its children.
tag = soup.find('table')
tag.name = 'div'

print(tag)

输出(此处以 prettify() 的缩进格式显示):

<div>
 <tr valign="top">
  <td>
   <div>
    Lorem ipsum...
   </div>
  </td>
 </tr>
</div>