如何抓取所有内容?(python3.6)

时间:2018-09-16 14:23:52

标签: python beautifulsoup web-crawler

当使用 Python 3 和 BeautifulSoup 从网页上抓取指定内容时,我无法获取“td”中的所有信息。

这是我的代码

import requests
from bs4 import BeautifulSoup

def getHTMLText(url):
    """Download *url* and return the response body as text.

    The body is decoded with the encoding requests detects from the content
    (``apparent_encoding``) instead of the one declared by the server.

    Args:
        url: Address of the page to download.

    Returns:
        The decoded page text, or an empty string when the request fails,
        times out, or the server answers with an HTTP error status.
    """
    try:
        # Without a timeout requests may block forever on a stalled server.
        r = requests.get(url, timeout=10)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Best-effort fetch: any request failure is reported as an empty page.
        return ''
def main():
    """Fetch the hard-coded page and print the <td> tags of its first table.

    For every row of the table whose id is ``jqe-table-0`` the list of
    ``<td>`` tags found in that row is printed. Nothing is printed when the
    table is not present in the downloaded HTML.
    """
    url = "http://baike.hrhrs.com/index.php?doc-view-3967.html"
    html = getHTMLText(url)
    soup = BeautifulSoup(html, 'html.parser')

    # Page title; it is extracted but not used further in this excerpt.
    for tag in soup.find_all('h1', class_='title_thema'):
        title_span = tag.find('span', id='doctitle')
        if title_span is not None:
            name = title_span.get_text()

    # the first one
    table1 = soup.find('table', attrs={'id': 'jqe-table-0'})
    if table1 is None:
        # find() returns None when nothing matches, e.g. when getHTMLText
        # returned '' because the download failed.
        return
    for trr in table1.find_all('tr'):
        td = trr.find_all('td')
        print(td)

这是输出

    [<td style="padding: 2px 0px; vertical-align: top; overflow: hidden; line-height: 18px; background: rgb(252, 253, 255); width: 350px; border-bottom: 1px dashed rgb(221, 221, 221);"><strong style="display: block; line-height: 22px; vertical-align: baseline; zoom: 1; width: 80px; float: left; color: rgb(153, 153, 153);">A1:</strong>A2</td>, <td class="jg" style="padding: 2px 0px; vertical-align: top; overflow: hidden; line-height: 18px; background: rgb(252, 253, 255); width: 10px; border-bottom: none;"></td>, <td style="padding: 2px 0px; vertical-align: top; overflow: hidden; line-height: 18px; background: rgb(252, 253, 255); width: 350px; border-bottom: 1px dashed rgb(221, 221, 221);"><strong style="display: block; line-height: 22px; vertical-align: baseline; zoom: 1; width: 80px; float: left; color: rgb(153, 153, 153);">A3:</strong>A4</td>]
[<td style="padding: 2px 0px; vertical-align: top; overflow: hidden; line-height: 18px; background: rgb(252, 253, 255); width: 350px; border-bottom: 1px dashed rgb(221, 221, 221);"><strong style="display: block; line-height: 22px; vertical-align: baseline; zoom: 1; width: 80px; float: left; color: rgb(153, 153, 153);">A5:</strong>A6</td>, <td class="jg" style="padding: 2px 0px; vertical-align: top; overflow: hidden; line-height: 18px; background: rgb(252, 253, 255); width: 10px; border-bottom: none;"></td>, <td style="padding: 2px 0px; vertical-align: top; overflow: hidden; line-height: 18px; background: rgb(252, 253, 255); width: 350px; border-bottom: 1px dashed rgb(221, 221, 221);"><strong style="display: block; line-height: 22px; vertical-align: baseline; zoom: 1; width: 80px; float: left; color: rgb(153, 153, 153);">A7:</strong>A8</td>]
[<td style="padding: 2px 0px; vertical-align: top; overflow: hidden; line-height: 18px; background: rgb(252, 253, 255); width: 350px; border-bottom: 1px dashed rgb(221, 221, 221);"><strong style="display: block; line-height: 22px; vertical-align: baseline; zoom: 1; width: 80px; float: left; color: rgb(153, 153, 153);">A9:</strong>A10</td>, <td class="jg" style="padding: 2px 0px; vertical-align: top; overflow: hidden; line-height: 18px; background: rgb(252, 253, 255); width: 10px; border-bottom: none;"></td>, <td style="padding: 2px 0px; vertical-align: top; overflow: hidden; line-height: 18px; background: rgb(252, 253, 255); width: 350px; border-bottom: 1px dashed rgb(221, 221, 221);"><strong style="display: block; line-height: 22px; vertical-align: baseline; zoom: 1; width: 80px; float: left; color: rgb(153, 153, 153);">A11:</strong>A12</td>]
[<td style="padding: 2px 0px; vertical-align: top; overflow: hidden; line-height: 18px; background: rgb(252, 253, 255); width: 350px; border-bottom: 1px dashed rgb(221, 221, 221);"><strong style="display: block; line-height: 22px; vertical-align: baseline; zoom: 1; width: 80px; float: left; color: rgb(153, 153, 153);">A13:</strong>A14</td>, <td class="jg" style="padding: 2px 0px; vertical-align: top; overflow: hidden; line-height: 18px; background: rgb(252, 253, 255); width: 10px; border-bottom: none;"></td>, <td style="padding: 2px 0px; vertical-align: top; overflow: hidden; line-height: 18px; background: rgb(252, 253, 255); width: 350px; border-bottom: 1px dashed rgb(221, 221, 221);"><strong style="display: block; line-height: 22px; vertical-align: baseline; zoom: 1; width: 80px; float: left; color: rgb(153, 153, 153);">A15:</strong>A16</td>]
[<td style="padding: 2px 0px; vertical-align: top; overflow: hidden; line-height: 18px; background: rgb(252, 253, 255); width: 350px; border-bottom: 1px dashed rgb(221, 221, 221);"><strong style="display: block; line-height: 22px; vertical-align: baseline; zoom: 1; width: 80px; float: left; color: rgb(153, 153, 153);">A17:</strong>A18</td>, <td class="jg" style="padding: 2px 0px; vertical-align: top; overflow: hidden; line-height: 18px; background: rgb(252, 253, 255); width: 10px; border-bottom: none;"></td>, <td style="padding: 2px 0px; vertical-align: top; overflow: hidden; line-height: 18px; background: rgb(252, 253, 255); width: 350px; border-bottom: 1px dashed rgb(221, 221, 221);"><strong style="display: block; line-height: 22px; vertical-align: baseline; zoom: 1; width: 80px; float: left; color: rgb(153, 153, 153);">A19:</strong>A20</td>]
[<td style="padding: 2px 0px; vertical-align: top; overflow: hidden; line-height: 18px; background: rgb(252, 253, 255); width: 350px; border-bottom: 1px dashed rgb(221, 221, 221);"><strong style="display: block; line-height: 22px; vertical-align: baseline; zoom: 1; width: 80px; float: left; color: rgb(153, 153, 153);">A21:</strong>A22</td>, <td class="jg" style="padding: 2px 0px; vertical-align: top; overflow: hidden; line-height: 18px; background: rgb(252, 253, 255); width: 10px; border-bottom: none;"></td>, <td style="padding: 2px 0px; vertical-align: top; overflow: hidden; line-height: 18px; background: rgb(252, 253, 255); width: 350px; border-bottom: 1px dashed rgb(221, 221, 221);"><strong style="display: block; line-height: 22px; vertical-align: baseline; zoom: 1; width: 80px; float: left; color: rgb(153, 153, 153);">A23:</strong>A24</td>]

我使用代码:

# Print the text of only the first <td> in the row's td list.
print(td[0].text)

结果为:

A1:A2
A5:A6
A9:A10
A13:A14
A17:A18
A21:A22

我想获取每个“td”中的所有内容,例如“A3:A4”以及之后的内容。我该如何修改代码才能获取全部内容?期待您的答复!

3 个答案:

答案 0 :(得分:1)

按照我原来的代码,只能得到如下结果:

中文名:柳公权
别名:诚悬
出生地:京兆华原(今陕西铜川市耀州区)
民族:汉族
出生年月:公元778年
职业:书法家

更改代码:

import requests
from bs4 import BeautifulSoup

def getHTMLText(url):
    """Download *url* and return the response body as text.

    The body is decoded with the encoding requests detects from the content
    (``apparent_encoding``) instead of the one declared by the server.

    Args:
        url: Address of the page to download.

    Returns:
        The decoded page text, or an empty string when the request fails,
        times out, or the server answers with an HTTP error status.
    """
    try:
        # Without a timeout requests may block forever on a stalled server.
        r = requests.get(url, timeout=10)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Best-effort fetch: any request failure is reported as an empty page.
        return ''

def main():
    """Crawl pages doc-view-100 .. doc-view-999 and print every <td> text.

    Each page is downloaded with getHTMLText, parsed with BeautifulSoup, and
    the text of every ``<td>`` inside every ``<tr>`` is printed on its own
    line. A page that cannot be processed prints "error" and the crawl
    continues with the next page. "successfully!" is printed once the loop
    has finished.
    """
    for count in range(100, 1000):
        url = "http://baike.hrhrs.com/index.php?doc-view-" + str(count) + ".html"
        try:
            html = getHTMLText(url)
            soup = BeautifulSoup(html, 'html.parser')

            for row in soup.find_all('tr'):
                for cell in row.find_all('td'):
                    print(cell.text)
        except Exception:
            # Script-level boundary: one bad page must not abort the
            # remaining pages of the crawl.
            print("error")

    print("successfully!")


# Start the crawl only when this file is executed as a script, so that
# importing it does not trigger hundreds of network requests.
if __name__ == '__main__':
    main()

结果:

中文名:柳公权
中文名:柳公权
别名:诚悬
籍贯:唐朝京兆华原(今陕西耀县)
出生地:京兆华原(今陕西铜川市耀州区)
性别:男
民族:汉族
国籍:中国
......

所有内容均可用。

答案 1 :(得分:0)

使用您的代码,在Jupyter Notebook中逐步运行,我得到了

中文名:柳公权
别名:诚悬
出生地:京兆华原(今陕西铜川市耀州区)
民族:汉族
出生年月:公元778年
职业:书法家

你是这个意思吗?

这是我的更改:

import requests
from bs4 import BeautifulSoup

# Address of the page to scrape; read by main().
url = "http://baike.hrhrs.com/index.php?doc-view-3967.html"

def main():
  """Print the page title(s) and the first cell text of each table row.

  Downloads the module-level ``url``, prints the text of the ``doctitle``
  span inside every ``h1.title_thema`` heading, then prints the text of the
  first ``<td>`` of every row in the table whose id is ``jqe-table-0``.
  Headings without the span, a missing table, and rows without ``<td>``
  cells are skipped instead of raising.
  """
  # url is only read here, so no ``global`` declaration is required.
  r = requests.get(url)
  soup = BeautifulSoup(r.text, 'html.parser')

  for tag in soup.find_all('h1', class_='title_thema'):
    title_span = tag.find('span', id='doctitle')
    if title_span is not None:
      print(title_span.get_text())

  table1 = soup.find('table', attrs={'id': 'jqe-table-0'})
  if table1 is None:
    # find() returns None when the page has no such table.
    return
  for trr in table1.find_all('tr'):
    td = trr.find_all('td')
    if td:
      # Only the first cell of the row is printed.
      print(td[0].text)

# Run main() only when the file is executed directly, not when imported.
if __name__ == '__main__':
  main()

答案 2 :(得分:0)

尝试一下。

import requests
from bs4 import BeautifulSoup

# Download the page and parse the raw response bytes with the built-in parser.
url = "http://baike.hrhrs.com/index.php?doc-view-3967.html"
res = requests.get(url)
data = BeautifulSoup(res.content, 'html.parser')
# Print the text of every <td> element found anywhere on the page.
for x in data.find_all('td'):
    print(x.text)