如何使用BeautifulSoup和Python仅选择此文本节点?

时间:2015-07-30 09:40:32

标签: python beautifulsoup data-extraction

我有这个 HTML 结构(注:紧随其后的 AngularJS 代码片段似乎与本问题无关,疑为页面抓取时混入;实际的 HTML 结构是下文中的 <div class="foo"> 示例):

/**
 * Declares the `starter.controllers` module and its `DashCtrl` controller.
 *
 * DashCtrl exposes two actions on $scope:
 *   - identifyUser(): identifies the current user with the Ionic User service.
 *   - pushRegister(): registers the device with the Ionic Push service.
 * It also listens on $rootScope for the `$cordovaPush:tokenReceived` event
 * and stores the received device token on $scope.token.
 */
angular.module('starter.controllers', [])

// Dependencies use inline array annotation so that injection keeps working
// when a minifier renames the function parameters.
.controller('DashCtrl', ['$scope', '$rootScope', '$ionicUser', '$ionicPush',
  function($scope, $rootScope, $ionicUser, $ionicPush) {

  // Handles incoming device tokens: announce the token and keep it on the scope.
  $rootScope.$on('$cordovaPush:tokenReceived', function(event, data) {
    alert("Successfully registered token " + data.token);
    console.log('Ionic Push: Got token ', data.token, data.platform);
    $scope.token = data.token;
  });

  /**
   * Identifies a user with the Ionic User service.
   * Sets $scope.identified to true once the service call resolves.
   */
  $scope.identifyUser = function() {
    console.log('Ionic User: Identifying with Ionic User service');

    var user = $ionicUser.get();
    if (!user.user_id) {
      // Set your user_id here, or generate a random one.
      user.user_id = $ionicUser.generateGUID();
    }

    // Add some metadata to your user object.
    angular.extend(user, {
      name: 'Ionitron',
      bio: 'I come from planet Ion'
    });

    // Identify your user with the Ionic User Service.
    $ionicUser.identify(user).then(function() {
      $scope.identified = true;
      alert('Identified user ' + user.name + '\n ID ' + user.user_id);
    });
  };

  /**
   * Registers the device with the Ionic Push service.
   * The two alerts bracket the register call as progress markers.
   */
  $scope.pushRegister = function() {
    console.log('Ionic Push: Registering user');
    alert('bla');
    // Register with the Ionic Push service.  All parameters are optional.
    // NOTE: the original author marked this call as the place where the error fires.
    $ionicPush.register({
      canShowAlert: true, // Can pushes show an alert on your screen?
      canSetBadge: true, // Can pushes update app icon badges?
      canPlaySound: true, // Can notifications play a sound?
      canRunActionsOnWake: true, // Can run actions outside the app
      onNotification: function(notification) {
        // Handle new push notifications here.
        return true;
      }
    });
    alert('bla2');
  };
}])

我的 HTML 结构如下:

<!-- Sample input: the loose text nodes that are direct children of div.foo are
     the ones to extract; the text inside <h3> and div.subfoo is not wanted. -->
<div class="foo">
    <h3>Title</h3>
    <br>Some text I want to retrieve. <br><br> This text too.
    <br> (numbers and position of "br" tag indetermined) And this one too.
    <div class="subfoo">Some other text I don't want.</div>
</div>

在我的 Python 脚本中,我写了下面的代码;正如所料,它返回了全部文本:

# Parse the markup held in res.text with the "html.parser" backend.
exampleSoup = bs4.BeautifulSoup(res.text, "html.parser")

# CSS-select every element carrying the class "foo" and take the first match.
elems = exampleSoup.select('.foo')
first_foo = elems[0]

# get_text() joins all text beneath the element, including text that sits
# inside nested tags, so this prints the full text of the div.
print(first_foo.get_text())

如何只获取该 div 中不在任何子标签内的文本,即:"Some text I want to retrieve. This text too. And this one too."?谢谢你的帮助。

1 个答案:

答案 0 :(得分:1)

您可以使用 .next_sibling 获取解析树中同一层级的下一个节点(在本例中,即紧跟在 <h3> 之后的文本节点)。

示例

>>> soup = BeautifulSoup(html)
>>> print soup.prettify()
<html>
 <body>
  <div class="foo">
   <h3>
    Title
   </h3>
   Some text I want to retrieve.
   <div class="subfoo">
    Some other text I don't want.
   </div>
  </div>
 </body>
</html>

>>> print soup.find('div', { 'class' : 'foo' } ).h3.next_sibling.strip()
Some text I want to retrieve.