BeautifulSoup:仅从特定类中获取通用标记

时间:2015-04-27 02:04:28

标签: python-2.7 web-scraping beautifulsoup

当我使用像这样的beautifulsoup时,我从HTML文件中获取了我想要的所有文本:

category = soup.find_all("ol", {"class":"breadcrumb"})
catname = BeautifulSoup(str(category).strip()).get_text().encode("utf-8")

输出:

Home
Digital Goods
E-Books

但我想跳过第一个类别,即'Home'。我知道可以简单地把这个词替换为“”,但我的问题是:如何让BeautifulSoup只在上面提到的位置获取某个非常具体的标签?

HTML代码如下所示:

<ol class="breadcrumb">
<li><a href="http://fakeshop.com">Home</a></li>
<li><a href="http://fakeshop.com/category/51">Digital Goods</a></li>
<li><a href="http://fakeshop.com/category/98">E-Books</a></li>
</ol>

我可以做些什么来从这个'breadcrumb'部分获取第二个和第三个'li'标签,而不是文件中的其他标签?

示例(不起作用,但说明了我正在寻找的内容):

category = soup.find_all("ol", {"class":"breadcrumb"}), find_all("li")[1:]

1 个答案:

答案 0 :(得分:2)

怎么样:

/**
 * Window shown while a ride is being operated.
 *
 * <p>The frame, both panels and the background label are static, so every
 * instance configures the same Swing components.
 *
 * <p>NOTE(review): constructing a second instance re-adds the same components
 * to the shared frame, and calling {@link #Initialize(String)} more than once
 * keeps appending to the same button panel. Confirm whether either is intended.
 */
public class OperatingScreen {

    /** Fixed window width in pixels. */
    private static final int FRAME_WIDTH = 1280;
    /** Fixed window height in pixels. */
    private static final int FRAME_HEIGHT = 720;

    /** Name of the ride this screen was created for. */
    public String ride;

    public static JFrame frame = new JFrame();
    private static JPanel panel = new JPanel();
    private static JPanel button_panel = new JPanel();
    private static JLabel bg = new JLabel();

    /**
     * Configures the shared frame for the given ride, shows it, and asks
     * {@code SocketHandler} to initialize the ride.
     *
     * @param ride name of the ride; used in the window title and passed to
     *             {@code SocketHandler.initializeride}
     */
    public OperatingScreen(String ride) {
        this.ride = ride;

        frame.setTitle("Operating: " + ride);
        // Children of this panel are positioned manually with setBounds.
        panel.setLayout(null);

        // The panel is added before the background so it sits above it in
        // the z-order.
        frame.add(panel);
        frame.pack();
        frame.setResizable(false);
        frame.setDefaultCloseOperation(WindowConstants.HIDE_ON_CLOSE);

        bg.setIcon(createImageIcon("/background_operating.png", "background"));
        // setBounds takes (x, y, width, height); the original call passed
        // (1280, 720, 0, 0), i.e. a zero-size label placed off-screen.
        bg.setBounds(0, 0, FRAME_WIDTH, FRAME_HEIGHT);
        frame.add(bg);

        frame.setSize(FRAME_WIDTH, FRAME_HEIGHT);
        // Show the window only after its content and size are in place.
        frame.setVisible(true);

        SocketHandler.initializeride(ride);
    }

    /**
     * Builds the row of buttons described by {@code init} and adds it, inside
     * a horizontally scrolling pane, to the main panel.
     *
     * <p>The text after the first {@code '*'} is split on {@code '|'}; each
     * entry from index 2 onwards is read as {@code text>color}. Entries that
     * lack a color are reported on {@code System.err} and skipped.
     *
     * @param init payload describing the buttons
     * @throws IllegalArgumentException if {@code init} has no section after a
     *         {@code '*'} separator
     */
    public void Initialize(String init) {
        String[] sections = init.split("\\*");
        if (sections.length < 2) {
            throw new IllegalArgumentException(
                    "Expected a '*' separated payload with a button section, got: " + init);
        }
        String[] buttons = sections[1].split("\\|");

        // NOTE(review): the first two '|' separated entries are skipped;
        // presumably they are header fields. Confirm against the sender.
        for (int i = 2; i < buttons.length; i++) {
            String[] parts = buttons[i].split("\\>");
            if (parts.length < 2 || parts[1].isEmpty()) {
                System.err.println("Skipping malformed button entry: " + buttons[i]);
                continue;
            }

            String text = parts[0];
            String color = capitalize(parts[1]);

            button_panel.add(createButton(text, color));

            // NOTE(review): this modal dialog is shown once per button and
            // looks like leftover debugging; kept to preserve behavior.
            JOptionPane.showMessageDialog(HomeScreen.frame, text + "    " + color);
        }

        JScrollPane jop = new JScrollPane(button_panel,
                JScrollPane.VERTICAL_SCROLLBAR_NEVER,
                JScrollPane.HORIZONTAL_SCROLLBAR_ALWAYS);
        jop.setBounds(100, 500, 1080, 60);
        jop.setBorder(null);

        panel.add(jop);
        jop.setOpaque(false);
        jop.getViewport().setOpaque(false);

        panel.setSize(FRAME_WIDTH, FRAME_HEIGHT);
        panel.setOpaque(false);
        button_panel.setOpaque(false);

        // The panel may already be on screen; components added to a displayed
        // container are not laid out or painted until it is revalidated.
        panel.revalidate();
        panel.repaint();
    }

    /**
     * Upper-cases the first character of {@code value} without consulting the
     * default locale, so the derived resource name is the same on every
     * machine.
     */
    private static String capitalize(String value) {
        return Character.toUpperCase(value.charAt(0)) + value.substring(1);
    }

    /**
     * Creates one button label using the icon {@code /Button<color>.png}.
     *
     * @param text  caption shown on the label
     * @param color capitalized color name used in the icon resource name
     * @return the configured label
     */
    private JLabel createButton(String text, String color) {
        JLabel button = new JLabel();
        button.setIcon(createImageIcon("/Button" + color + ".png", color));
        button.setText(text);
        button.setForeground(Color.BLACK);
        button.setFont(button.getFont().deriveFont(17.0f));
        button.setBorder(LineBorder.createBlackLineBorder());
        // All handlers are intentionally empty: no click or hover behavior
        // is implemented yet.
        button.addMouseListener(new MouseListener() {
            @Override
            public void mouseClicked(MouseEvent arg0) {}

            @Override
            public void mouseEntered(MouseEvent arg0) {}

            @Override
            public void mouseExited(MouseEvent arg0) {}

            @Override
            public void mousePressed(MouseEvent arg0) {}

            @Override
            public void mouseReleased(MouseEvent arg0) {}
        });
        return button;
    }

    /**
     * Loads an icon from a resource resolved relative to this class.
     *
     * @param path        resource path passed to {@code getResource}
     * @param description textual description attached to the icon
     * @return the icon, or {@code null} (after logging to {@code System.err})
     *         when the resource cannot be found
     */
    protected ImageIcon createImageIcon(String path, String description) {
        URL imgURL = getClass().getResource(path);
        if (imgURL == null) {
            System.err.println("Couldn't find file: " + path);
            return null;
        }
        return new ImageIcon(imgURL, description);
    }
}

我的输出是:

category = soup.find("ol", {"class":"breadcrumb"}).findAll('li')[1:]
catname = BeautifulSoup(str(category).strip()).get_text().encode("utf-8")