在android studio中获取具有相同名称类jsoup的下一个元素

时间:2016-02-14 13:49:13

标签: java android html jsoup

我想获得html中具有相同名称类的下一个元素。 html标签就像:

HTML:

  <section class="post">
        <img class="pecintakomik" src="/images/top/op.jpg" alt="pecintakomik.com" />
            <div class="post-cnt">
                <h2>Manga bla bla</h2>
                    <ul>
                    <li><strong>Nama Alternatif:</strong> </li>
                    <li><strong>Tahun Rilis:</strong> 2010</li>
                    <li><strong>Author(s):</strong> sensei1
                    <li><strong>Artist(s):</strong> sense2</li>
                    <li><strong>Genre:</strong> Action</li>
                    <li><strong>Sinopsis:</strong> bla bla bla </li>
                    <li><span class='st_facebook_hcount' displayText='Facebook'></span> <span class='st_twitter_hcount' displayText='Tweet'></span> <span class='st_sharethis_hcount' displayText='ShareThis'></span></li>                      
                    </ul>
            </div>
                <div class="clear">&nbsp;</div>
    </section>
    <img src="http://www.pecintakomik.com/images/block.png">
    <section class="post">
        <div class="post-cnt">
            <h2>List Chapter(s)</h2>
            <ul>
                <li><a href="/manga/bla_bla/816"> bla bl 816 <img src="/images/new.gif"><em>Baca Online </em></a></li>
                <li><a href="/manga/bla_bla/815"> bla bla 815<em>Baca Online </em></a></li>
                <li><a href="/manga/bla_bla/814"> bla bla 814<em>Baca Online </em></a></li>
                <li><a href="/manga/bla_bla/813"> bla bla 813<em>Baca Online </em></a></li>
            </ul>
       </div>
    </section>

我的代码是获取列表漫画的href链接(并将其存储在sqllite上),但我无法得到它:

java代码:

private List<Chapter> parseHtmlToChapters(RequestWrapper request, String unparsedHtml) {
    int beginIndex = unparsedHtml.indexOf("<div class=\"post-cnt\">");
    int endIndex = unparsedHtml.indexOf("</div>", beginIndex);

    String trimmedHtml = unparsedHtml.substring(beginIndex, endIndex);

    Document parsedDocument = Jsoup.parse(trimmedHtml);


    List<Chapter> chapterList = scrapeChaptersFromParsedDocument(parsedDocument);
    chapterList = setSourceForChapterList(chapterList);
    chapterList = setParentUrlForChapterList(chapterList, request.getUrl());
    chapterList = setNumberForChapterList(chapterList);

    saveChaptersToDatabase(chapterList, request.getUrl());

    return chapterList;
}

private List<Chapter> scrapeChaptersFromParsedDocument(Document parsedDocument) {
    List<Chapter> chapterList = new ArrayList<Chapter>();

    Element chapterElementnya = parsedDocument.select("div.post-cnt").get(1);
    Elements chapterElements = chapterElementnya.getElementsByTag("li");


    for (Element chapterElement : chapterElements) {
        Chapter currentChapter = constructChapterFromHtmlBlock(chapterElement);

        chapterList.add(currentChapter);
    }

    return chapterList;
}

private Chapter constructChapterFromHtmlBlock(Element chapterElement) {
    Chapter newChapter = DefaultFactory.Chapter.constructDefault();

    Element urlElement = chapterElement.select("a").first();
    Element nameElement = chapterElement.select("a").first();

    if (urlElement != null) {
        String fieldUrl = "http://www.pecintakomik.com" + urlElement.attr("href");
        newChapter.setUrl(fieldUrl);
    }
    if (nameElement != null) {
        String fieldName = nameElement.text();
        newChapter.setName(fieldName);
    }

    boolean fieldNew = chapterElement.html().contains("<img src=\"/images/new.gif\">");
    newChapter.setNew(fieldNew);

    return newChapter;
}

请有人知道如何获得具有相同名称的第二类列表吗?

2 个答案:

答案 0 :(得分:2)

此代码:

public class LapRekapInput extends javax.swing.JPanel {

    private Connection koneksi;
    private Date tglSurat;
    private String varSub1 = "N";
    private String varSub2 = "N";
    private String varSub3 = "N";
    private String varSub4 = "N";
    private PreparedStatement ps;

    private DynamicTableModel<LapRekInput> tableModel;

    /**
     * Creates new form LapRekapitulasiInput
     */
    public LapRekapInput() {
        koneksi = DatabaseUtilitas.getkoneksi();
        initComponents();
        tableModel = new DynamicTableModel<>(LapRekInput.class);
        tablelapRekap.setDynamicModel(tableModel);

    }

    private void LoadLapRekInput(){
       tglSurat = jDateChooser1.getDate();
            if(jComboBoxSubdit.getSelectedItem().equals("Subdit Mogok Kerja Dan Deteksi Dini")){
                varSub1 = "Y";
            }else if(jComboBoxSubdit.getSelectedItem().equals("Subdit Penyelesaian Perselisihan Hubungan Industrial")){
                varSub2 = "Y";
            }else if(jComboBoxSubdit.getSelectedItem().equals("Subdit Kelembagaan PPHI")){
                varSub3 = "Y";
            }else if(jComboBoxSubdit.getSelectedItem().equals("Subag Tata Usaha")){
                varSub4 = "Y";
            }


         String sql = "select NoAgenda, asalSurat, tglSurat, NoSurat, Perihal from tahap2 "
                    + " WHERE tglSurat = '"+new java.sql.Date(tglSurat.getTime())+"' "
                    + "and dtrsknKpd1 IN ('"+varSub1+"') "
                    + " and dtrsknKpd2 IN ('"+varSub2+"') "
                    + " and dtrsknKpd3 IN ('"+varSub3+"') "
                    + " and dtrsknKpd4 IN ('"+varSub4+"') ";


          try {
            ps = koneksi.prepareStatement(sql);
            ResultSet rs = ps.executeQuery(sql);

                List<LapRekInput> list = (List<LapRekInput>) rs;
                for(LapRekInput rekInput : list){
                    tableModel.add(rekInput);
                }

        } catch (SQLException ex) {
            System.out.println("Error" + ex.getMessage());
        }
    }
}

仅保留第一个列表。 private List<Chapter> parseHtmlToChapters(RequestWrapper request, String unparsedHtml) { int beginIndex = unparsedHtml.indexOf("<div class=\"post-cnt\">"); int endIndex = unparsedHtml.indexOf("</div>", beginIndex); String trimmedHtml = unparsedHtml.substring(beginIndex, endIndex); ... } 将包含以下内容:

trimmedHtml

要保留两个列表,您可以执行以下操作:

<div class="post-cnt">
    <h2>Manga bla bla</h2>
    <ul>
        <li><strong>Nama Alternatif:</strong> </li>
        <li><strong>Tahun Rilis:</strong> 2010</li>
        <li><strong>Author(s):</strong> sensei1
        <li><strong>Artist(s):</strong> sense2</li>
        <li><strong>Genre:</strong> Action</li>
        <li><strong>Sinopsis:</strong> bla bla bla </li>
        <li><span class='st_facebook_hcount' displayText='Facebook'></span> <span class='st_twitter_hcount' displayText='Tweet'></span> <span class='st_sharethis_hcount' displayText='ShareThis'></span></li>                      
    </ul>
</div>

但解析整个页面会更安全。为此,请更改:

int beginIndex = unparsedHtml.indexOf("<div class=\"post-cnt\">");
int secondListStart = unparsedHtml.indexOf("<div class=\"post-cnt\">",beginIndex + "<div class=\"post-cnt\">".length());
int endIndex = unparsedHtml.indexOf("</div>", secondListStart) + "</div>".length();

String trimmedHtml = unparsedHtml.substring(beginIndex, endIndex);

致:

Document parsedDocument = Jsoup.parse(trimmedHtml);

答案 1 :(得分:0)

试试这个

private List<Chapter> parseHtmlToChapters(RequestWrapper request, String unparsedHtml) {

    Document parsedDocument = Jsoup.parse(unparsedHtml);

    List<Chapter> chapterList = new ArrayList<>();

    for (Element a : parsedDocument.select("div.post-cnt a")) {
        Chapter newChapter = DefaultFactory.Chapter.constructDefault();
        newChapter.setUrl("http://www.pecintakomik.com" + a.attr("href"));
        newChapter.setName(a.text());
        newChapter.setNew(!a.select("img[src=/images/new.gif]").isEmpty());
        chapterList.add(newChapter);
    }
    // .....

parsedDocument.select("div.post-cnt a")选择所有<a>元素下的所有<div class="post-cnt">元素。您的示例HTML中有四个这样的元素。