解析html代码 - 无法获取预期的数据

时间:2017-09-21 03:15:26

标签: java html jsoup

我正在使用java中的JSoup API来读取html内容并获取存储在标记中的链接。下面是html代码,我想在下面的html代码中将链接存储在以下标记<a onClick="javascript:RefreshPageTo(event, &quot...)"> near <td class="ms-paging">1 - 15</td>;中。

html数据读取文件名(readData.html):

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN">
<html dir="ltr" class="ms-isBot" lang="en-US">
 <head>
  <meta name="GENERATOR" content="Microsoft SharePoint" />
  <meta http-equiv="Content-type" content="text/html; charset=utf-8" />
   <title>
    ALL ELP REPORTS
</title>
  <!-- === Favicon / Windows Tile ==================================================================== -->
  <link rel="shortcut icon" href=" " type="image/vnd.microsoft.icon" id="favicon" />
  <meta name="msapplication-TileImage" content=" " />
  <meta name="msapplication-TileColor" content="#0072C6" />
    <script type="text/javascript" src=" "></script>
  <link rel="stylesheet" type="text/css" href=" " />
  <link id="CssRegistration1" rel="stylesheet" type="text/css" href=" " />
  <link id="CssRegistration2" rel="stylesheet" type="text/css" href=" 0" />
   <script type="text/javascript">CallASP("one.js");
  </script>
  <script type="text/javascript">RegisterSod("strings.js", "\u002f_layouts\u002f15\u002f1033\u002fstrings.js?rev=cG2ZohQxWuyz1\u00252BF2exRTjA\u00253D\u00253D");RegisterSodDep("strings.js", "initstrings.js");

  <link type="text/xml" rel="alternate" href="/_asd.xls" />
  <!-- Additional header placeholder  =========================== -->
  <link rel="alternate" type="application/rss+xml" title="Documents" href="/_layouts/15/listfeed.aspx?List=573d80cd%2D44f6%2D47b4%2D942f%2Da12a5a1841cb" />
  <span id="analytics">
    <script language="JavaScript" type="text/javascript">

  <noscript>
   <div class="noindex">
    You may be trying to access this site from a secured browser on the server. Please enable scripts and reload this page.
   </div>
  </noscript>
  <!-- ===== SP IDs / Prefetch SP images / SP Form  =========================================================================== -->
  <div id="imgPrefetch" style="display:none">
   <img src="/_layouts/15/images/spcommon.png" />
   </div>
  <form method="post" action="./AllItems.aspx?RootFolder=%2fShared+Documents%2f08.Test+Report%2fMY20+Test+Reports%2fSanity%2fRaw+Data&amp;FolderCTID=0x0120003C2FB175ACD9FE42B875BA259F53A6E3&amp;View=%7bF8BC514C-49A5-47A2-8A6D-52DF70D61AE7%7d" id="aspnetForm">
   <input type="hidden" name="_wpcmWpid" id="_wpcmWpid" value="" />
   <input type="hidden" name="wpcmVal" id="wpcmVal" value="" />
   <input type="hidden" name="MSOWebPartPage_PostbackSource" id="MSOWebPartPage_PostbackSource" value="" />

   </script>

        <div id="ctl00_ctl47_asdasd" class="asdaBrandMenu">
         <a href="http://www.qwer.com/" target="_blank"> </a>

         <!-- =============Suite Bar Links ======================-->
       <div id="DeltaSuiteLinks" class="ms-core-deltaSuiteLinks">
        <div id="suiteLinksBox">
         <div id="SuiteLinksHidden" style="display: none">
          </div>
             <div id="launcherIconContainer">
           </div>

       <span style="display:none">
        <menu type="ServerMenu" id="zz1_ID_PersonalActionMenu" hideicons="true">
         <ie:menuitem id="zz2_ID_MyProfile" type="option" onmenuclick="" text="My Profile" menugroupid="100"></ie:menuitem>
         <ie:menuitem id="zz3_ID_Logout" type="option" onmenuclick="" text="Sign Out" description="Logout of this site." menugroupid="100"></ie:menuitem>
        </menu></span>
       <span id="zz4_Menu_t" class="ms-menu-althov ms-welcome-root" title="Open Menu" onmouseover="MMU_PopMenuIfShowing(this);MMU_EcbTableMouseOverOut(this, true)" hoveractive="ms-menu-althov-active ms-welcome-root ms-welcome-hover" hoverinactive="ms-menu-althov ms-welcome-root" onclick=" CoreInvoke('MMU_Open',byid('zz1_ID_PersonalActionMenu'), MMU_GetMenuFromClientId('zz4_Menu'),event,true, null, 0); return false;" foa="MMU_GetMenuFromClientId('zz4_Menu')" oncontextmenu="ClkElmt(this); return false;" style="white-space:nowrap"><a class="ms-core-menu-root" id="zz4_Menu" accesskey="/" href="javascript:;" title="Open Menu" onfocus="MMU_EcbLinkOnFocusBlur(byid('zz1_ID_PersonalActionMenu'), this, true);" onkeydown="MMU_EcbLinkOnKeyDown(byid('zz1_ID_PersonalActionMenu'), MMU_GetMenuFromClientId('zz4_Menu'), event);" onclick=" CoreInvoke('MMU_Open',byid('zz1_ID_PersonalActionMenu'), MMU_GetMenuFromClientId('zz4_Menu'),event,true, null, 0); return false;" oncontextmenu="ClkElmt(this); return false;" menutokenvalues="MENUCLIENTID=zz4_Menu,TEMPLATECLIENTID=zz1_ID_PersonalActionMenu" serverclientid="zz4_Menu">Bhavani Borra<span class="ms-accessible">Use SHIFT+ENTER to open the menu (new window).</span></a><span style="height:4px;width:7px;position:relative;display:inline-block;overflow:hidden;" class="s4-clust ms-viewselector-arrow ms-menu-stdarw ms-core-menu-arrow"><img src="/_catalogs/theme/Themed/EB5E82F/spcommon-B35BB0A9.themedpng?ctag=3" alt="Open Menu" style="position:absolute;left:-95px !important;top:-259px !important;" /></span><span style="height:4px;width:7px;position:relative;display:inline-block;overflow:hidden;" class="s4-clust ms-core-menu-arrow ms-viewselector-arrow ms-menu-hovarw"><img src="/_catalogs/theme/Themed/EB5E82F/spcommon-B35BB0A9.themedpng?ctag=3" alt="Open Menu" style="position:absolute;left:-86px !important;top:-259px !important;" /></span></span>
      </div>
      <!-- ======== Start: Site Actions menu ============= -->
      <div id="suiteBarButtons">
       <span class="ms-siteactions-root" id="siteactiontd"> <span style="display:none">
         <menu type="ServerMenu" id="zz5_FeatureMenuTemplate1" hideicons="true">
          <ie:menuitem id="zz6_MenuItem_ShareThisSite" type="option" onmenuclick="" description="See who's here and invite new people." menugroupid="100"></ie:menuitem>
          <ie:menuitem id="zz7_MenuItem_ViewAllSiteContents" type="option" iconsrc="" onmenuclick="STSNavigate2(event,'/_layouts/15/viewlsts.aspx');" text="Site contents" description="View all libraries and lists in this site." menugroupid="200"></ie:menuitem>
         </menu></span><span id="zz8_SiteActionsMenu_t" class="ms-siteactions-normal" title="Settings" onmouseover="MMU_PopMenuIfShowing(this);MMU_EcbTableMouseOverOut(this, true)" hoveractive="ms-siteactions-normal ms-siteactions-hover" hoverinactive="ms-siteactions-normal">
         <a class="ms-core-menu-root" id="zz8_SiteActionsMenu" accesskey="/" href="javascript:;" title="Settings" onkeydown="MMU_EcbLinkOnKeyDown(byid('zz5_FeatureMenuTemplate1'), MMU_GetMenuFromClientId('zz8_SiteActionsMenu'));" menutokenvalues="MENUCLIENTID=zz8_SiteActionsMenu,TEMPLATECLIENTID=zz5_FeatureMenuTemplate1" serverclientid="zz8_SiteActionsMenu"><span class="ms-siteactions-imgspan"><img class="ms-core-menu-buttonIcon" src="/_catalogs/theme/Themed/EB5E82F/Settings-white-94FE89A9.themedpng?ctag=3" alt="Settings" title="Settings" /></span><span class="ms-accessible">Use SHIFT+ENTER to open the menu (new window).</span></a></span> </span>
      </div>
      <!-- ================== End: Site Actions Menu ============================================ -->
      <!-- ================== IT Help Link ============================================ -->

          <div class="ms-core-listMenu-verticalBox">
          </div>
         </div>
        </div>
       </div>
       <!-- ===== Main Content ========================================================================================== -->

       <tr class="ms-alternating ms-itmhover" iid="47,1430,0">
        <td class="ms-vb-itmcbx ms-vb-firstCell"><input type="checkbox" class="s4-itm-cbx" /></td>
        <td class="ms-vb-icon"><img border="0" alt="ECS-dailyTask.xls" title="ECS-dailyTask.xls" src="" /></td>
        <td height="100%" onmouseover="OnChildItem(this)" class="ms-vb-title">
         <div class="ms-vb itx" onmouseover="OnItem(this)" ctxname="ctx47" id="1430" field="LinkFilename" perm="0x1b03c4312ef" eventtype="">
          <a onfocus="OnLink(this)" href="/MyDocuments/ECS-dailyTask.xls" onmousedown="">ECS-dailyTask</a>
         </div>
         <div class="s4-ctx" onmouseover="OnChildItem(this.parentNode); return false;">
          <span>&nbsp;</span>
          <a onfocus="OnChildItem(this.parentNode.parentNode); return false;" onclick="" href="javascript:;" title="Open Menu"></a>
          <span>&nbsp;</span>
         </div></td>
        <td class="ms-vb2">
         <nobr>
          3/31/2013 11:04 AM
         </nobr></td>

       <tr class="ms-alternating ms-itmhover" iid="47,1429,0">
         <td class="ms-vb-itmcbx ms-vb-firstCell"><input type="checkbox" class="s4-itm-cbx" /></td>
         <td class="ms-vb-icon"><img border="0" alt="ECS-MontlhyTask.xls" title="ECS-MontlhyTask.xls" src="/_layouts/15/images/icxls.png?rev=23" /></td>
         <td height="100%" onmouseover="OnChildItem(this)" class="ms-vb-title">
          <div class="ms-vb itx" onmouseover="OnItem(this)" ctxname="ctx47" id="1429" field="LinkFilename" perm="0x1b03c4312ef" eventtype="">
           <a onfocus="OnLink(this)" href="/MyDocs/ECS-MontlhyTask.xls" onmousedown="">ECS-MontlhyTask</a>
          </div>
          <div class="s4-ctx" onmouseover="OnChildItem(this.parentNode); return false;">
           <span>&nbsp;</span>
           <a onfocus="" onclick="" href="javascript:;" title="Open Menu"></a>
           <span>&nbsp;</span>
          </div></td>
         <td class="ms-vb2">
          <nobr>
           7/24/2016 10:09 PM
          </nobr></td>
         <td class="ms-vb-user"><span class="ms-noWrap"><span class="ms-imnSpan"><a href="#" onclick="" class="ms-imnlink ms-spimn-presenceLink">
         <span class="ms-spimn-presenceWrapper ms-imnImg ms-spimn-imgSize-10x10">
         <img name="imnmark" class="" title="" showofflinepawn="1" src="" alt="No presence information" id="imn_16532,type=sip" />
         </span></a></span><span class="ms-noWrap ms-imnSpan">
         <a href="#" onclick="" class="ms-imnlink" tabindex="-1"><img name="imnmark" class="ms-hide" title="" showofflinepawn="1" src="" alt=""/></a>
         <a class="ms-subtleLink" onclick="" href="/_layouts/15/userdisp.aspx?ID=113">ASDF</a></span></span></td>
    </tr>
    ..
    ...
  <table width="100%" border="0" cellpadding="0" cellspacing="0" class="ms-bottompaging" xmlns:x="http://www.w3.org/2001/XMLSchema" xmlns:d="http://schemas.microsoft.com/" xmlns:asp="http://schemas.microsoft.com/ASPNET/20" xmlns:pcm="urn:PageContentManager" xmlns:ddwrt2="urn:frontpage:internal">
     <tbody>
      <tr>
       <td class="ms-bottompagingline1"><img src="/_images/11/images/blank.gif?rev=40" width="1" height="1" alt="" /></td>
      </tr>
      <tr>
       <td class="ms-bottompagingline2"><img src="/_images/11/images/blank.gif?rev=40" width="1" height="1" alt="" /></td>
      </tr>
      <tr>
       <td class="ms-vb" id="bottomPagingCellWPQ2" align="center">
        <table>
         <tbody>
          <tr>
           <td class="ms-paging">1 - 15</td>
           <td><a onclick="javascript:RefreshPageTo(event, &quot;/sites/myAppDetail/My%20Documents/Forms/AllApplicationss.aspx?Paged=TRUE&amp;p_SortBehavior=0&amp;p_FileLeafRef=LT%5fSW%20TEAM%5fNatural%5fItemCode%5f20170909%5fvstatus%2epdf&amp;p_ID=85&amp;RootFolder=%2fmyData%2fFolder3%2fCommon%20Docs%2fdaily%20Report%2f2017&amp;PageFirstRow=16&amp;&amp;View={05465DFA-110E-21FC-8AD6-8B9846567FF8B}&quot;);javascript:return false;" href="javascript:"><img src="/_layouts/15/1011/images/next.gif" border="0" alt="Next" /></a></td>
          </tr>
         </tbody>
        </table></td>
      </tr>
  <tr>

示例java代码(未给出预期的输出):

      import org.jsoup.Jsoup;
      import org.jsoup.nodes.Document;
      import org.jsoup.nodes.Element;
      import org.jsoup.select.Elements;
      import java.io.File;
      import java.io.IOException;
      public class ReadFileNamesFromHTMLContent {
      public static void main(String args[]) throws IOException {
              File input = new File("readData.html");
              Document doc = Jsoup.parse(input, "UTF-8");
              Elements links = doc.select(".ms-paging > td > a"); //get the value stored inside <a onClick="javascript:RefreshPageTo(event, &quot...)"> near  <td class="ms-paging">1 - 15</td>;
              System.out.println("size : "+ links.size()); //0
              for (Element link : links) {
                  System.out.println(link);//empty, it should print the link
              }
          }
    }

上述代码的输出:

size:0

预期产出:

&quot;/sites/myAppDetail/My%20Documents/Forms/AllApplicationss.aspx?Paged=TRUE&amp;p_SortBehavior=0&amp;p_FileLeafRef=LT%5fSW%20TEAM%5fNatural%5fItemCode%5f20170909%5fvstatus%2epdf&amp;p_ID=85&amp;RootFolder=%2fmyData%2fFolder3%2fCommon%20Docs%2fdaily%20Report%2f2017&amp;PageFirstRow=16&amp;&amp;View={05465DFA-110E-21FC-8AD6-8B9846567FF8B}&quot

有关如何获得预期输出的任何建议都会有所帮助。

PS:不应该修改HTML代码。

1 个答案:

答案 0 :(得分:0)

这是你的HTML:

<td class="ms-paging">1 - 15</td>
<td><a onclick="javascript:RefreshPageTo(event, &quot;/sites/myAppDetail/My%20Documents/Forms/AllApplicationss.aspx?Paged=TRUE&amp;p_SortBehavior=0&amp;p_FileLeafRef=LT%5fSW%20TEAM%5fNatural%5fItemCode%5f20170909%5fvstatus%2epdf&amp;p_ID=85&amp;RootFolder=%2fmyData%2fFolder3%2fCommon%20Docs%2fdaily%20Report%2f2017&amp;PageFirstRow=16&amp;&amp;View={05465DFA-110E-21FC-8AD6-8B9846567FF8B}&quot;);javascript:return false;" href="javascript:"><img src="/_layouts/15/1011/images/next.gif" border="0" alt="Next" /></a></td>

但你的选择器是

 Elements links = doc.select(".ms-paging > td > a");

但是,带有javascript onclick属性的td元素不是您选择的td的子元素。所以你只需要选择链接。也许尝试这样的事情:

 Elements links = doc.select("td > a[onclick]");