JSoup没有完全呈现长页面

时间:2019-12-27 03:23:45

标签: java android jsoup

有一个网站,我试图使用JSoup库从中获取信息。问题在于响应中只包含页面的一部分。

这是我的代码:

// Issue a GET request for the timetable page and keep the raw response in `loginForm`.
// The value of `i` is substituted into the `yos` query parameter; the session is fixed to 20199.
Connection.Response loginForm = Jsoup.connect(
        String.format("https://student.utm.utoronto.ca/timetable/timetable?yos=%s&session=20199", i))
        // Browser-like user agent and headers, copied from a real browser request.
        .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36")
        .method(Connection.Method.GET)
        // NOTE(review): presumably milliseconds (i.e. 10 minutes) — confirm against the jsoup Connection docs.
        .timeout(600000)
        // NOTE(review): this advertises brotli ("br"); confirm jsoup can decode a br-encoded body.
        .header("Accept-Encoding", "gzip, deflate, br")
        .header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9")
        .header("Accept-Language", "en-US,en;q=0.9,fr;q=0.8")
        .header("Cache-Control","max-age=0")
        .header("Connection", "keep-alive")
        .header("Host", "student.utm.utoronto.ca")
        .header("Sec-Fetch-Mode","navigate")
        // NOTE(review): no maxBodySize(...) is configured on this connection; jsoup's default
        // body-size limit may be what truncates long pages — confirm against the jsoup docs.
        .execute();

我尝试从浏览器的请求标头中复制尽可能多的标头。

我怀疑最初加载页面后运行了javascript,这就是为什么我没有在响应中得到整个页面的原因。

这是响应中最后一个带有tr标签的元素:

      <tr id="tr_MAT102H5FTUT0125" class=" TUT W1 meeting_section OL0 todAft"> 
       <td> </td> 
       <td> <label for="MAT102H5FTUT0125">TUT0125</label> </td> 
       <td class="instrTD"> </td> 
       <td class="hideEnrolTD"> 29 </td> 
       <td class="hideEnrolTD"> 35 </td> 
       <td class="hideEnrolTD"> 0 </td> 
       <td class="enrolTD"> 29/35, wait:0 </td> 
       <td> <abbr title="Wednesday">WE</abbr><br> </td> 
       <td class="start_time"> 13:00<br> </td> 
       <td> 14:00<br> </td> 
       <td> MN 2100<br> </td> 
       <td class="noteTD"> </td> 
       <td class="text-right"> 
        <!-- <input type='checkbox' name='courses' id='$checkbox_id' value='MAT102H5F-TUT0125-313:0014:00' aria-label='MAT102H5F TUT0125' /> --> <button name="courses" id="MAT102H5FTUT0125" value="MAT102H5F-TUT0125-313:0014:00" onclick="addCourse($(this))" aria-label="MAT102H5F TUT0125" class="addButton btn btn-sm btn-primary"><span class="glyphicon glyphicon-plus"></span><span class="button-text"> Add to Plan</span></button> </td> 
      </tr> 
      <tr id="tr_MAT102H5FTUT0126" class=" TUT W1 meeting_section OL0 todAft"> 
       <td> </td> 
       <td> <label for="MAT102H5FTUT0126">TUT0126</label> </td> &lt;
      </tr>
     </tbody>
    </table>
   </div>
  </div>
 </body>
</html>

我期望最后一个tr标签对应页面中加载的最后一个表格,即课程WGS102H5S,而不是MAT102H5F。

不确定这是否有用,但是我目前正在android studio中进行编程。

JSoup中有解决此问题的办法吗?如果没有,是否有更合适的库可以使用?

1 个答案:

答案 0 :(得分:0)

// Colorize widget: takes the text typed into #text, requests a random colour,
// and appends a coloured card containing that text to #container.
const subBtn = document.getElementById("btn");
const inptTxt = document.getElementById("text");
const contDiv = document.getElementById("container");

const COLOR_API_URL = "http://api.creativehandles.com/getRandomColor";
// Colours used when the colour service cannot be reached or returns an unusable payload.
const FALLBACK_BACKGROUND = "#6d4298";
const FALLBACK_TEXT_COLOR = "#ffffff";

// The Colorize button stays disabled until the input holds non-blank text.
subBtn.disabled = true;

/**
 * Marks the input as valid/invalid (via data-state) and enables the button
 * only when the trimmed input is non-empty.
 */
function validateInput() {
  const hasText = inptTxt.value.trim() !== "";
  inptTxt.dataset.state = hasText ? "valid" : "invalid";
  subBtn.disabled = !hasText;
}

inptTxt.addEventListener("input", validateInput);

/**
 * Returns the inverse of a hex colour, channel by channel.
 *
 * @param {string} hex - Colour made of two-digit hex pairs, e.g. "#1a2b3c".
 * @returns {string} Inverted colour in "#rrggbb" form.
 * @throws {Error} If `hex` contains no two-digit hex pair.
 */
function invertHexColor(hex) {
  const channels = hex.match(/[a-f0-9]{2}/gi);
  if (channels === null) {
    throw new Error(`Not a hex color: ${hex}`);
  }
  const inverted = channels.map((channel) =>
    (255 - Number.parseInt(channel, 16)).toString(16).padStart(2, "0"),
  );
  return `#${inverted.join("")}`;
}

/**
 * Appends a card showing `text` to the container and mirrors `text` into the page title.
 *
 * The text is inserted with textContent, so markup typed by the user is shown
 * literally instead of being parsed as HTML.
 *
 * @param {string} text - Card text.
 * @param {string} background - CSS background for the card.
 * @param {string} textColor - CSS colour for the card text.
 */
function appendCard(text, background, textColor) {
  const card = document.createElement("div");
  // Card ids are numbered by how many <div>s the container held before this card.
  card.id = `card${contDiv.getElementsByTagName("div").length}`;
  card.style.background = background;

  const label = document.createElement("span");
  label.style.color = textColor;
  label.textContent = text;
  card.appendChild(label);

  document.title = text;
  contDiv.appendChild(card);
}

/** Appends a card in the fallback colours; the input is left untouched so the user can retry. */
function appendFallbackCard() {
  appendCard(inptTxt.value, FALLBACK_BACKGROUND, FALLBACK_TEXT_COLOR);
}

// A single request object is shared by all clicks; its handlers are attached once.
const xhttp = new XMLHttpRequest();

xhttp.onload = () => {
  let background;
  let textColor;
  try {
    const { color } = JSON.parse(xhttp.responseText);
    if (typeof color !== "string") {
      throw new TypeError("Response has no string `color` field");
    }
    background = color;
    textColor = invertHexColor(color);
  } catch (err) {
    // An unusable payload is handled like a network failure rather than leaving a half-built card.
    console.error("Unusable colour response:", err);
    appendFallbackCard();
    return;
  }

  appendCard(inptTxt.value, background, textColor);

  // Clear the input, give it focus again, and re-run validation so the button is disabled.
  inptTxt.value = "";
  inptTxt.focus();
  inptTxt.select();
  validateInput();
};

xhttp.onerror = appendFallbackCard;

subBtn.addEventListener("click", () => {
  xhttp.open("GET", COLOR_API_URL);
  xhttp.send();
});

// Pressing Enter in the input submits, same as clicking the button.
inptTxt.addEventListener("keyup", (event) => {
  if (event.key === "Enter") {
    event.preventDefault();
    subBtn.click();
  }
});

// ---------------------------------------------------------------------------
// Answer text that followed the script on the same line of the page, kept verbatim:
//
//   更改为.execute(); https://jsoup.org/apidocs/org/jsoup/Connection.html#maxBodySize-int- https://jsoup.org/apidocs/org/jsoup/Connection.Response.html#bufferUp--
//
// NOTE(review): the sentence appears to have lost its code fragments during extraction.
// Judging by the two links, the advice is presumably to set Connection.maxBodySize(...)
// (and possibly call Response.bufferUp()) around .execute() — confirm against the original answer.