使用cheerio和节点

时间:2017-10-12 17:59:24

标签: jquery node.js cheerio

我正在尝试对网页进行网页抓取并将其部分内容转换为JSON,以便以后可以通过wordpress读取。但是,我似乎无法正确循环列表项。

例如,我想循环使用:

<div class="acalog-core">
            <h4><a name="GEC01WrittenCommunication6Hours"></a><a name="gec01writtencommunication6hours" id="core_95236"></a>GEC 01. Written Communication (6 hours)</h4>
            <hr>
            <ul>
                <li class="acalog-course"><span><a href="#" onClick="showCourse('17', '87026',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95236~;}'); return false;">ENG 101 - Composition One</a>  3 hrs.</span></li>
                <li class="acalog-course"><span><a href="#" onClick="showCourse('17', '87027',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95236~;}'); return false;">ENG 102 - Composition Two</a>  3 hrs.</span></li>
            </ul>
        </div>
        <div class="acalog-core">
            <h4><a name="GEC02NaturalScience810HoursMinimum"></a><a name="gec02naturalscience810hoursminimum" id="core_95238"></a>GEC 02. Natural Science (8-10 hours minimum)</h4>
            <hr>
            <p>Select 2 courses with labs:</p>
            <ul>
                <li class="acalog-course"><span><a href="#" onClick="showCourse('17', '86280',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">AST 111 - General Astronomy I</a>  3 hrs. AND</span></li>
                <li class="acalog-course"><span><a href="#" onClick="showCourse('17', '86281',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">AST 111L - General Astronomy I Laboratory</a>  1 hr.</span></li>
                <li class="acalog-adhoc acalog-adhoc-after" style="list-style: none;">
                    <p>&#160;</p>
                </li>
                <li class="acalog-course"><span><a href="#" onClick="showCourse('17', '86282',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">AST 112 - General Astronomy II</a>  3 hrs. AND</span></li>
                <li class="acalog-course"><span><a href="#" onClick="showCourse('17', '86283',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">AST 112L - General Astronomy II Laboratory</a>  1 hr.</span></li>
                <li class="acalog-adhoc acalog-adhoc-after" style="list-style: none;">
                    <p>&#160;</p>
                </li>
                <li class="acalog-course"><span><a href="#" onClick="showCourse('17', '86284',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">BSC 103 - Biology and Society</a>  3 hrs. ◊ AND</span></li>
                <li class="acalog-course"><span><a href="#" onClick="showCourse('17', '86285',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">BSC 103L - Biology and Society Laboratory</a>  1 hr ◊</span></li>
                <li class="acalog-adhoc acalog-adhoc-after" style="list-style: none;">
                    <p>&#160;</p>
                </li>
                <li class="acalog-course"><span><a href="#" onClick="showCourse('17', '86290',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">BSC 110 - Principles of Biological Science I</a>  3 hrs. AND</span></li>
                <li class="acalog-course"><span><a href="#" onClick="showCourse('17', '86291',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">BSC 110L - Principles of Biological Science I Laboratory</a>  1 hr.</span></li>
                <li class="acalog-adhoc acalog-adhoc-after" style="list-style: none;">
                    <p>&#160;</p>
                </li>
                <li class="acalog-course"><span><a href="#" onClick="showCourse('17', '86292',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">BSC 111 - Principles of Biological Science II</a>  3 hrs. AND</span></li>
                <li class="acalog-course"><span><a href="#" onClick="showCourse('17', '86293',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">BSC 111L - Principles of Biological Science II Laboratory</a>  1 hr.</span></li>
                <li class="acalog-adhoc acalog-adhoc-after" style="list-style: none;">
                    <p>&#160;</p>
                </li>
                <li class="acalog-course"><span><a href="#" onClick="showCourse('17', '86299',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">BSC 250 - Human Anatomy and Physiology I</a>  3 hrs. AND</span></li>
                <li class="acalog-course"><span><a href="#" onClick="showCourse('17', '86300',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">BSC 250L - Human Anatomy and Physiology I Laboratory</a>  1 hr.</span></li>
                <li class="acalog-adhoc acalog-adhoc-after" style="list-style: none;">
                    <p>&#160;</p>
                </li>
                <li class="acalog-course"><span><a href="#" onClick="showCourse('17', '86301',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">BSC 251 - Human Anatomy and Physiology II</a>  3 hrs. AND</span></li>
                <li class="acalog-course"><span><a href="#" onClick="showCourse('17', '86302',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">BSC 251L - Human Anatomy and Physiology II Laboratory</a>  1 hr.</span></li>
                <li class="acalog-adhoc acalog-adhoc-after" style="list-style: none;">
                    <p>&#160;</p>
                </li>
                <li class="acalog-course"><span><a href="#" onClick="showCourse('17', '86484',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">CHE 104 - Chemistry and Our Environment</a>  3 hrs. AND</span></li>
                <li class="acalog-course"><span><a href="#" onClick="showCourse('17', '86485',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">CHE 104L - Chemistry and Our Environment Laboratory</a>  1 hr.</span></li>
                <li class="acalog-adhoc acalog-adhoc-after" style="list-style: none;">
                    <p>&#160;</p>
                </li>
                <li class="acalog-course"><span><a href="#" onClick="showCourse('17', '86486',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">CHE 106 - General Chemistry I</a>  3 hrs. AND</span></li>
                <li class="acalog-course"><span><a href="#" onClick="showCourse('17', '86487',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">CHE 106L - General Chemistry I Laboratory</a>  1 hr.</span></li>
                <li class="acalog-adhoc acalog-adhoc-after" style="list-style: none;">
                    <p>&#160;</p>
                </li>
                <li class="acalog-course"><span><a href="#" onClick="showCourse('17', '86488',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">CHE 107 - General Chemistry II</a>  3 hrs. AND</span></li>
                <li class="acalog-course"><span><a href="#" onClick="showCourse('17', '86489',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">CHE 107L - General Chemistry II Laboratory</a>  1 hr.</span></li>
                <li class="acalog-adhoc acalog-adhoc-after" style="list-style: none;">
                    <p>&#160;</p>
                </li>
                <li class="acalog-course"><span><a href="#" onClick="showCourse('17', '87208',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">GHY 104 - Weather and Climate</a>  3 hrs. AND</span></li>
                <li class="acalog-course"><span><a href="#" onClick="showCourse('17', '87719',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">GHY 104L - Weather and Climate Laboratory</a>  1 hr.</span></li>
                <li class="acalog-adhoc acalog-adhoc-after" style="list-style: none;">
                    <p>&#160;</p>
                </li>
                <li class="acalog-course"><span><a href="#" onClick="showCourse('17', '87209',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">GHY 105 - Land and Water</a>  3 hrs. AND</span></li>
                <li class="acalog-course"><span><a href="#" onClick="showCourse('17', '87720',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">GHY 105L - Land and Water Lab</a>  1 hr.</span></li>
                <li class="acalog-adhoc acalog-adhoc-after" style="list-style: none;">
                    <p>&#160;</p>
                </li>
                <li class="acalog-course"><span><a href="#" onClick="showCourse('17', '87237',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">GLY 101 - Physical Geology</a>  3 hrs. AND</span></li>
                <li class="acalog-course"><span><a href="#" onClick="showCourse('17', '87749',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">GLY 101L - Physical Geology Laboratory</a>  1 hr.</span></li>
                <li class="acalog-adhoc acalog-adhoc-after" style="list-style: none;">
                    <p>&#160;</p>
                </li>
                <li class="acalog-course"><span><a href="#" onClick="showCourse('17', '87750',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">GLY 103 - Historical Geology</a>  3 hrs. AND</span></li>
                <li class="acalog-course"><span><a href="#" onClick="showCourse('17', '87238',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">GLY 103L - Historical Geology Laboratory</a>  1 hr.</span></li>
                <li class="acalog-adhoc acalog-adhoc-after" style="list-style: none;">
                    <p>&#160;</p>
                </li>
                <li class="acalog-course"><span><a href="#" onClick="showCourse('17', '87507',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">MAR 151 - Introduction to Ocean Science</a>  3 hrs. AND</span></li>
                <li class="acalog-course"><span><a href="#" onClick="showCourse('17', '87508',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">MAR 151L - Introduction to Ocean Science Laboratory</a>  1 hr.</span></li>
                <li class="acalog-adhoc acalog-adhoc-after" style="list-style: none;">
                    <p>&#160;</p>
                </li>
                <li class="acalog-course"><span><a href="#" onClick="showCourse('17', '88573',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">PHY 103 - Introductory Physics</a>  3 hrs. AND</span></li>
                <li class="acalog-course"><span><a href="#" onClick="showCourse('17', '88574',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">PHY 103L - Introductory Physics Laboratory</a>  1 hr.</span></li>
                <li class="acalog-adhoc acalog-adhoc-after" style="list-style: none;">
                    <p>&#160;</p>
                </li>
                <li class="acalog-course"><span><a href="#" onClick="showCourse('17', '88575',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">PHY 111 - General Physics I</a>  3 hrs. AND</span></li>
                <li class="acalog-course"><span><a href="#" onClick="showCourse('17', '88576',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">PHY 111L - General Physics I Laboratory</a>  1 hr.</span></li>
                <li class="acalog-adhoc acalog-adhoc-after" style="list-style: none;">
                    <p>&#160;</p>
                </li>
                <li class="acalog-course"><span><a href="#" onClick="showCourse('17', '88577',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">PHY 112 - General Physics II</a>  3 hrs. AND</span></li>
                <li class="acalog-course"><span><a href="#" onClick="showCourse('17', '88578',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">PHY 112L - General Physics II Laboratory</a>  1 hr.</span></li>
                <li class="acalog-adhoc acalog-adhoc-after" style="list-style: none;">
                    <p>&#160;</p>
                </li>
                <li class="acalog-course"><span><a href="#" onClick="showCourse('17', '88580',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">PHY 201 - General Physics I with Calculus</a>  4 hrs. AND</span></li>
                <li class="acalog-course"><span><a href="#" onClick="showCourse('17', '88581',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">PHY 201L - General Physics I with Calculus Laboratory</a>  1 hr.</span></li>
                <li class="acalog-adhoc acalog-adhoc-after" style="list-style: none;">
                    <p>&#160;</p>
                </li>
                <li class="acalog-course"><span><a href="#" onClick="showCourse('17', '88582',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">PHY 202 - General Physics II with Calculus</a>  4 hrs. AND</span></li>
                <li class="acalog-course"><span><a href="#" onClick="showCourse('17', '88583',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">PHY 202L - General Physics II with Calculus Laboratory</a>  1 hr.</span></li>
                <li class="acalog-adhoc acalog-adhoc-after" style="list-style: none;">
                    <p>&#160;</p>
                </li>
                <li class="acalog-course"><span><a href="#" onClick="showCourse('17', '88661',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">PSC 190 - Living in a Material World</a>  3 hrs. AND</span></li>
                <li class="acalog-course"><span><a href="#" onClick="showCourse('17', '88662',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95238~;}'); return false;">PSC 190L - Lab for Living in a Material World</a>  1 hr.</span></li>
            </ul>
        </div>
        <div class="acalog-core">
            <h4><a name="GEC03Humanities9Hours"></a><a name="gec03humanities9hours" id="core_95240"></a>GEC 03. Humanities (9 hours)</h4>
            <hr>
            <ul>
                <li class="acalog-course"><span><a href="#" onClick="showCourse('17', '87537',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95240~;}'); return false;">ENG 203 - World Literature</a>  3 hrs.</span></li>
            </ul>
        </div>
        <div style="padding-left: 20px;">
            <div class="acalog-core">
                <h5><a name="Select2Courses1HistoryRequired"></a><a name="select2courses1historyrequired" id="core_95241"></a>Select 2 courses, 1 History required:</h5>
                <hr>
                <ul>
                    <li class="acalog-course"><span><a href="#" onClick="showCourse('17', '87272',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95241~;}'); return false;">HIS 101 - World Civilizations: Beginnings to 1500 C.E.</a>  3 hrs.</span></li>
                    <li class="acalog-course"><span><a href="#" onClick="showCourse('17', '87273',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95241~;}'); return false;">HIS 102 - World Civilizations: 1500 to the present</a>  3 hrs.</span></li>
                    <li class="acalog-course"><span><a href="#" onClick="showCourse('17', '88541',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95241~;}'); return false;">PHI 151 - Introduction to Philosophy</a>  3 hrs.</span></li>
                    <li class="acalog-course"><span><a href="#" onClick="showCourse('17', '88542',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95241~;}'); return false;">PHI 171 - Ethics and Good Living</a>  3 hrs.</span></li>
                    <li class="acalog-course"><span><a href="#" onClick="showCourse('17', '88756',this, 'a:2:{s:8:~location~;s:7:~program~;s:4:~core~;s:5:~95241~;}'); return false;">REL 131 - Comparative Religion</a>  3 hrs.</span></li>
                </ul>
            </div>

注意这看起来非常凌乱,而且确实如此。但我必须解析每个列表并使用<h4>标题作为标题,然后在每个<h4>下单独放置列表项,以便JSON文件如下所示:

[
 title: 'GEC 01',
 courseid: [
    'ENG 101',
    'ENG 102',
    ... etc
 ],
 coursetitle: [
    'Composition One  3 hrs.',
    'Composition Two  3 hrs.',
    ... etc
 ],
 labid: [
    //follows same format as above, if no lab this is empty
 ],
 labtitle: [
    //follows same format as above, if no lab this is empty
 ]

我目前正在将对象推送到一个数组中,然后在将数据全部解析为JSON文件类型之后输出该数组。

我目前的循环结构如下:

$('h4').each(function(i, elem){ // -- Looks for <h4> tagged items
  let data = new courses('');
  if($(this).text() !== '') { // -- Check to make sure <h4> isn't empty
    data.title = $(this).text(); // Set title to <h4> text
  }

    $('li').each(function (j, ele) { // -- Looks for <li> tagged items in <h4> items
      input = $(ele).find('span').text().split(" - "); // -- Get <span> text items and split them into input[0] and input[1]
      courseid = input[0]; // -- Puts for example CSC 101 into here
      coursename = input[1]; // -- Puts for example 'Introduction to computer science' into here

      // -- Make sure there is actual data --
      if (coursename !== '' && courseid !== '' && coursename !== undefined && coursename !== null) {

        // -- If there is a lab enter here
        if (courseid.indexOf("L", 7) !== -1) { // indexOf searches for char value, the 7 specifies location
          data.addli(courseid); // Input course value here

          //=========================Check for And || or to remove=========================
          if (coursename.indexOf("AND") !== -1 || coursename.indexOf("or") !== -1) {
            if (coursename.indexOf("AND") !== -1) { // Replace AND with nothing
              data.addln(coursename.replace(' AND', ''));
            }

            else { // Otherwise remove or
              data.addln(coursename.replace('or', ''));
            }
          }

          else { // -- If no 'AND' or 'or' then come here and push lab name
            data.addln(coursename);
          }
          //=========================END of check==========================================
        }

        // -- If there is no lab, enter here to push course
        else {
          data.addci(courseid);

          //=========================Check for And || or to remove=========================
          if (coursename.indexOf("AND") !== -1 || coursename.indexOf("or") !== -1) {
            if (coursename.indexOf("AND") !== -1) { // Replace AND with nothing
              data.addcn(coursename.replace(' AND', ''));
            }

            else { // Otherwise remove or
              data.addcn(coursename.replace('or', ''));
            }
          }

          else { // -- If no 'AND' or 'or' then come here and push lab name
            data.addcn(coursename);
          }
        }
      }
    });
    parsedResults.push(data);
});

我遇到的问题是li项目都被传入一次,我似乎无法根据它们属于哪个h4将它们分开。所以我的问题是,有没有办法阻止在特定li标签的末尾嵌套每个循环,然后转到下一个ul分组,然后获取这些项目?希望这是有道理的。

1 个答案:

答案 0 :(得分:0)

假设您要抓取其网址为abc.com的网站 npm install x-ray

  Location mlocation;

    @Override 
    public void onLocationChanged(Location location) { 
        // Add a marker in Sydney and move the camera 
        mLocation = location; 
        LatLng myLocation = new LatLng(mLocation.getLatitude(), mLocation.getLongitude());
        mMap.addMarker(new MarkerOptions()
                .position(myLocation)
                .icon(BitmapDescriptorFactory.defaultMarker(BitmapDescriptorFactory.HUE_RED))
                .title("My Location"));
        mMap.moveCamera(CameraUpdateFactory.newLatLng(myLocation));
        Log.d("location", "Latitude:" + mLocation.getLatitude() + "\n" + "Longitude:" + mLocation.getLongitude());
    }