Question

I'm scraping some Chinese websites for a research project and have run into a problem that I just can't crack. Using Scrapy, I'm trying to follow the link in an `<a>` tag with the class name "up" (the tag is one of many `<a>` tags in a nav bar). The problem is that neither CSS nor XPath selectors are able to "grab" the tag. I'm able to fetch its parent div tag just fine, but when I try to access its children, Scrapy returns an empty list. Here is the structure of the parent div and its child tags:

    <div class="xz-page clearfix" style="margin-left:160px;">
        <script type="text/javascript"> ... </script>
        <a class="pre" href="javascript:void(0)" style="width:40px">上一页</a>
        <a class="num sel" href="javascript:void(0)">1</a>
        <a class="num" href="index_1.html">2</a>
        <a class="num" href="index_2.html">3</a>
        <a href="javascript:void(0)" style="border:none">...</a>
        <a class="num" href="index_697.html">698</a>
        <a class="num" href="index_698.html">699</a>
        <a class="up" href="index_1.html" style="width:40px">下一页</a>&nbsp;
    </div>

The link I'm trying to grab is the one in the last `<a>` tag (the one with class "up").

When I use the selector:

next_page = response.xpath('//div[@class = "xz-page clearfix"]')

It returns this:

[<Selector xpath='//div[@class = "xz-page clearfix"]' data='<div class="xz-page clearfix" style="mar'>]

But when I try to access the `<a>` tag with this:

next_page = response.xpath('//div[@class = "xz-page clearfix"]/a[@class ="up"]')

Or just this in an attempt to grab all children:

next_page = response.xpath('//div[@class = "xz-page clearfix"]/a')

It returns an empty list []

I've tried multiple iterations of the above selector, and each time I get an empty list.

Even just trying to get the `<a>` tag directly with this:

next_page = response.xpath('//a[@class = "up"]')

Returns an empty list

I suspect it has something to do with the script tag.

The script inside the script tag is as follows:

<script type="text/javascript">
/**
 * Writes a pagination bar into the document with document.write().
 *
 * Emits, in order: a "previous page" link, a link for page 1, links for the
 * pages near the current page and near both ends of the range (with a single
 * "..." placeholder for each run of hidden pages), and a "next page" link.
 * Page 1 is linked as "<name>.<ext>"; every later page is linked as
 * "<name>_<index>.<ext>", where <index> is the zero-based page index.
 *
 * @param {number} _nPageCount Total number of pages. Nothing is written when
 *     this is null/undefined or not greater than 1.
 * @param {number} _nCurrIndex Zero-based index of the current page; a missing
 *     or non-numeric value is treated as 0.
 * @param {string} _sPageName Base file name of the page files, e.g. "index".
 * @param {string} _sPageExt File extension without the dot, e.g. "html".
 */
function createPageHTML(_nPageCount, _nCurrIndex, _sPageName, _sPageExt){
  if(_nPageCount == null || _nPageCount <= 1){
    return;
  }

  // Coerce so a numeric string such as "3" is incremented arithmetically
  // instead of being concatenated when the next-page link is built.
  var nCurrIndex = Number(_nCurrIndex) || 0;

  // 1. "Previous page" link and the link for page 1.
  if(nCurrIndex === 0){
    // 1.1 The current page is the first page: both links are inert.
    document.write("<a class=\"pre\" href=\"javascript:void(0)\" style=\"width:40px\">上一页</a>");
    document.write("<a class=\"num sel\" href=\"javascript:void(0)\">1</a>");
  }
  else{
    // 1.2 The current page is not the first page. The first page's file name
    // carries no "_<index>" suffix, hence the empty suffix for index 0.
    var nPreIndex = nCurrIndex - 1;
    var sPreFileExt = nPreIndex === 0 ? "" : ("_" + nPreIndex);

    document.write("<a class=\"next\" href=\"" + _sPageName + sPreFileExt + "."+_sPageExt+"\" style=\"width:40px\">上一页</a>");
    document.write("<a class=\"num\" href=\""+_sPageName+"."+_sPageExt+"\">1</a>");
  }

  // 2. Links for the remaining pages. Every run of hidden pages is replaced by
  // one "..." placeholder, so remember whether it was already written on each
  // side of the current page.
  var bGapBeforeWritten = false;
  var bGapAfterWritten = false;

  for(var i = 1; i < _nPageCount; i++){
    var bNearCurrent = i - nCurrIndex < 3 && nCurrIndex - i < 3;
    var bNearEnds = i < 2 || _nPageCount - i < 3;

    if(bNearCurrent || bNearEnds){
      if(nCurrIndex === i){
        document.write("<a class=\"num sel\" href=\"javascript:void(0)\">"+(i+1)+"</a>");
      }
      else{
        document.write("<a class=\"num\" href=\""+_sPageName+"_" + i + "."+_sPageExt+"\">"+(i+1)+"</a>");
      }
    }
    else if(i > nCurrIndex){
      // Hidden page after the current one. Any hidden page triggers the
      // placeholder, so a gap of exactly one page is still marked.
      if(!bGapAfterWritten){
        bGapAfterWritten = true;
        document.write("<a href=\"javascript:void(0)\" style=\"border:none\">...</a>");
      }
    }
    else if(!bGapBeforeWritten){
      // Hidden page before the current one.
      bGapBeforeWritten = true;
      document.write("<a href=\"javascript:void(0)\" style=\"border:none\">...</a>");
    }
  }

  // 3. "Next page" link.
  if(nCurrIndex === _nPageCount - 1){
    // 3.1 The current page is the last page: the link is inert.
    document.write("<a  class=\"up\" href=\"javascript:void(0)\"  style=\"width:40px\">下一页</a>&nbsp;");
  }
  else{
    // 3.2 The current page is not the last page: link to the following page.
    var nNextIndex = nCurrIndex + 1;
    document.write("<a  class=\"up\" href=\""+_sPageName+"_" + nNextIndex + "."+_sPageExt+"\"  style=\"width:40px\">下一页</a>&nbsp;");
  }

}

// Write the pager for a 699-page listing with page index 0 (the first page) as
// the current page; links are built from the base name "index" and extension "html".
createPageHTML(699, 0, "index", "html");
</script>

I don't know JS so I'm not really sure what this code is doing. I have no problem accessing all other portions of the page I want to scrape, only this one tag is giving me issues. This is only one of the Chinese sites I'm having trouble with, however, the English sources I'm scraping have given me no problems. I'm not sure if the issue is my inexperience in web scraping or if there's something funky that happens with Chinese sites.

For reference, http://www.xizang.gov.cn/xwzx/qnyw/index.html is the page I'm trying to scrape. The navigation buttons at the bottom of the page are what's giving me trouble.

Any advice or help would be greatly appreciated. Thanks!

Can't figure out how to "grab" an <a> tag with a class name with Scrapy

0 个答案: