Determining the last-page value of a URL

Date: 2019-04-15 05:57:55

Tags: excel

I wrote a web scraper that searches a website and returns the matches from that particular site. Many pages come back, so I extract all of them with a loop: I put the page numbers in column A of a worksheet, and those numbers (values) are used to go to the different web pages.

Is there a way to determine how many pages were fetched, i.e. to determine the last page?

At the moment I guess how many pages there might be. That works, but it would be better if I knew the exact number of web pages to loop through.

Sub GetPages()  ' "GetPages" is a placeholder name; the original snippet omitted the procedure header

    Dim i As Long
    Dim code As String
    Dim LastRow As Long
    Dim Input1 As String
    Dim Input2 As String
    Dim URLend As String

    ' Input1, Input2 and URLend must be assigned before the loop runs

    Sheets("PageNumbers").Select

    LastRow = Cells(Rows.Count, 1).End(xlUp).Row

    For i = 2 To LastRow

        ' page number taken from column A of the PageNumbers sheet
        code = Range("A" & i).Value

        ActiveWorkbook.Worksheets.Add
        With ActiveSheet.QueryTables.Add(Connection:= _
                "URL;http://www.blabla/blabla/in-'" & Input1 & "' + '" & _
                Input2 & "/list-" & code & URLend, Destination:=Range("$A$1"))

            .FieldNames = True
            .RowNumbers = False
            .FillAdjacentFormulas = False
            .PreserveFormatting = True
            .Refresh BackgroundQuery:=False  ' required; without it the query is defined but never runs

        End With

    Next i

    Sheets("Update").Select

End Sub
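
If the site simply returns an empty result once you request a page past the end, one way to avoid guessing is to fetch pages one at a time until a page comes back empty. Below is a minimal VBA sketch of that idea; the base URL, the empty-page test, and the page cap are all assumptions rather than details from the post:

Sub FindLastPage()

    Dim page As Long
    Dim ws As Worksheet
    ' BaseURL is a placeholder; substitute the real list URL
    Const BaseURL As String = "http://www.blabla/blabla/list-"

    page = 1
    Do While page <= 500  ' hard cap as a safety net against an endless loop

        Set ws = ActiveWorkbook.Worksheets.Add
        With ws.QueryTables.Add(Connection:="URL;" & BaseURL & page, _
                                Destination:=ws.Range("$A$1"))
            .Refresh BackgroundQuery:=False
        End With

        ' assumption: a page past the end returns no data at all
        If Application.WorksheetFunction.CountA(ws.Cells) = 0 Then Exit Do

        page = page + 1
    Loop

    MsgBox "Last page with data: " & (page - 1)

End Sub

Counting the fetched cells with CountA is a crude end-of-data test; a more robust check would look for the site's own pagination links on the fetched page.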

1 answer:

Answer 0 (score: 0)

One way is to keep track of the items you have already seen and treat a repeated item as the signal that you have run past the last page, for example with a Scrapy pipeline that drops duplicates:

import pymongo
from scrapy.exceptions import DropItem


class MongoPipeline(object):

    collection_name = 'articles'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db
        self.titles_seen = set()

    @classmethod
    def from_crawler(cls, crawler):
        ## pull in MONGO_URI / MONGO_DATABASE from settings.py
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE')
        )

    def open_spider(self, spider):
        ## open the db connection when the spider starts
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        ## clean up when the spider is closed
        self.client.close()

    def process_item(self, item, spider):
        ## a title seen before means this page repeats earlier content
        if item['title'] in self.titles_seen:
            raise DropItem("Duplicate item title found: %s" % item)
        else:
            self.titles_seen.add(item['title'])
            ## store the new item in MongoDB
            self.db[self.collection_name].insert_one(dict(item))
            return item
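
To use the pipeline, register MongoPipeline under ITEM_PIPELINES in the project's settings.py and define MONGO_URI and MONGO_DATABASE there, since from_crawler reads both values from the crawler settings.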