VBA从网站上刮取数据 - 空数据出错

时间:2016-01-11 15:33:34

标签: excel vba excel-vba web-scraping

我正在创建一个从网站上抓取数据的宏。我遇到的问题是当最后一个整页被抓取而且A列没有数据,但是其他列没有,我收到运行时1004错误。例如,如果要删除的总页数为6,并且列A在第5页上的最后一个条目上没有数据,则宏将在第5页上清除所有数据,但在尝试获取时会抛出运行时错误第6页也有数据,但我认为由于A列中没有数据,因此它只是决定给出运行时错误。有什么想法吗?另外,使用我所包含的代码,在下一个箭头消失之前,是否更容易进行宏循环?如果是这样,我将如何做到这一点?

'Macro to query Daily Activity Search for DFB Counties
'Run Monday to pull data from Friday

Sub queryActivityDailyMforFWorking()

Dim nextrow As Integer, i As Integer
Dim dates
dates = Date - 3

Application.ScreenUpdating = False
Application.DisplayStatusBar = True

Do While i <= 50
    Application.StatusBar = "Processing Page " & i
    nextrow = ActiveSheet.Cells(Rows.Count, "A").End(xlUp).row + 1
    With ActiveSheet.QueryTables.Add(Connection:= _
        "URL;https://www.myfloridalicense.com/delinquency_results.asp?SID=&page=" & i & "&county_1=16&county_1=21&county_1=23&county_1=32&county_1=36&county_1=41&county_1=46&county_1=53&county_1=54&county_1=57&county_1=60&county_1=66&status=R&send_date=" & dates & "&search_1.x=1", _
        Destination:=Range("A" & nextrow))

        '.Name = _
        "2015&search_1.x=40&search_1.y=11&date=on&county_1=AL&lic_num_del=&lic_num_rep=&status=NS&biz_name=&owner_name="
        .FieldNames = False
        .RowNumbers = False
        .FillAdjacentFormulas = False
        .PreserveFormatting = True
        .RefreshOnFileOpen = False
        .BackgroundQuery = True
        .RefreshStyle = xlInsertDeleteCells
        .SavePassword = False
        .SaveData = True
        .AdjustColumnWidth = True
        .RefreshPeriod = 0
        .WebSelectionType = xlSpecifiedTables
        .WebFormatting = xlWebFormattingNone
        .WebTables = "10"
        .WebPreFormattedTextToColumns = True
        .WebConsecutiveDelimitersAsOne = True
        .WebSingleBlockTextImport = False
        .WebDisableDateRecognition = False
        .WebDisableRedirections = False
        .Refresh BackgroundQuery:=False

    'autofit columns
    Columns("A:G").Select
    Selection.EntireColumn.AutoFit

   'check for filter, if not then turn on filter
   ActiveSheet.AutoFilterMode = False
    If Not ActiveSheet.AutoFilterMode Then
    ActiveSheet.Range("A:G").AutoFilter
    End If

i = i + 1
End With
Application.StatusBar = False

'Align text left
Cells.Select
With Selection
    .HorizontalAlignment = xlLeft
    .VerticalAlignment = xlBottom
    .WrapText = False
    .Orientation = 0
    .AddIndent = False
    .IndentLevel = 0
    .ShrinkToFit = False
    .ReadingOrder = xlContext
    .MergeCells = False
End With

Loop

End Sub

1 个答案:

答案 0 :(得分:1)

我无法复制您的错误,但我猜这与您的nextrow变量有关。如果页面上的数据以空单元格结尾,则下一页数据的nextrow值将设置在前一页面的数据中。我认为当您添加另一个查询表然后尝试刷新数据时会导致一些问题,因为表将重叠。如果你知道一个总是有每行数据的那一行,你可以通过获取其他列之一的底行来解决这个问题。我做了一些更新,似乎对我来说效果很好:

  • 添加了错误处理
  • 检查A和B列是否存在底行数据
  • 添加了一些逻辑来检查是否已返回完整页面,如果没有退出循环,那么您不必继续解析空白页面
  • 格式化连接字符串中的日期,因为我发现过去导致问题
  • 如果您不想要它们,则添加了删除标题的选项
  • 将单元格格式化移出循环,使其仅执行一次

希望这有帮助。

Sub queryActivityDailyMforFWorking()
On Error GoTo Err_queryActivityDailyMforFWorking

Const RowsPerPage As Byte = 20
Const DeleteHeader As Boolean = True

Dim nextrow As Integer, maxrow As Integer, i As Integer
Dim dates As Date

dates = Date - 3

Application.ScreenUpdating = False
Application.DisplayStatusBar = True

nextrow = 1
For i = 1 To 50
    Application.StatusBar = "Processing Page " & i
    With ActiveSheet.QueryTables.Add(Connection:= _
        "URL;https://www.myfloridalicense.com/delinquency_results.asp?SID=&page=" & i & "&county_1=16&county_1=21&county_1=23&county_1=32&county_1=36&county_1=41&county_1=46&county_1=53&county_1=54&county_1=57&county_1=60&county_1=66&status=R&send_date=" & Format(dates, "m/d/yyyy") & "&search_1.x=1", _
        Destination:=Range("A" & nextrow))
        '.Name = _
        "2015&search_1.x=40&search_1.y=11&date=on&county_1=AL&lic_num_del=&lic_num_rep=&status=NS&biz_name=&owner_name="
        .FieldNames = False
        .RowNumbers = False
        .FillAdjacentFormulas = False
        .PreserveFormatting = True
        .RefreshOnFileOpen = False
        .BackgroundQuery = True
        .RefreshStyle = xlInsertDeleteCells
        .SavePassword = False
        .SaveData = True
        .AdjustColumnWidth = True
        .RefreshPeriod = 0
        .WebSelectionType = xlSpecifiedTables
        .WebFormatting = xlWebFormattingNone
        .WebTables = "10"
        .WebPreFormattedTextToColumns = True
        .WebConsecutiveDelimitersAsOne = True
        .WebSingleBlockTextImport = False
        .WebDisableDateRecognition = False
        .WebDisableRedirections = False
        .Refresh BackgroundQuery:=False
    End With

    ' Delete the header as required
    If DeleteHeader And i > 1 And ActiveSheet.Cells(nextrow, 1).Value = "License" Then ActiveSheet.Cells(nextrow, 1).EntireRow.Delete

    ' Find the bottom row
    maxrow = Application.WorksheetFunction.Max(ActiveSheet.Cells(Rows.Count, 1).End(xlUp).Row, ActiveSheet.Cells(Rows.Count, 2).End(xlUp).Row)
    ' Stop scraping if a full page wasn't returned
    If (maxrow - nextrow) < (RowsPerPage - IIf(DeleteHeader, 1, 0)) Then
        Exit For
    ' Otherwise set the row for the next page of data
    Else
        nextrow = maxrow + 1
    End If
Next i

Application.StatusBar = "Formatting data"

'autofit columns
ActiveSheet.Columns.EntireColumn.AutoFit

'check for filter, if not then turn on filter
ActiveSheet.AutoFilterMode = False
If Not ActiveSheet.AutoFilterMode Then ActiveSheet.Range("A:G").AutoFilter

'Align text left
With ActiveSheet.Cells
    .HorizontalAlignment = xlLeft
    .VerticalAlignment = xlBottom
    .WrapText = False
    .Orientation = 0
    .AddIndent = False
    .IndentLevel = 0
    .ShrinkToFit = False
    .ReadingOrder = xlContext
    .MergeCells = False
End With

Exit_queryActivityDailyMforFWorking:
    Application.StatusBar = False
    Application.ScreenUpdating = True
    Exit Sub

Err_queryActivityDailyMforFWorking:
    MsgBox Err.Description, vbCritical + vbOKOnly, Err.Number & " - Web Scraping Error"
    Resume Exit_queryActivityDailyMforFWorking

End Sub
相关问题