带有scrapy

时间:2017-06-20 15:34:58

标签: python json scrapy

这是我的基本scrapy抓取工具:

  def parse(self, response):
    """Collect the cruise fields from ``response`` into one ``CruiseItem``.

    Every field stored under ``item['Cruise']`` is whatever list
    ``extract()`` returns for its XPath query, so all matches on the page
    end up together in a single item.
    """
    # (field name, XPath query) pairs, evaluated in this order.
    field_queries = (
        ('Cruiseline', '//title/text()'),
        ('Itinerary', '//*[@id="brochureName1"]/text()'),
        ('Price', '//*[@id="interiorPrice1"]/text()'),
        ('PerNight', '//*[@id="perNightinteriorPrice1"]/text()'),
    )

    item = CruiseItem()
    cruise = {}
    item['Cruise'] = cruise

    for field, query in field_queries:
        cruise[field] = response.xpath(query).extract()

    return item

这段代码可以很好地抓取到我想要的所有元素。例如,我的 JSON feed 结果如下:

[

{
    "Cruise": {
        "Cruiseline": [
            "Ship Name"
        ],
        "Itinerary": [
            "3 Night Bahamas ",
            "4 Night Western Caribbean ",
            "4 Night Bahamas ",
            "3 Night Bahamas ",
            "5 Night Western Caribbean ",
            "5 Night Eastern Caribbean ",
            "7 Night Western Caribbean ",
            "7 Night Southern Caribbean ",
            "6 Night Western Caribbean ",
            "7 Night Western Caribbean ",
            "8 Night Eastern Caribbean "
        ],
        "Price": [
            "$169",
            "$179",
            "$289",
            "$349",
            "$359",
            "$389",
            "$389",
            "$409",
            "$424",
            "$524",
            "$939"
        ],
        "PerNight": [
            "$56/night",
            "$45/night",
            "$72/night",
            "$116/night",
            "$72/night",
            "$78/night",
            "$56/night",
            "$58/night",
            "$71/night",
            "$75/night",
            "$117/night"
        ]
    }
}
]

但我想要的目标 JSON 输出与此不同:

[

{
    "Cruise": {
        "Cruiseline": [
            "Ship Name"
        ],
        "Itinerary": [
            "3 Night Bahamas "
        ],
        "Price": [
            "$169"
        ],
        "PerNight": [
            "$56/night"

        ]
    },
    "Cruise": {
        "Cruiseline": [
            "Ship Name"
        ],
        "Itinerary": [
            "4 Night Bahamas "
        ],
        "Price": [
            "$79"
        ],
        "PerNight": [
            "$86/night"
        ]
    }
}
]

基本上,我想让每个航线单独返回一条记录,每条记录只包含 1 个船名、1 个行程、1 个价格和 1 个每晚价格。

这样说清楚吗?欢迎讨论。

编辑:几天前问过这个,但决定澄清并重新发布。谢谢!

2 个答案:

答案 0 :(得分:0)

尝试使用此脚本重新格式化数据。格式化后的数据将保存在 updated_list 中。

# Sample feed as produced by the spider: one object whose fields are
# parallel lists (entry i of Itinerary, Price and PerNight belong together).
cruise_list = [

{
    "Cruise": {
        "Cruiseline": [
            "Ship Name"
        ],
        "Itinerary": [
            "3 Night Bahamas ",
            "4 Night Western Caribbean ",
            "4 Night Bahamas ",
            "3 Night Bahamas ",
            "5 Night Western Caribbean ",
            "5 Night Eastern Caribbean ",
            "7 Night Western Caribbean ",
            "7 Night Southern Caribbean ",
            "6 Night Western Caribbean ",
            "7 Night Western Caribbean ",
            "8 Night Eastern Caribbean "
        ],
        "Price": [
            "$169",
            "$179",
            "$289",
            "$349",
            "$359",
            "$389",
            "$389",
            "$409",
            "$424",
            "$524",
            "$939"
        ],
        "PerNight": [
            "$56/night",
            "$45/night",
            "$72/night",
            "$116/night",
            "$72/night",
            "$78/night",
            "$56/night",
            "$58/night",
            "$71/night",
            "$75/night",
            "$117/night"
        ]
    }
}
]

# Split every combined object into one record per itinerary; the result
# is collected in updated_list.
updated_list = []

for cruise_obj in cruise_list:
    cruise_data = cruise_obj['Cruise']
    # Walk the parallel lists in lockstep. zip stops at the shortest list,
    # so unmatched trailing entries are left out.
    rows = zip(
        cruise_data['Itinerary'],
        cruise_data['Price'],
        cruise_data['PerNight'],
    )
    for itinerary, price, per_night in rows:
        updated_list.append({
            'Cruise': {
                # Copy the list so each record owns its Cruiseline and
                # editing one record cannot change the others.
                'Cruiseline': list(cruise_data['Cruiseline']),
                'Itinerary': [itinerary],
                'Price': [price],
                'PerNight': [per_night],
            }
        })

其他一些想法

  • 如果您的 JSON 中存储的唯一内容就是 Cruise 对象,那么最外层的 Cruise 键是多余的

  • 很多地方,您把数据存进了并不需要的数组里。我猜这是 scrapy 的行为所致,但您应该尝试修改我的脚本,去掉单值字段外面的数组。例如,一个 Cruise 对象不应该有多个 Cruiseline。如果您需要帮助,请告诉我。

答案 1 :(得分:0)

想出来了。

''' <summary>
''' Loads the current page of the saved stock query into ListView1 and
''' resets ListView2 to an empty set of transaction columns.
''' </summary>
''' <remarks>
''' Steps, in order:
'''  1. Re-open the ADODB connection/recordset using the page size from cboPerPage.
'''  2. Clamp iPageCurrent to 1..iPageCount and position the recordset on that page.
'''  3. Rebuild ListView1's columns and add one striped row per record.
'''  4. For each row, run a per-item query and, for consumable items, a second
'''     query to decide whether the quantity cell is shown in red or black.
'''  5. Enable/disable the paging buttons according to the current page.
''' Any exception is shown in a message box and the ADODB objects are closed.
''' </remarks>
Public Sub LoadListview()
    Dim IT As Integer
    Dim currentSize As Integer
    ' VB array bounds are inclusive, so this has slots 0..10; only 0..9 are filled below.
    Dim arrValues(10) As String

    ConnectionCheck()

    Try
        iPageSize = cboPerPage.Text
        ' Best-effort close of anything left open by a previous load;
        ' failures here are deliberately ignored.
        Try
            rsMPCS_Con.Close()
        Catch ex As Exception
        End Try
        Try
            conMPCS2.Close()
        Catch ex As Exception
        End Try

        If conMPCS2.State = 0 Then
            conMPCS2.Open(ConnectionStringToORACLE)
        End If
        rsMPCS_Con.PageSize = iPageSize
        rsMPCS_Con.CacheSize = iPageSize
        rsMPCS_Con.Open(UCase(LastSavedSTSQL), conMPCS2, ADODB.CursorTypeEnum.adOpenStatic, ADODB.LockTypeEnum.adLockReadOnly)

        If Not rsMPCS_Con.EOF Then
            iPageCount = rsMPCS_Con.PageCount

            ' Keep the requested page inside the valid range.
            If iPageCurrent > iPageCount Then iPageCurrent = iPageCount
            If iPageCurrent < 1 Then iPageCurrent = 1

            rsMPCS_Con.AbsolutePage = iPageCurrent
            lblpageofpage.Text = "Page: " & iPageCurrent & " of " & iPageCount
            With ListView1
                .Clear()
                .View = View.Details
                .FullRowSelect = True
                .GridLines = True
                .CheckBoxes = False
                currentSize = 10
                .Font = New Font(.Font.Name, currentSize, .Font.Style, .Font.Unit)
                .Columns.Add("Description", 300, HorizontalAlignment.Left)
                .Columns.Add("Quan.", 50, HorizontalAlignment.Center)
                .Columns.Add("Min. Qty", 70, HorizontalAlignment.Center)
                .Columns.Add("Build No.", 65, HorizontalAlignment.Center)
                .Columns.Add("Rev.", 60, HorizontalAlignment.Center)
                .Columns.Add("Weight/UOM", 110, HorizontalAlignment.Center)
                .Columns.Add("Home Location", 110, HorizontalAlignment.Center)
                .Columns.Add("Sub Location", 90, HorizontalAlignment.Center)
                ' Zero-width column: keeps SI_KEY on the row without displaying it.
                .Columns.Add("SI_KEY", 0, HorizontalAlignment.Center)
                .Columns.Add("Email Alerts", 90, HorizontalAlignment.Center)
            End With

            iRecordsShown = 0

            ' Add at most one page worth of records to ListView1.
            Do While iRecordsShown < iPageSize And Not rsMPCS_Con.EOF
                iRecordsShown = iRecordsShown + 1
                arrValues(0) = rsMPCS_Con("description").Value
                arrValues(1) = rsMPCS_Con("StockQuan").Value
                arrValues(2) = rsMPCS_Con("StockMin").Value
                'fixtures
                ' Build number and revision are both left blank when BUILD_NO is 0.
                If rsMPCS_Con("BUILD_NO").Value = 0 Then
                    arrValues(3) = ""
                    arrValues(4) = ""
                Else
                    arrValues(3) = rsMPCS_Con("BUILD_NO").Value.ToString
                    arrValues(4) = rsMPCS_Con("REVISION").Value.ToString
                End If
                '--------
                arrValues(5) = rsMPCS_Con("Weight").Value & " " & rsMPCS_Con("STOCK_UOM").Value
                arrValues(6) = rsMPCS_Con("LOCATION").Value
                arrValues(7) = rsMPCS_Con("SUB_LOCATION").Value
                arrValues(8) = rsMPCS_Con("SI_KEY").Value
                ' Compare the text itself: IsNot would compare object references,
                ' not string contents.
                If rsMPCS_Con("Email").Value.ToString <> "" AndAlso rsMPCS_Con("StockMin").Value > 0 Then
                    arrValues(9) = "Enabled"
                Else
                    arrValues(9) = ""
                End If

                IT += 1
                Dim lsvi As New ListViewItem(arrValues)
                ' Alternate row backgrounds: odd rows ivory, even rows white.
                Dim rowColor As Color
                If IT Mod 2 <> 0 Then
                    rowColor = Color.Ivory
                Else
                    rowColor = Color.White
                End If
                For Each row As ListViewItem.ListViewSubItem In lsvi.SubItems
                    row.BackColor = rowColor
                Next
                ListView1.Items.Add(lsvi)
                rsMPCS_Con.MoveNext()
            Loop

            ' Per-row follow-up queries (one connection per row, plus a second
            ' one for consumable items) to colour the quantity cell.
            For i As Integer = ListView1.Items.Count - 1 To 0 Step -1
                ' Required so the per-subitem colours set on this row are honoured.
                ListView1.Items(i).UseItemStyleForSubItems = False
                query = "REDACTED"
                Using Conn As New OleDbConnection(ConnectionStringToORACLE)
                    Using CMD As New OleDbCommand()
                        With CMD
                            .Connection = Conn
                            .CommandType = CommandType.Text
                            .CommandText = query
                            .Parameters.AddWithValue("SI_KEY", ListView1.Items(i).SubItems(8).Text)
                        End With
                        Conn.Open()
                        ' Using disposes the reader; Read() is checked so a row with
                        ' no matching record is skipped instead of raising.
                        Using rowreadconsume As OleDbDataReader = CMD.ExecuteReader()
                            If rowreadconsume.Read() AndAlso rowreadconsume("CONSUMABLE") = 1 Then

                                query = "REDACTED"
                                Using Conn2 As New OleDbConnection(ConnectionStringToORACLE)
                                    Using CMD2 As New OleDbCommand()
                                        With CMD2
                                            .Connection = Conn2
                                            .CommandType = CommandType.Text
                                            .CommandText = query
                                            .Parameters.AddWithValue("SI_KEY", ListView1.Items(i).SubItems(8).Text)
                                            .Parameters.AddWithValue("LOCATION", Stockkey)
                                        End With
                                        Conn2.Open()
                                        Using rowreadquan As OleDbDataReader = CMD2.ExecuteReader()
                                            If rowreadquan.Read() Then
                                                ' Red quantity when stock is at or below the low-stock level.
                                                If rowreadquan("QUANTITY_IN_STOCK") <= rowreadquan("LOW_STOCK_QTY") Then
                                                    ListView1.Items(i).SubItems(1).ForeColor = Color.Red
                                                Else
                                                    ListView1.Items(i).SubItems(1).ForeColor = Color.Black
                                                End If
                                            End If
                                        End Using
                                    End Using
                                End Using
                            End If
                        End Using
                    End Using
                End Using
            Next

            rsMPCS_Con.Close()
            'ListView1.Sort()
            'ListView1.Sorting = SortOrder.Descending
            ' Reset ListView2 to an empty list with its column headers.
            With ListView2
                .Clear()
                .View = View.Details
                .FullRowSelect = True
                .GridLines = True
                .CheckBoxes = False
                currentSize = 8
                .Font = New Font(.Font.Name, currentSize, .Font.Style, .Font.Unit)
                .Columns.Add("Date/Time", 150, HorizontalAlignment.Left)
                .Columns.Add("Employee", 75, HorizontalAlignment.Left)
                .Columns.Add("Transaction Amount", 125, HorizontalAlignment.Center)
                .Columns.Add("Job Number", 100, HorizontalAlignment.Center)
                .Columns.Add("Machine No.", 100, HorizontalAlignment.Center)
                .Columns.Add("Department No.", 100, HorizontalAlignment.Center)
                .Columns.Add("Comments", 400, HorizontalAlignment.Left)
            End With
            PanelLoading.Visible = False
            ListView1.Select()
        Else
            ' No records: just empty both lists.
            ListView1.Items.Clear()
            ListView2.Items.Clear()
        End If
        If rsMPCS_Con.State = 1 Then rsMPCS_Con.Close()
        If conMPCS2.State = 1 Then conMPCS2.Close()
        ' Enable all paging buttons, then disable those that cannot move
        ' from the current page.
        cmdNextPage.Enabled = True
        cmdNextPageEnd.Enabled = True
        cmdPrevPage.Enabled = True
        cmdPrevPageEnd.Enabled = True
        If iPageCurrent = iPageCount Then
            cmdNextPage.Enabled = False
            cmdNextPageEnd.Enabled = False
        End If
        If iPageCurrent = 1 Then
            cmdPrevPage.Enabled = False
            cmdPrevPageEnd.Enabled = False
        End If
        CLOSECONNECTION()
    Catch ex As Exception
        MessageBox.Show(ex.ToString)
        If rsMPCS_Con.State = 1 Then rsMPCS_Con.Close()
        If conMPCS2.State = 1 Then conMPCS2.Close()
    End Try

End Sub