这是我的基本scrapy抓取工具:
def parse(self, response):
    """Collect every cruise field found on the page into a single CruiseItem.

    Each field holds the complete list of strings matched by its XPath, so
    the returned item carries parallel lists rather than one entry per
    cruise.
    """
    # (field name, XPath) pairs, in the order they appear in the output.
    field_xpaths = (
        ('Cruiseline', '//title/text()'),
        ('Itinerary', '//*[@id="brochureName1"]/text()'),
        ('Price', '//*[@id="interiorPrice1"]/text()'),
        ('PerNight', '//*[@id="perNightinteriorPrice1"]/text()'),
    )

    item = CruiseItem()
    item['Cruise'] = {}
    cruise = item['Cruise']
    for field, xpath in field_xpaths:
        cruise[field] = response.xpath(xpath).extract()
    return item
这段代码可以正确抓取到我想要的所有元素。例如,我的 JSON feed 输出如下:
[
{
"Cruise": {
"Cruiseline": [
"Ship Name"
],
"Itinerary": [
"3 Night Bahamas ",
"4 Night Western Caribbean ",
"4 Night Bahamas ",
"3 Night Bahamas ",
"5 Night Western Caribbean ",
"5 Night Eastern Caribbean ",
"7 Night Western Caribbean ",
"7 Night Southern Caribbean ",
"6 Night Western Caribbean ",
"7 Night Western Caribbean ",
"8 Night Eastern Caribbean "
],
"Price": [
"$169",
"$179",
"$289",
"$349",
"$359",
"$389",
"$389",
"$409",
"$424",
"$524",
"$939"
],
"PerNight": [
"$56/night",
"$45/night",
"$72/night",
"$116/night",
"$72/night",
"$78/night",
"$56/night",
"$58/night",
"$71/night",
"$75/night",
"$117/night"
]
}
}
]
但我期望的 JSON 输出与此不同:
[
{
"Cruise": {
"Cruiseline": [
"Ship Name"
],
"Itinerary": [
"3 Night Bahamas "
],
"Price": [
"$169"
],
"PerNight": [
"$56/night"
]
},
"Cruise": {
"Cruiseline": [
"Ship Name"
],
"Itinerary": [
"4 Night Bahamas "
],
"Price": [
"$79"
],
"PerNight": [
"$86/night"
]
}
}
]
基本上,我希望每个邮轮航次单独返回一条记录,每条记录只包含 1 个船名、1 个行程、1 个价格和 1 个每晚价格。
这样说清楚吗?很乐意进一步讨论。
编辑:几天前问过这个,但决定澄清并重新发布。谢谢!
答案 0 :(得分:0)
尝试使用下面的脚本重新格式化数据。格式化后的数据将保存在 updated_list 中。
# Reshape the scraped feed: the input holds one entry whose fields are
# parallel lists; the output (``updated_list``) holds one entry per
# itinerary, each with single-element lists.
cruise_list = [
    {
        "Cruise": {
            "Cruiseline": [
                "Ship Name"
            ],
            "Itinerary": [
                "3 Night Bahamas ",
                "4 Night Western Caribbean ",
                "4 Night Bahamas ",
                "3 Night Bahamas ",
                "5 Night Western Caribbean ",
                "5 Night Eastern Caribbean ",
                "7 Night Western Caribbean ",
                "7 Night Southern Caribbean ",
                "6 Night Western Caribbean ",
                "7 Night Western Caribbean ",
                "8 Night Eastern Caribbean "
            ],
            "Price": [
                "$169",
                "$179",
                "$289",
                "$349",
                "$359",
                "$389",
                "$389",
                "$409",
                "$424",
                "$524",
                "$939"
            ],
            "PerNight": [
                "$56/night",
                "$45/night",
                "$72/night",
                "$116/night",
                "$72/night",
                "$78/night",
                "$56/night",
                "$58/night",
                "$71/night",
                "$75/night",
                "$117/night"
            ]
        }
    }
]

updated_list = []
for cruise_obj in cruise_list:
    cruise_data = cruise_obj['Cruise']
    for i, itinerary in enumerate(cruise_data['Itinerary']):
        updated_list.append({
            'Cruise': {
                # Copy the list so the sub-items do not all share (and
                # accidentally co-mutate) one Cruiseline list object.
                'Cruiseline': list(cruise_data['Cruiseline']),
                'Itinerary': [itinerary],
                # Indexed on purpose: if Price/PerNight are shorter than
                # Itinerary the data is misaligned, and an IndexError is
                # preferable to silently dropping rows.
                'Price': [cruise_data['Price'][i]],
                'PerNight': [cruise_data['PerNight'][i]],
            }
        })
其他一些想法
如果您的 JSON 中存储的唯一内容就是邮轮对象,那么最外层的 Cruise 键是多余的。
很多地方,您把数据存进了并不需要的数组里。我猜这是 Scrapy 的行为所致,但您应该尝试修改我的脚本,去掉那些只含单一值的数组。例如,一个邮轮对象不应该有多个 Cruiseline。如果您需要帮助,请告诉我。
答案 1 :(得分:0)
想出来。
''' <summary>
''' Loads the current page of the LastSavedSTSQL result set into ListView1,
''' colours the quantity cell of consumable items by stock level, rebuilds
''' ListView2's columns, and enables/disables the paging buttons.
''' </summary>
''' <remarks>
''' Any exception raised while loading is shown in a message box; the ADO
''' recordset and connection are then closed if still open. Nothing is
''' rethrown to the caller.
''' </remarks>
Public Sub LoadListview()
    Dim IT As Integer
    Dim currentSize As Integer
    ' Upper bound 10 gives 11 slots; only indexes 0-9 are filled below,
    ' one per ListView1 column.
    Dim arrValues(10) As String
    ConnectionCheck()
    Try
        iPageSize = cboPerPage.Text

        ' Best-effort close of whatever an earlier call left open; a failure
        ' here is deliberately ignored.
        Try
            rsMPCS_Con.Close()
        Catch ex As Exception
        End Try
        Try
            conMPCS2.Close()
        Catch ex As Exception
        End Try

        If conMPCS2.State = 0 Then
            conMPCS2.Open(ConnectionStringToORACLE)
        End If
        rsMPCS_Con.PageSize = iPageSize
        rsMPCS_Con.CacheSize = iPageSize
        rsMPCS_Con.Open(UCase(LastSavedSTSQL), conMPCS2, ADODB.CursorTypeEnum.adOpenStatic, ADODB.LockTypeEnum.adLockReadOnly)

        If Not rsMPCS_Con.EOF Then
            iPageCount = rsMPCS_Con.PageCount
            ' Clamp the requested page into the range 1..iPageCount.
            If iPageCurrent > iPageCount Then iPageCurrent = iPageCount
            If iPageCurrent < 1 Then iPageCurrent = 1
            rsMPCS_Con.AbsolutePage = iPageCurrent
            lblpageofpage.Text = "Page: " & iPageCurrent & " of " & iPageCount

            With ListView1
                .Clear()
                .View = View.Details
                .FullRowSelect = True
                .GridLines = True
                .CheckBoxes = False
                currentSize = 10
                .Font = New Font(.Font.Name, currentSize, .Font.Style, .Font.Unit)
                .Columns.Add("Description", 300, HorizontalAlignment.Left)
                .Columns.Add("Quan.", 50, HorizontalAlignment.Center)
                .Columns.Add("Min. Qty", 70, HorizontalAlignment.Center)
                .Columns.Add("Build No.", 65, HorizontalAlignment.Center)
                .Columns.Add("Rev.", 60, HorizontalAlignment.Center)
                .Columns.Add("Weight/UOM", 110, HorizontalAlignment.Center)
                .Columns.Add("Home Location", 110, HorizontalAlignment.Center)
                .Columns.Add("Sub Location", 90, HorizontalAlignment.Center)
                ' Zero-width column: keeps SI_KEY on the row without showing it.
                .Columns.Add("SI_KEY", 0, HorizontalAlignment.Center)
                .Columns.Add("Email Alerts", 90, HorizontalAlignment.Center)
            End With

            ' Copy at most one page of records into ListView1.
            iRecordsShown = 0
            Do While iRecordsShown < iPageSize And Not rsMPCS_Con.EOF
                iRecordsShown = iRecordsShown + 1
                arrValues(0) = rsMPCS_Con("description").Value
                arrValues(1) = rsMPCS_Con("StockQuan").Value
                arrValues(2) = rsMPCS_Con("StockMin").Value
                'fixtures
                ' Build number and revision are both blanked when BUILD_NO is 0.
                If rsMPCS_Con("BUILD_NO").Value = 0 Then
                    arrValues(3) = ""
                    arrValues(4) = ""
                Else
                    arrValues(3) = rsMPCS_Con("BUILD_NO").Value.ToString
                    arrValues(4) = rsMPCS_Con("REVISION").Value.ToString
                End If
                '--------
                arrValues(5) = rsMPCS_Con("Weight").Value & " " & rsMPCS_Con("STOCK_UOM").Value
                arrValues(6) = rsMPCS_Con("LOCATION").Value
                arrValues(7) = rsMPCS_Con("SUB_LOCATION").Value
                arrValues(8) = rsMPCS_Con("SI_KEY").Value
                ' Compare the text by value; IsNot would test reference
                ' identity rather than string content.
                If rsMPCS_Con("Email").Value.ToString <> "" And rsMPCS_Con("StockMin").Value > 0 Then
                    arrValues(9) = "Enabled"
                Else
                    arrValues(9) = ""
                End If

                ' Alternate row background: odd rows ivory, even rows white.
                IT += 1
                Dim stripe As Color
                If IT Mod 2 <> 0 Then
                    stripe = Color.Ivory
                Else
                    stripe = Color.White
                End If
                Dim lsvi As New ListViewItem(arrValues)
                For Each row As ListViewItem.ListViewSubItem In lsvi.SubItems
                    row.BackColor = stripe
                Next
                ListView1.Items.Add(lsvi)
                rsMPCS_Con.MoveNext()
            Loop

            ' Second pass: for each listed row, look up whether it is a
            ' consumable and, if so, colour its quantity cell by stock level.
            For i As Integer = ListView1.Items.Count - 1 To 0 Step -1
                ' Required so per-subitem colours are honoured.
                ListView1.Items(i).UseItemStyleForSubItems = False
                query = "REDACTED"
                Using Conn As New OleDbConnection(ConnectionStringToORACLE)
                    Using CMD As New OleDbCommand()
                        With CMD
                            .Connection = Conn
                            .CommandType = CommandType.Text
                            .CommandText = query
                            .Parameters.AddWithValue("SI_KEY", ListView1.Items(i).SubItems(8).Text)
                        End With
                        Conn.Open()
                        ' Using disposes the reader; Read() is checked so a
                        ' missing row leaves the cell untouched instead of
                        ' aborting the whole page load.
                        Using rowreadconsume As OleDbDataReader = CMD.ExecuteReader()
                            If rowreadconsume.Read() Then
                                If rowreadconsume("CONSUMABLE") = 1 Then
                                    query = "REDACTED"
                                    Using Conn2 As New OleDbConnection(ConnectionStringToORACLE)
                                        Using CMD2 As New OleDbCommand()
                                            With CMD2
                                                .Connection = Conn2
                                                .CommandType = CommandType.Text
                                                .CommandText = query
                                                .Parameters.AddWithValue("SI_KEY", ListView1.Items(i).SubItems(8).Text)
                                                .Parameters.AddWithValue("LOCATION", Stockkey)
                                            End With
                                            Conn2.Open()
                                            Using rowreadquan As OleDbDataReader = CMD2.ExecuteReader()
                                                If rowreadquan.Read() Then
                                                    ' Red when stock is at or below the low-stock threshold.
                                                    If rowreadquan("QUANTITY_IN_STOCK") <= rowreadquan("LOW_STOCK_QTY") Then
                                                        ListView1.Items(i).SubItems(1).ForeColor = Color.Red
                                                    Else
                                                        ListView1.Items(i).SubItems(1).ForeColor = Color.Black
                                                    End If
                                                End If
                                            End Using
                                        End Using
                                    End Using
                                End If
                            End If
                        End Using
                    End Using
                End Using
            Next

            rsMPCS_Con.Close()
            'ListView1.Sort()
            'ListView1.Sorting = SortOrder.Descending

            ' Reset ListView2 to an empty grid with its column headers.
            With ListView2
                .Clear()
                .View = View.Details
                .FullRowSelect = True
                .GridLines = True
                .CheckBoxes = False
                currentSize = 8
                .Font = New Font(.Font.Name, currentSize, .Font.Style, .Font.Unit)
                .Columns.Add("Date/Time", 150, HorizontalAlignment.Left)
                .Columns.Add("Employee", 75, HorizontalAlignment.Left)
                .Columns.Add("Transaction Amount", 125, HorizontalAlignment.Center)
                .Columns.Add("Job Number", 100, HorizontalAlignment.Center)
                .Columns.Add("Machine No.", 100, HorizontalAlignment.Center)
                .Columns.Add("Department No.", 100, HorizontalAlignment.Center)
                .Columns.Add("Comments", 400, HorizontalAlignment.Left)
            End With
            PanelLoading.Visible = False
            ListView1.Select()
        Else
            ' No records: just empty both lists.
            ListView1.Items.Clear()
            ListView2.Items.Clear()
        End If

        If rsMPCS_Con.State = 1 Then rsMPCS_Con.Close()
        If conMPCS2.State = 1 Then conMPCS2.Close()

        ' Enable all paging buttons, then disable those that would move past
        ' the first or last page.
        cmdNextPage.Enabled = True
        cmdNextPageEnd.Enabled = True
        cmdPrevPage.Enabled = True
        cmdPrevPageEnd.Enabled = True
        If iPageCurrent = iPageCount Then
            cmdNextPage.Enabled = False
            cmdNextPageEnd.Enabled = False
        End If
        If iPageCurrent = 1 Then
            cmdPrevPage.Enabled = False
            cmdPrevPageEnd.Enabled = False
        End If
        CLOSECONNECTION()
    Catch ex As Exception
        MessageBox.Show(ex.ToString)
        If rsMPCS_Con.State = 1 Then rsMPCS_Con.Close()
        If conMPCS2.State = 1 Then conMPCS2.Close()
    End Try
End Sub