Searching for an instance in a DataFrame with a PySpark filter takes too much time

Time: 2017-09-15 13:16:31

Tags: search dataframe filter pyspark instance

I have a DataFrame with N attributes (Atr1, Atr2, Atr3, ..., AtrN) and a single instance that has the same attributes [1..N-1], i.e. everything except the N-th one.

I want to check whether the DataFrame contains any instance whose attributes [1..N-1] have the same values. If such an instance exists, my goal is to retrieve it from the DataFrame with all of its attributes [1..N].

For example, if I have:

Instance:

[Row(Atr1=u'A', Atr2=u'B', Atr3=24)]

Dataframe:

+------+------+------+------+
| Atr1 | Atr2 | Atr3 | Atr4 |
+------+------+------+------+
|  'C' |  'B' |  21  |  'H' |
+------+------+------+------+
|  'D' |  'B' |  21  |  'J' |
+------+------+------+------+
|  'E' |  'B' |  21  |  'K' |
+------+------+------+------+
|  'A' |  'B' |  24  |  'I' |
+------+------+------+------+

I would like to get the 4th row of the DataFrame, including the value of Atr4.

I tried using the filter() method like this:

df.filter("Atr1 = 'C' and Atr2 = 'B', and Atr3 = 24").take(1)

I get the result I want, but it takes too much time.

So my question is: is there a way to do the same thing in less time?

Thanks!

1 answer:

Answer 0 (score: 1)

You can use locality-sensitive hashing (MinHashLSH) to find the nearest neighbor and then check whether it is the same instance or not.

Since your data contains strings, you need to process it before applying LSH. We will use the feature module of pyspark.ml, starting with StringIndexer and OneHotEncoder.


from pyspark.ml.feature import StringIndexer, OneHotEncoder

df = spark.createDataFrame([('C','B',21,'H'),('D','B',21,'J'),('E','c',21,'K'),('A','B',24,'J')],
                           ["attr1","attr2","attr3","attr4"])

# index each string column, then one-hot encode the index
for col_ in ["attr1","attr2","attr4"]:
    stringIndexer = StringIndexer(inputCol=col_, outputCol=col_+"_")
    model = stringIndexer.fit(df)
    df = model.transform(df)
    encoder = OneHotEncoder(inputCol=col_+"_", outputCol="features_"+col_, dropLast=False)
    df = encoder.transform(df)

# keep only the numeric column and the one-hot vectors
df = df.drop("attr1","attr2","attr4","attr1_","attr2_","attr4_")
df.show()


+-----+--------------+--------------+--------------+
|attr3|features_attr1|features_attr2|features_attr4|
+-----+--------------+--------------+--------------+
|   21| (4,[2],[1.0])| (2,[0],[1.0])| (3,[1],[1.0])|
|   21| (4,[0],[1.0])| (2,[0],[1.0])| (3,[0],[1.0])|
|   21| (4,[3],[1.0])| (2,[1],[1.0])| (3,[2],[1.0])|
|   24| (4,[1],[1.0])| (2,[0],[1.0])| (3,[0],[1.0])|
+-----+--------------+--------------+--------------+
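
The OneHotEncoder call above (transform without fit) matches the Spark 2.x API that was current when this answer was written. On Spark 3.x, OneHotEncoder is an estimator, so the loop body would need a fit step; a sketch of that variant, keeping the same column names:

# Spark 3.x variant: the encoder has to be fit before it can transform
encoder = OneHotEncoder(inputCol=col_+"_", outputCol="features_"+col_, dropLast=False)
df = encoder.fit(df).transform(df)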

Add an id and assemble all the feature vectors

from pyspark.sql.functions import monotonically_increasing_id
from pyspark.ml.feature import VectorAssembler

# add a unique id so rows can be identified after the LSH lookup
df = df.withColumn("id", monotonically_increasing_id())
df.show()

# combine the numeric column and all one-hot vectors into a single feature vector
assembler = VectorAssembler(inputCols=["features_attr1", "features_attr2", "features_attr4", "attr3"],
                            outputCol="features")
df_ = assembler.transform(df)
df_ = df_.select("id", "features")
df_.show()


+----------+--------------------+
|        id|            features|
+----------+--------------------+
|         0|(10,[2,4,7,9],[1....|
|         1|(10,[0,4,6,9],[1....|
|8589934592|(10,[3,5,8,9],[1....|
|8589934593|(10,[1,4,6,9],[1....|
+----------+--------------------+

Create the MinHashLSH model and search for the nearest neighbor

from pyspark.ml.feature import MinHashLSH

mh = MinHashLSH(inputCol="features", outputCol="hashes", seed=12345)
model = mh.fit(df_)
model.transform(df_)

# use the feature vector of the instance you are looking for as the key;
# here the first row of df_ is taken just as an example
key = df_.select("features").collect()[0]["features"]
model.approxNearestNeighbors(df_, key, 1).collect()
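
As a follow-up (not part of the original answer), one way to confirm that the returned neighbor really is an identical instance is to compare its assembled feature vector with the query key; a minimal sketch:

# verify that the nearest neighbour is an exact match on the assembled features
neighbors = model.approxNearestNeighbors(df_, key, 1)
nearest = neighbors.collect()[0]
if nearest["features"] == key:
    print("Exact match found, id =", nearest["id"])
else:
    print("No identical instance in the DataFrame")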