我正在准备算法的输入。我的数据结构与示例数据显示出来是相同的,但 StopWordsRemover 不起作用。我使用了 printSchema(),发现两者的 schema 不同。这是 StopWordsRemover 无法正常工作的主要原因吗?
我不知道为什么StopWordsRemover不起作用
示例数据
代码
# Example input: six rows, each carrying one array<string> value for the
# single "items" column.
data_list = [
    [['r', 'z', 'h', 'k', 'p']],
    [['z', 'y', 'x', 'w', 'v', 'u', 't', 's']],
    [['s', 'x', 'o', 'n', 'r']],
    [['x', 'z', 'y', 'm', 't', 's', 'q', 'e']],
    [['z']],
    [['x', 'z', 'y', 'r', 'q', 't', 'p']],
]
data2 = spark.createDataFrame(data_list, ["items"])

# Inspect the raw frame: contents, column dtypes, and the nested schema.
data2.show()
print(data2.dtypes)
data2.printSchema()

print('***********remover-z************')
# Filter the token 'z' out of "items" and store the result in "input".
remover = StopWordsRemover(stopWords=['z'], inputCol='items', outputCol='input')
data_input = remover.transform(data2)
data_input.show()
data_input.printSchema()
打印
+--------------------+
| items|
+--------------------+
| [r, z, h, k, p]|
|[z, y, x, w, v, u...|
| [s, x, o, n, r]|
|[x, z, y, m, t, s...|
| [z]|
|[x, z, y, r, q, t...|
+--------------------+
[('items', 'array<string>')]
root
|-- items: array (nullable = true)
| |-- element: string (containsNull = true)
***********remover-z************
+--------------------+--------------------+
| items| input|
+--------------------+--------------------+
| [r, z, h, k, p]| [r, h, k, p]|
|[z, y, x, w, v, u...|[y, x, w, v, u, t...|
| [s, x, o, n, r]| [s, x, o, n, r]|
|[x, z, y, m, t, s...|[x, y, m, t, s, q...|
| [z]| []|
|[x, z, y, r, q, t...| [x, y, r, q, t, p]|
+--------------------+--------------------+
root
|-- items: array (nullable = true)
| |-- element: string (containsNull = true)
|-- input: array (nullable = true)
| |-- element: string (containsNull = true)
我的数据
代码
# Apply the same stop-word removal to the real data, this time filtering 'o'.
# NOTE(review): in the printed result the arrays render as [o,b,o,d,d,e,o]
# with no space after the commas, unlike the example output ([r, z, h, k, p]).
# Presumably each array holds a single string "o,b,o,d,d,e,o" rather than
# separate one-letter tokens, so nothing matches 'o' -- confirm against how
# data_new is built.
data_new.show()
print(data_new.dtypes)

add_stopwords = ['o']
remover = StopWordsRemover(stopWords=add_stopwords, inputCol='items', outputCol='input')
data_input = remover.transform(data_new)
data_input.show()
打印
+---------------+---------------+
| items| input|
+---------------+---------------+
|[o,b,o,d,d,e,o]|[o,b,o,d,d,e,o]|
|[o,b,o,d,d,e,o]|[o,b,o,d,d,e,o]|
|[o,b,o,d,d,e,o]|[o,b,o,d,d,e,o]|
|[o,b,o,d,d,e,o]|[o,b,o,d,d,e,o]|
|[o,b,o,d,d,e,o]|[o,b,o,d,d,e,o]|
|[o,b,o,d,d,e,o]|[o,b,o,d,d,e,o]|
|[o,b,o,d,d,e,o]|[o,b,o,d,d,e,o]|
|[o,b,o,d,d,e,o]|[o,b,o,d,d,e,o]|
|[o,b,o,d,d,e,o]|[o,b,o,d,d,e,o]|
|[o,b,o,d,d,e,o]|[o,b,o,d,d,e,o]|
|[o,o,c,o,o,e,g]|[o,o,c,o,o,e,g]|
|[o,b,o,d,d,e,o]|[o,b,o,d,d,e,o]|
|[o,b,o,d,d,e,o]|[o,b,o,d,d,e,o]|
|[o,b,o,d,d,e,o]|[o,b,o,d,d,e,o]|
|[o,b,o,d,d,e,o]|[o,b,o,d,d,e,o]|
|[o,b,o,d,d,e,o]|[o,b,o,d,d,e,o]|
|[o,b,o,d,d,e,o]|[o,b,o,d,d,e,o]|
|[o,b,o,d,d,e,o]|[o,b,o,d,d,e,o]|
|[o,b,o,d,d,e,o]|[o,b,o,d,d,e,o]|
|[o,b,o,d,d,e,o]|[o,b,o,d,d,e,o]|
+---------------+---------------+
root
|-- items: array (nullable = false)
| |-- element: string (containsNull = false)
预先感谢