import numpy as np
import pandas as pd
import sparkobj as spk
from sklearn.datasets import make_blobs
from sklearn.ensemble import IsolationForest
def train_forest_per_partition_map_step(partition):
    """Fit one IsolationForest on the rows of a single RDD partition.

    Intended for ``rdd.mapPartitions``: receives an iterator over the
    partition's rows and returns a one-element list so that exactly one
    fitted model is emitted per partition.

    Parameters
    ----------
    partition : iterator of row-like sequences
        Rows of shape (2,) — the two features of the dataset.

    Returns
    -------
    list[IsolationForest]
        A single fitted forest, wrapped in a list for mapPartitions.

    Raises
    ------
    ValueError
        If the materialized partition is not an (n, 2) array.
    """
    # Materialize first: the incoming iterator can only be consumed once,
    # so printing it directly (as the original did) shows nothing useful.
    rows = np.asarray(list(partition))
    # NOTE: this print runs on the executors, so it appears in the worker
    # stdout/logs, not on the driver console.
    print('partition')
    print(rows)
    # Explicit validation instead of `assert` (asserts are stripped under
    # `python -O`, and an empty partition would raise IndexError instead).
    if rows.ndim != 2 or rows.shape[1] != 2:
        raise ValueError(
            'expected an (n, 2) partition, got shape %s' % (rows.shape,))
    return [IsolationForest(n_estimators=100,
                            contamination=0.15,
                            random_state=666).fit(rows)]
def main():
    """Driver: build a toy 2-feature dataset, repartition it into 3 Spark
    partitions, fit one IsolationForest per partition (map step), and score
    the full dataset with each per-partition model (reduce step)."""
    spark = spk.getsparkobj()

    n_samples = 300
    outliers_fraction = 0.15
    n_outliers = int(outliers_fraction * n_samples)
    n_inliers = n_samples - n_outliers
    rng = np.random.RandomState(666)

    # NOTE(review): the original source line was truncated here ("skipping
    # some unrelevant"). Reconstructed as the standard sklearn anomaly-
    # detection setup: clustered inliers plus uniformly scattered outliers.
    # Confirm against the original intent.
    data = pd.DataFrame(
        data=np.concatenate([
            make_blobs(centers=[[0, 0]], cluster_std=0.5,
                       n_samples=n_inliers, random_state=666)[0],
            rng.uniform(low=-4, high=4, size=(n_outliers, 2)),
        ]),
        columns=["feat_1", "feat_2"])

    df = spark.createDataFrame(data=data)
    df = df.rdd.repartition(numPartitions=3).toDF()

    # Map step: one fitted IsolationForest per partition.
    forest = df.rdd.mapPartitions(f=train_forest_per_partition_map_step).collect()

    # BUG FIX: `collect()` returns a plain Python list, not an RDD, so the
    # Scala idiom `.foreach(println)` raises AttributeError here. The
    # Python equivalent is a plain loop with print:
    for row in df.rdd.collect():
        print(row)

    # Reduce step: score the whole dataset with each partition's forest.
    # Loop instead of hard-coding forest[0]/forest[1]/forest[2], so the
    # code survives a different partition count.
    for partition_forest in forest:
        partition_forest.decision_function(data)
# Standard script entry-point guard: run the driver only when executed
# directly, not when imported (e.g. by Spark workers unpickling functions).
if __name__ == '__main__':
    main()
执行分区后,是否可以通过函数 "train_forest_per_partition_map_step" 获取打印结果?我已经尝试过 df.rdd.collect().foreach(println),但是一直出现属性错误
AttributeError: 'list' object has no attribute 'foreach'
AttributeError Traceback (most recent call last)
in engine
1 if __name__ == '__main__':
----> 2 main()
<ipython-input-1-c5cff78d4b35> in main()
25
26 forest = df.rdd.mapPartitions(f=train_forest_per_partition_map_step).collect()
---> 27 lines = df.rdd.take(100).foreach(println)
28
29 # Reduce step: Combine scores from partitions.
AttributeError: 'list' object has no attribute 'foreach'
猜想这仅适用于scala,但想了解Python等效版本
答案 0(得分:0)
使用 show,collect,count
方法代替打印。这将使流程在那一刻执行
df.show()
df.filter("your_clause").collect()
df.count()
让我知道这是否是您想要的
答案 1(得分:0)
df.rdd.collect().foreach(println)
实际上是 Scala 代码。
df.rdd.collect()
返回列表,您是否尝试过print(df.rdd.collect())
?通常,一个人正在使用df.show()