I am looking for a way to cluster a feature set with the DBSCAN algorithm in TensorFlow, but I am unable to find anything related to it. TensorFlow offers K-Means clustering (tf.contrib.learn.KMeansClustering), but I need the DBSCAN algorithm.
Can anyone suggest an existing wrapper written in Python/Java? Any pointers on how to implement it from scratch?
P.S. I am aware that sklearn and similar libraries have DBSCAN, but I specifically need it in TensorFlow.
Answer 0 (score: 1)
I know I am a year late, but for future reference: here is my implementation of a DBSCAN-like algorithm. Its results may differ slightly from an implementation such as sklearn's, especially for observations that could belong to more than one cluster. I know it is probably not optimal, and that TF is not the best choice for implementing this algorithm, but maybe someone will find the code valuable.
Relevant code:
import tensorflow as tf
import numpy as np
def run(vals, epsilon=4, min_points=4):

    def merge_core_points_into_clusters(elems):
        # One expansion step: union this row with every neighborhood row
        # that shares at least one core point with it.
        row = elems
        mat = core_points_connection_matrix
        nonempty_intersection_inds = tf.where(tf.reduce_any(tf.logical_and(row, mat), axis=1))
        cumul = tf.logical_or(row, mat)
        subcumul = tf.gather_nd(cumul, nonempty_intersection_inds)
        return tf.reduce_any(subcumul, axis=0)

    def label_clusters(elems):
        # Use the smallest member index of a merged row as its provisional label.
        return tf.reduce_min(tf.where(elems))

    def get_subsets_for_labels(elems):
        # For one label, union the eps-neighborhoods of all points carrying it.
        val = elems[0]
        labels = elems[1]
        conn = relation_matrix

        inds = tf.where(tf.equal(labels, val))
        masks = tf.gather_nd(conn, inds)
        return tf.reduce_any(masks, axis=0)

    def scatter_labels(elems):
        # Write one label onto every point selected by its mask.
        label = tf.expand_dims(elems[0], 0)
        mask = elems[1]
        return label * tf.cast(mask, dtype=tf.int64)

    data_np = np.array(vals)

    eps = epsilon
    min_pts = min_points

    in_set = tf.placeholder(tf.float64)

    # pairwise distance matrix via ||a - b||^2 = ||a||^2 - 2*a.b + ||b||^2
    r = tf.reduce_sum(in_set * in_set, 1)

    # turn r into a column vector
    r = tf.reshape(r, [-1, 1])

    dist_mat = tf.sqrt(r - 2 * tf.matmul(in_set, tf.transpose(in_set)) + tf.transpose(r))

    # for every point, show which points are within eps distance of it
    # (including the point itself)
    relation_matrix = dist_mat <= eps

    # number of points within the eps-ball of each point
    num_neighbors = tf.reduce_sum(tf.cast(relation_matrix, tf.int64), axis=1)

    # for each point, show whether it is a core point
    core_points_mask = num_neighbors >= min_pts

    # indices of core points
    core_points_indices = tf.where(core_points_mask)

    # keep only connections whose endpoint is a core point
    core_points_connection_matrix = tf.cast(core_points_mask, dtype=tf.int64) * tf.cast(relation_matrix, dtype=tf.int64)
    core_points_connection_matrix = tf.cast(core_points_connection_matrix, dtype=tf.bool)
    core_points_connection_matrix = tf.logical_and(core_points_connection_matrix, core_points_mask)

    merged = tf.map_fn(
        merge_core_points_into_clusters,
        core_points_connection_matrix,
        dtype=tf.bool
    )

    nonempty_clusters_records = tf.gather_nd(merged, core_points_indices)

    marked_core_points = tf.map_fn(label_clusters, nonempty_clusters_records, dtype=tf.int64)

    # renumber the provisional labels as 1, 2, ...; 0 is left for noise
    _, labels_core_points = tf.unique(marked_core_points, out_idx=tf.int64)

    labels_core_points = labels_core_points + 1

    unique_labels, _ = tf.unique(labels_core_points)

    # scatter the core-point labels back onto the full point set
    labels_all = tf.scatter_nd(
        tf.cast(core_points_indices, tf.int64),
        labels_core_points,
        shape=tf.cast(tf.shape(core_points_mask), tf.int64)
    )

    # for each label, build a mask over the points that should receive it
    ul_shape = tf.shape(unique_labels)
    labels_tiled = tf.maximum(tf.zeros([ul_shape[0], 1], dtype=tf.int64), labels_all)

    labels_subsets = tf.map_fn(
        get_subsets_for_labels,
        (unique_labels, labels_tiled),
        dtype=tf.bool
    )

    final_labels = tf.map_fn(
        scatter_labels,
        elems=(tf.expand_dims(unique_labels, 1), labels_subsets),
        dtype=tf.int64
    )

    final_labels = tf.reduce_max(final_labels, axis=0)

    with tf.Session() as sess:
        results = sess.run(final_labels, feed_dict={in_set: data_np}).reshape((1, -1))

    results = results.reshape((-1, 1))

    return results
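The answer does not show a call site, so here is a minimal usage sketch, assuming TF 1.x. The toy points and the epsilon/min_points values are illustrative assumptions, not part of the original answer:

# Hypothetical usage: two well-separated groups of three points each.
# The data and parameters below are assumptions for illustration only.
points = [
    [0.0, 0.0], [0.5, 0.0], [0.0, 0.5],        # group A
    [10.0, 10.0], [10.5, 10.0], [10.0, 10.5],  # group B
]

labels = run(points, epsilon=1.0, min_points=3)
print(labels.ravel())  # e.g. [1 1 1 2 2 2]; 0 would mark noise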
Answer 1 (score: 0)
A good DBSCAN implementation needs index acceleration for performance. Since TensorFlow has no search index structures, it is not a particularly good choice for clustering. Use a dedicated clustering library such as ELKI for this purpose instead; the sketch below shows what index-backed neighborhood queries look like.
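For what index acceleration means in practice, here is a minimal sketch using sklearn (named only because the asker already knows it); the data and the eps/min_samples values are illustrative assumptions:

import numpy as np
from sklearn.cluster import DBSCAN

# Illustrative data; eps/min_samples are toy values, not recommendations.
X = np.random.RandomState(0).rand(1000, 2)

# algorithm='ball_tree' (or 'kd_tree') serves the eps-neighborhood queries
# from a spatial index instead of a brute-force O(n^2) distance matrix.
labels = DBSCAN(eps=0.05, min_samples=5, algorithm='ball_tree').fit_predict(X)
print(labels)  # -1 marks noise, the "N" in DBSCAN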
P.S. It is DBSCAN, not DB-Scan. The N stands for "noise".
P.P.S. This is a site for questions about how to write code. Library recommendations are off-topic.