Check out this Cluster Analysis WhizzML script "Find neighbors"

Find neighbors | BigML.com

FREE

Find neighbors

Details:

Created	Wed, 27 Jul 2016 07:03:50 +0000
Published	Wed, 27 Jul 2016 11:10:44 +0000

Description:

This script takes as inputs a cluster identifier, an instance p (i.e., a map with values for all fields used by the cluster), and a positive count n. It then:

Finds the centroid in the cluster closest to the given instance p
Selects within that centroid's dataset the n instances that are closest to p
If there are fewer than n rows in the centroid's dataset, the remaining instances are read from the next closest centroid.

This workflow uses Flatline to compute the squared distance between p and each row of a centroid's dataset (via the row-distance-squared Flatline function), adds that distance as an extra column to the dataset, and then creates a sample of the result, ordered by the computed distance.

The input instance can be specified using either field identifiers or field names.

Tags:

Clusters Nearest neighbor

URL:

https://bigml.com/user/whizzml/gallery/script/57985cd6af447f1f4e001668

Script FREE

Source code

;; Given a cluster and a point, find the closest n
;; rows in the cluster's dataset(s).

;; Helper: returns the cluster's effective (no-summary) fields, i.e. the
;; sub-map of the field descriptors stored under ["clusters" "fields"]
;; restricted to the field ids that appear as keys of the cluster's
;; "scales" map.  Ids present in "scales" but without a descriptor are
;; skipped.
(define (cluster-fields cluster)
  (let (descriptors (cluster ["clusters" "fields"] {})
        scaled-ids (keys (cluster "scales"))
        ;; Reduction step: copy the descriptor for `id` when there is one.
        keep (lambda (acc id)
               (if (contains? descriptors id)
                 (assoc acc id (descriptors id))
                 acc)))
    (reduce keep {} scaled-ids)))

;; Given a cluster and an instance with the same fields as its dataset,
;; order the cluster's centroids by increasing distance to the instance.
;;
;; Each element of the result is the centroid's "center" map extended
;; with two keys: "distance" (the value of `row-distance-squared`
;; between the center and the instance) and "id" (the centroid id).
;; Centroids whose distance cannot be computed are logged and left out
;; of the result.
(define (ordered-centroids cluster instance)
  (let (descs (cluster-fields cluster)
        weights (cluster "scales" [])
        cs (map (lambda (cn)
                  (let (c (cn "center")
                        d (handle (lambda (e)
                                    (log-error "Could not compute distance "
                                               "between " c " and " instance)
                                    (log-error e)
                                    ;; Return an explicit failure marker so
                                    ;; the sentinel below does not depend on
                                    ;; whatever `log-error` returns.
                                    false)
                            (row-distance-squared descs weights c instance)))
                    ;; -1 marks a centroid whose distance computation failed.
                    (assoc c "distance" (or d -1) "id" (cn "id"))))
                (cluster ["clusters" "clusters"]))
        ;; Drop only the failed centroids (negative sentinel).  A distance
        ;; of exactly 0 is valid -- the instance coincides with the
        ;; centroid -- and that centroid must stay, since it is the closest.
        cs (filter (lambda (c) (>= (c "distance" -1) 0)) cs))
    (sort-by-key "distance" cs)))

;; Error-signaling helper: raises an error map with code -1 whose
;; message names the missing input field `id`.
(define (raise-missing id)
  (let (msg (str "Missing input field: " id))
    (raise {"message" msg "code" -1})))

;; Builds the Flatline expression that, evaluated on a dataset row,
;; yields `row-distance-squared` between the given instance and that
;; row.  The expression covers the fields returned by `cluster-fields`
;; and weights each of them with its entry in the cluster's "scales"
;; map (1 when the field has no entry).  Raises via `raise-missing`
;; when the instance lacks a value for one of those fields.
(define (distance-flatline cluster instance)
  ;; NOTE: `ps`, `ids` and `ws` are interpolated by name in the
  ;; template below, so those three bindings must keep their names.
  (let (ids (keys (cluster-fields cluster))
        value-of (lambda (id)
                   (let (v (instance id false))
                     (if v v (raise-missing id))))
        ps (map value-of ids)
        weights (cluster "scales" {})
        weight-of (lambda (id) (weights id 1))
        ws (map weight-of ids))
    (flatline "(row-distance-squared (list @{{ps}})"
              "                      (fields @{{ids}})"
              "                      (list @{{ws}}))")))

;; Given a cluster, one of its centroids and the Flatline expression
;; produced by `distance-flatline`, creates a new dataset that extends
;; the centroid's dataset with a "distance" column.
;;
;; The centroid's dataset is taken from the cluster's
;; "cluster_datasets" entry for the centroid id when that entry is
;; present and non-empty; otherwise it is created from the cluster and
;; the centroid.  Raises when the centroid has no "id".
(define (generate-distance-dataset cluster cent fl)
  (let (cluster-id (cluster "resource")
        centroid-id (or (cent "id" false) (raise (str "No id in " cent)))
        stored (cluster ["cluster_datasets" centroid-id] false)
        origin (if (and stored (not (empty? stored)))
                 (str "dataset/" stored)
                 (create-and-wait-dataset {"cluster" cluster-id
                                           "centroid" centroid-id}))
        distance-field {"name" "distance" "field" fl})
    (create-and-wait-dataset {"origin_dataset" origin
                              "new_fields" [distance-field]})))

;; Given an extended centroid dataset (as created by
;; `generate-distance-dataset`), returns up to `n` of its rows, read
;; from a temporary sample ordered by the dataset's objective field.
;; That field is expected to be the added distance column, so the rows
;; returned are the ones closest to the instance.  The sample is
;; deleted before returning.
(define (fetch-dataset-instances ds-id n)
  (let (sample-id (create-and-wait-sample {"dataset" ds-id})
        order-field (dataset-get-objective-id ds-id)
        query {"row_order_by" order-field
               "rows" n
               "mode" "linear"}
        closest ((fetch sample-id query) ["sample" "rows"] []))
    (delete sample-id)
    closest))

;; Final workflow: walks the cluster's centroids from closest to
;; farthest from `instance`, collecting rows from each centroid's
;; distance-extended dataset until `n` rows have been gathered or no
;; centroids remain.  Returns the list of collected rows.
(define (find-neighbors cluster-id instance n)
  (let (cluster (fetch cluster-id)
        fl (distance-flatline cluster instance)
        centroids (ordered-centroids cluster instance))
    (loop (found [] pending centroids)
      (if (or (<= n (count found)) (empty? pending))
        found
        (let (ds-id (generate-distance-dataset cluster (head pending) fl)
              ;; Only ask for the rows still needed to reach n.
              wanted (- n (count found))
              batch (fetch-dataset-instances ds-id wanted))
          (recur (concat found batch) (tail pending)))))))

;; Lets the input instance be specified with field names or field ids:
;; returns a copy of `instance` whose keys are all field ids of the
;; given cluster.  A key is resolved by scanning the cluster's fields:
;; it maps to the first field whose name equals it, or to itself when
;; it equals a field id.  Raises via `raise-missing` when a key matches
;; no field.
(define (translate-instance cluster-id instance)
  (let (descriptors ((fetch cluster-id) ["clusters" "fields"])
        resolve (lambda (name)
                  (loop (candidates (keys descriptors))
                    (cond (empty? candidates)
                          (raise-missing name)
                          (= name (descriptors [(head candidates) "name"] false))
                          (head candidates)
                          (= name (head candidates))
                          name
                          (recur (tail candidates)))))
        ids (map resolve (keys instance)))
    (make-map ids (values instance))))

;; Script output: the rows found by `find-neighbors` for the script
;; inputs `cluster-id`, `instance` and `n`, after the instance's keys
;; have been translated to field ids.
(define rows
  (let (p (translate-instance cluster-id instance))
    (find-neighbors cluster-id p n)))

Description

Inputs

Outputs

License: Creative Commons Zero V1.0
Script FREE

Find neighbors | BigML.com

Sending Request...

COMPANY

PRODUCT

BUSINESS

TRAINING

GALLERY

License

Embed this Script in your web site

COMPANY

PRODUCT

BUSINESS

TRAINING

GALLERY