

;;;
;;; Copyright  David B. Lamkins 1999
;;;
;;; Permission is granted for use, provided that author's name and copyright
;;; are retained in this file. The author makes no warranties with respect to
;;; this work.
;;;
;;; File: levenshtein.lisp
;;; Author: David B. Lamkins <dlamkins@teleport.com>
;;; Organization: NIL
;;; Created: 1999/01/29 00:31:20 (UTC-8)
;;; Implementation: Macintosh Common Lisp Version 4.2
;;; Target: same
;;; Purpose: Calculate Levenshtein distance between two strings.
;;; Keywords: string similarity
;;; Requires:
;;; Resources:
;;; Dependencies:
;;; References: See Description.
;;; Standards:
;;; Unit-Test: See examples at end of file.
;;; Limitations:
;;; To-Do:
;;; Bugs: Code is ugly and conses too much.
;;;
;;; Description:
;;;
;;; Compute the Levenshtein distance between a pair of strings.
;;;
;;; I no longer have the article in which I found this algorithm,
;;; but it appears to find the minimal-cost sequence of edits that
;;; can be applied to both strings to make them identical.  In
;;; other words, this algorithm does not assume that one string
;;; is "correct". Edits can be any of match (no change), insert,
;;; delete, or substitute. Each kind of edit has an associated
;;; cost.
;;;
;;; Despite the fact that neither string is presumed "authoritative",
;;; the Levenshtein distance metric is surprisingly useful for
;;; qualifying fuzzy matches from a dictionary; see the DWIM lookup
;;; examples at the end of the file.
;;;
;;; This code was translated from a THINK Pascal adaptation of a
;;; C algorithm I found in the article "Finding String Distances",
;;; Ray Valds, Dr. Dobbs Journal, April 1992, ppg. 56< 62, 107.
;;;
;;; This was a "reasonably" straightforward translation from the
;;; THINK Pascal source, which was itself somewhat obfuscated by
;;; a lot of idioms for managing dynamic storage on the Mac heap.
;;; I could probably make the code more readable if I built a Lisp
;;; version from scratch, but that would mean that I'd have to
;;; dig up the original references, and I don't have the inclination
;;; to do that just now.
;;;
;;; Algorithm due to V.I. Levenshtein, as presented in "Time Warps,
;;; String Edits, and MacroMolecules: The Theory and Practice of
;;; Sequence Comparison", Sankoff and Kruskal, eds., AddisonWesley,
;;; 1983.


#+cltl2(in-nuprl-package)
#-cltl2(in-package *nuprl-package-name*)


(defstruct (lev-op (:print-function lev-op-printer))
  index1
  index2
  op)

(defun lev-op-printer (object stream depth)
  (declare (ignore depth))
  (print-unreadable-object (object stream :type t)
    (format stream "~D ~D ~A"
            (lev-op-index1 object)
            (lev-op-index2 object)
            (lev-op-op object))))

(defstruct matrix-cell
  distance
  op)

(defstruct move
  row-dist
  col-dist)

(defvar *levenshtein-matrix* nil)
;;(defvar *lin-levenshtein-matrix* (make-array (* 30 30)
;;					     :displaced-to *levenshtein-matrix*))
(defstruct levenshtein-matrix
  max-rows
  max-cols
  rows
  cols
  array)

(defun levenshtein-matrix-init (m r c)
  (let ((mm (if (and m
		     (<= r (levenshtein-matrix-max-rows m))
		     (<= c (levenshtein-matrix-max-cols m)))
		(progn
		  (setf (levenshtein-matrix-rows m) r
			(levenshtein-matrix-cols m) c)
		  m)
		(make-levenshtein-matrix :array (make-array (* r c))
					 :max-rows r
					 :max-cols c
					 :rows r
					 :cols c))))
    
    (let ((a (levenshtein-matrix-array mm)))

      (dotimes (j (1- c))
	(let ((i (1+ j)))
	  (let ((mc (aref a i)))
	    (if mc
		(setf (matrix-cell-distance mc) i
		      (matrix-cell-op mc) 'insert)
		(setf (aref a i)
		      (make-matrix-cell :distance i
					:op 'insert))))))	    
      (dotimes (j r)
	(let ((i (* j c)))
	  ;;(format t "~a " i)
	  (let ((mc (aref a i)))
	    (if mc
		(setf (matrix-cell-distance mc) j
		      (matrix-cell-op mc) 'delete)
		(setf (aref a i)
		      (make-matrix-cell :distance j
					:op 'delete))))))
      mm)))

(defun levenshtein-matrix-llookup (m i)
  (aref (levenshtein-matrix-array m) i))

(defun levenshtein-matrix-lookup (m r c)
  (levenshtein-matrix-llookup m (+ c (* r (levenshtein-matrix-cols m)))))

(defun levenshtein-matrix-lset-cell  (m i d op)
  (let ((a (levenshtein-matrix-array m)))
    (let ((mc (aref a i)))
      (if mc
	  (setf (matrix-cell-distance mc) d
		(matrix-cell-op mc) op)
	  (setf (aref a i)
		(make-matrix-cell :distance d
				  :op op))))))

(defun levenshtein-matrix-set-cell (m r c d op)
  (levenshtein-matrix-lset-cell m (+ c (* r (levenshtein-matrix-cols m))) d op))

(let ((moves (list
              (cons 'match (make-move :row-dist -1
                                      :col-dist -1))
              (cons 'insert (make-move :row-dist 0
                                       :col-dist -1))
              (cons 'delete (make-move :row-dist -1
                                       :col-dist 0))
              (cons 'substitute (make-move :row-dist -1
                                           :col-dist -1)))))
  (defun orig-levenshtein-string-distance (string-1 string-2
                                               &key
                                               (match-cost 0)
                                               (insert-cost 1)
                                               (delete-cost 1)
                                               (substitute-cost 2))
    "Return the Levenshtein distance, number of edits, and the
edit sequence between STRING-1 and STRING-2. Edit costs can be
specified via :MATCH-COST, :INSERT-COST, :DELETE-COST and
:SUBSTITUTE-COST keyword arguments."
    (declare (optimize (speed 3)))
    (let* ((num-rows (1+ (length string-1)))
           (num-cols (1+ (length string-2)))
           (matrix (make-array (list num-rows num-cols))))
      ;; initialize matrix
      (do ((i 1 (1+ i)))
          ((= i num-cols))
        (setf (aref matrix 0 i)
              (make-matrix-cell :distance i
                                :op 'insert)))
      (do ((i 0 (1+ i)))
          ((= i num-rows))
        (setf (aref matrix i 0)
              (make-matrix-cell :distance i
                                :op 'delete)))
      ;; calculate matrix
      (let ((linearized-matrix (make-array (* num-rows num-cols)
                                           :displaced-to matrix))
            (c (1+ num-cols))
            (n 1)
            (w num-cols)
            (nw 0)
            (op-costs `((match . ,match-cost)
                        (insert . ,insert-cost)
                        (delete . ,delete-cost)
                        (substitute . ,substitute-cost))))
        (labels ((advance-indices ()
                   (incf c)
                   (incf n)
                   (incf w)
                   (incf nw))
                 (op-cost (op)
                   (cdr (assoc op op-costs)))
                 (distance (index)
                   (matrix-cell-distance (aref linearized-matrix index)))
                 (set-c-entry (ref-index op)
                   (setf (aref linearized-matrix c)
                         (make-matrix-cell :distance (+ (distance ref-index)
                                                        (op-cost op))
                                           :op op))))
          (do ((row 0 (1+ row)))
              ((= row (1- num-rows)))
            (do ((col 0 (1+ col)))
                ((= col (1- num-cols)))
              ;; calculate cell
              (cond ((< (distance w) (distance n))
                     (cond ((< (distance w) (distance nw))
                            (set-c-entry w 'insert))
                           ((char= (char string-1 row) (char string-2 col))
                            (set-c-entry nw 'match))
                           (t
                            (set-c-entry nw 'substitute))))
                    ((< (distance n) (distance nw))
                     (set-c-entry n 'delete))
                    ((char= (char string-1 row) (char string-2 col))
                     (set-c-entry nw 'match))
                    (t
                     (set-c-entry nw 'substitute)))
              ;; advance indices - inner
              (advance-indices))
            ;; advance indices - outer
            (advance-indices))))
      ;; backtrack matrix
      (let ((distance (matrix-cell-distance
                       (aref matrix (1- num-rows) (1- num-cols))))
            (row (1- num-rows))
            (col (1- num-cols))
            (edit-count 0)
            (ops ()))
        (loop
	 (unless (and (not (zerop row))
		      (not (zerop col)))
	   (return-from orig-levenshtein-string-distance
	     (values distance edit-count (nreverse ops))))
	 (let* ((which-op (matrix-cell-op (aref matrix row col)))
		(move (cdr (assoc which-op moves))))
	   (unless (eq which-op 'match)
	     (incf edit-count)
	     (push (make-lev-op :index1 (1- row)
				:index2 (1- col)
				:op which-op)
		   ops))
	   (incf row (move-row-dist move))
	   (incf col (move-col-dist move))))))))

(let ((moves (list
              (cons 'match (make-move :row-dist -1
                                      :col-dist -1))
              (cons 'insert (make-move :row-dist 0
                                       :col-dist -1))
              (cons 'delete (make-move :row-dist -1
                                       :col-dist 0))
              (cons 'substitute (make-move :row-dist -1
                                           :col-dist -1)))))

  
  (defun levenshtein-string-distance (string-1 string-2 max-edits
                                               &key
                                               (match-cost 0)
                                               (insert-cost 1)
                                               (delete-cost 1)
                                               (substitute-cost 2))
    "Return the Levenshtein distance, number of edits, and the
edit sequence between STRING-1 and STRING-2. Edit costs can be
specified via :MATCH-COST, :INSERT-COST, :DELETE-COST and
:SUBSTITUTE-COST keyword arguments."
    (declare (optimize (speed 3)))

    (let* ((num-rows (1+ (length string-1)))
           (num-cols (1+ (length string-2))))

      ;;(when (> num-rows (array-dimension *LEVENSHTEIN-MATRIX* 0))
      ;;(setf *LEVENSHTEIN-MATRIX* (make-array (list num-rows num-cols))))

      (setf *levenshtein-matrix* (levenshtein-matrix-init *levenshtein-matrix* num-rows num-cols))

      ;; calculate matrix
      (let ((c (1+ num-cols))
	    (n 1)
	    (w num-cols)
	    (nw 0)
	    (op-costs `((match . ,match-cost)
			(insert . ,insert-cost)
			(delete . ,delete-cost)
			(substitute . ,substitute-cost))))

	(labels ((advance-indices ()
		   (incf c)
		   (incf n)
		   (incf w)
		   (incf nw))
		 (op-cost (op)
		   (cdr (assoc op op-costs)))
		 (distance (index)
		   (matrix-cell-distance (levenshtein-matrix-llookup *levenshtein-matrix* index)))
		 (set-c-entry (ref-index op)
		   (levenshtein-matrix-lset-cell *levenshtein-matrix* c
						 (+ (distance ref-index) (op-cost op))
						 op)))
	  (do ((row 0 (1+ row)))
	      ((= row (1- num-rows)))
	    (do ((col 0 (1+ col)))
		((= col (1- num-cols)))
	      ;; calculate cell
	      (cond ((< (distance w) (distance n))
		     (cond ((< (distance w) (distance nw))
			    (set-c-entry w 'insert))
			   ((char= (char string-1 row) (char string-2 col))
			    (set-c-entry nw 'match))
			   (t
			    (set-c-entry nw 'substitute))))
		    ((< (distance n) (distance nw))
		     (set-c-entry n 'delete))
		    ((char= (char string-1 row) (char string-2 col))
		     (set-c-entry nw 'match))
		    (t
		     (set-c-entry nw 'substitute)))
	      ;; advance indices - inner
	      (advance-indices))
	    ;; advance indices - outer
	    (advance-indices))))
 
	;; backtrack matrix
	(let ((distance (matrix-cell-distance (levenshtein-matrix-lookup *levenshtein-matrix* (1- num-rows) (1- num-cols))))
	      (row (1- num-rows))
	      (col (1- num-cols))
	      (edit-count 0)
	      (ops ()))
	  (loop
	   (when (and  max-edits (> edit-count max-edits))
	     (return-from levenshtein-string-distance edit-count))
	   (unless (and (not (zerop row))
			(not (zerop col)))
	     (return-from levenshtein-string-distance
	       (values distance edit-count (nreverse ops))))
	   (let* ((which-op (matrix-cell-op (levenshtein-matrix-lookup *levenshtein-matrix* row col)))
		  (move (cdr (assoc which-op moves))))
	     (unless (eq which-op 'match)
	       (incf edit-count)
	       (push (make-lev-op :index1 (1- row)
				  :index2 (1- col)
				  :op which-op)
		     ops))
	     (incf row (move-row-dist move))
	     (incf col (move-col-dist move))))))))))

#|
;; Examples

? (levenshtein-string-distance "which" "witch")
2
2
(#<LEV-OP 2 2 INSERT> #<LEV-OP 1 0 DELETE>)
? (levenshtein-string-distance "wchih" "which")
2
2
(#<LEV-OP 3 3 INSERT> #<LEV-OP 1 0 DELETE>)
? (levenshtein-string-distance "foobar" "fubar")
3
2
(#<LEV-OP 2 1 SUBSTITUTE> #<LEV-OP 1 0 DELETE>)
? (levenshtein-string-distance "document" "documet")
1
1
(#<LEV-OP 6 5 DELETE>)
? (levenshtein-string-distance "caliber" "calibre")
2
2
(#<LEV-OP 6 6 DELETE> #<LEV-OP 4 5 INSERT>)
|#

#|
;;; Example code for DWIM lookup

;;; Here are a couple of functions to lookup fuzzy matches for a
;;; given symbol or string, printing a list of candidates from
;;; either the internal and external symbols of *PACKAGE*, or from
;;; just the external symbols of the COMMON-LISP package.  With
;;; suitable (nonportable) hooks and minor modifications to return
;;; lists instead of printing results, these functions could form
;;; the basis of a DWIM facility.

;;; An iMac running MCL 4.2 will compute distances at rates ranging
;;; from approximately 2,000 symbols per second (for short symbols)
;;; to 500 symbols per second (for longer symbols).  These rates
;;; are fast enough for a DWIM lookup if you restrict lookup to
;;; about 1,000 symbols, as in DWIM-LOOKUP-EXTERNAL-CL-SYMBOLS.
;;; But my CL-USER package contains about 5,000 internal and external
;;; symbols, so more finesse is required to get acceptable performance
;;; in DWIM-LOOKUP-SYMBOLS; the total time is reduced by requiring that
;;; the difference in lengths between the two symbols is below some
;;; threshold before calling LEVENSHTEIN-STRING-DISTANCE.  A greater
;;; mismatch in string lengths would result in large distances anyway,
;;; so it makes sense to avoid the expensive test when the cheap test
;;; can rule out a potential candidate.

(defparameter *dwim-lookup-cutoff* 5
  "All DWIM matches will have a Levenshtein distance less than this value.")
(defparameter *dwim-length-cutoff* 3
  "Max difference in symbol lengths when searching *PACKAGE* for DWIM
matches. Not used when searching COMMON-LISP package.")

(defun dwim-lookup-symbols (symbol-or-string)
  (let ((symbol-string (string-upcase (string symbol-or-string)))
        (count 0)
        (candidates 0))
    (do-symbols (sym)
      (incf count)
      (let ((candidate-string (string sym)))
        (when (and (<= (abs (- (length candidate-string)
                               (length symbol-string)))
                      *dwim-length-cutoff*)
                   (incf candidates)
                   (< (levenshtein-string-distance symbol-string (string
sym))
                      *dwim-lookup-cutoff*))
          (print sym))))
    (format t "~&In ~:D symbols, examined ~:D candidates having lengths +/-
~D.~%"
            count candidates *dwim-length-cutoff*))
  (values))

(defun dwim-lookup-external-cl-symbols (symbol-or-string)
  (let ((symbol-string (string-upcase (string symbol-or-string)))
        (count 0))
    (do-external-symbols (sym :cl)
      (incf count)
      (when (< (levenshtein-string-distance symbol-string (string sym))
               *dwim-lookup-cutoff*)
        (print sym)))
    (format t "~&Examined ~:D symbols.~%" count))
  (values))
|#

#|
;;; Examples for DWIM lookup

? (dwim-lookup-symbols "levelstein-stng-distance")

LEVENSHTEIN-STRING-DISTANCE
In 5,241 symbols, examined 534 candidates with lengths +/- 3.
? (dwim-lookup-external-cl-symbols "do-internal-symbols")

DO-EXTERNAL-SYMBOLS
Examined 980 symbols.
? (dwim-lookup-external-cl-symbols "package-used-list")

PACKAGE-USED-BY-LIST
PACKAGE-USE-LIST
Examined 980 symbols.
|#
