
;;;************************************************************************
;;;                                                                       *
;;;    Copyrighted Cornell University 2003                                *
;;;                                                                       *
;;;                                                                       *
;;;                Formal Digital Library System                          *
;;;                ------------------------------                         *
;;;                                                                       *
;;;   Developed by the FDL group, Department of Computer Science,         *
;;;   Cornell University, Ithaca NY.  See the release notes for a list    *
;;;   of the members of the group.                                        *
;;;                                                                       *
;;;   Permission is granted to use and modify FDL provided this notice    *
;;;   is retained in derived works.                                       *
;;;                                                                       *
;;;                                                                       *
;;;************************************************************************

#+cltl2(in-system-package)
#-cltl2(in-package *system-package-name*)

#-dontinline
(eval-when (compile)
  (proclaim '(inline
	      ml-next ml-eofp ml-line-count
	      llterpri llprins llprinpmsg llprin1
	      )))


;;;;	RLE TODO : update ml doc about escapes and what they mean, alsoe unicode escapes
;;;;	RLE TODO : and also removal of digit escapes ???

;;;;	
;;;;	Escapes in tokens and strings to allow insertion of Whitespace:
;;;;	
;;;;	`n` `N` : newline
;;;;	`l` `L` : linefeed
;;;;	`r` `R` : return
;;;;	`s` `S` : space
;;;;	`t` `T` : tab
;;;;	
;;;;	  * Scanner maps escape sequences to proper characters.
;;;;	  * newline, linefeed, and return share same internal representation.
;;;;
;;;;	If not 16bit-chars then embedded unicode is collected into a single
;;;;	token.  It's somewhat quixotic to collect embedded unicode chars here as
;;;;	they will only be imploded to form a larger token. However, it is more
;;;;	consistent and precludes any possibility of the caller misinterpreting
;;;;	the digit sequence.
;;;;	
;;;;	Input will be integers which need to be mapped to tokens.  Tokens will be
;;;;	single character strings or if not 16bit-chars they will be strings
;;;;	representing the embedded unicode character if not a standard character.
;;;;	Integers may be any 16bit value (16bit text io) and may also contain embedded
;;;;	characters (file io) and 8bit non standard characters (file io).

;;; Following contains a gross hack.  There was a problem with reading ML
;;; files: if there were no characters after the last ";;", then the
;;; last ML form would not be processed (its processing was interrupted
;;; by eof). Easier to kludge then to fix, hence the hack: have scanner return
;;; a space before eof.

(defvar ml-cr (implode-to-character-tok ireturn))
(defvar ml-nl (implode-to-character-tok inewline)) ; carriage return
(defvar ml-lf (implode-to-character-tok inewline)) ; newline
(defvar ml-tab (implode-to-character-tok itab)) ; line feed
(defvar ml-space (implode-to-character-tok ispace))


(defun make-ml-scanner (next eoff addrf)
  (let ((eofp nil)
	(escape-p nil))
    (list*
     #'(lambda (active-escape-p)
	 (setf escape-p nil)
	 (when (eql eofp 'done)
	   (raise-error (error-message 'IO "NEXT: ML Scanner EOF.")))
	 (if eofp
	     ml-space
	     ;; next returns code not char!!
	     (let ((code (funcall next)))
	       ;;(setf a code b active-escape-p) (break "ms")
	       (if (not (integerp code))
		   ;; RLE ??? whats about 16bit unicode chars???
		   code			; ie, a term.
		   (if (and active-escape-p (= iescape code))
		       (progn
			 (when (funcall eoff)
			   (raise-error (error-message '(ML scan escape eof))))
			 (let ((ncode (funcall next)))
			   (case ncode
			     ((#.(character-to-code #\n) #.(character-to-code #\N)) ml-nl)
			     ((#.(character-to-code #\l) #.(character-to-code #\L)) ml-lf)
			     ((#.(character-to-code #\r) #.(character-to-code #\R)) ml-cr)
			     ((#.(character-to-code #\s) #.(character-to-code #\S)) ml-space)
			     ((#.(character-to-code #\t) #.(character-to-code #\T)) ml-tab)

			     (otherwise
			      (if (hex-code-p ncode)
				  ;; convert embedded unicode sequence to token.
				  (let ((unicode (ash (hex-code-to-int ncode) 12)))
				    ;;(setf a ncode b unicode)
				    (dotimes (i 3)
				      (when (funcall eoff)
					(raise-error
					 (error-message '(ML scan escape unicode eof)
							(1+ i))))
				      
				      (let ((code (funcall next)))
					(if (hex-code-p code)
					    (setf unicode (+ unicode
							     (ash (hex-code-to-int code) (* 4 (- 2 i)))))
					    (raise-error
					     (error-message '(ML scan escape unicode hex-digit not)
							    (1+ i) code)))))
				    (intern-system (int-to-character-string unicode)))

				  (progn
				    (setf escape-p t)
				    (implode-to-character-tok ncode)))))))

		       (implode-to-character-tok code))))))

     #'(lambda ()
	 (when (eql eofp 'done)
	       (raise-error (error-message 'IO "EOFP: ML Scanner EOF.")))
	 (if eofp
	     (progn (setf eofp 'done) t)
	   (progn (setf eofp (funcall eoff)) nil)))

     #'(lambda ()
	 (funcall addrf))

     #'(lambda () escape-p))))




(defvar *ml-scanner* nil)


(defvar *ml-scanner-history-p* nil)
(defvar *ml-scanner-history* nil)

(defmacro with-ml-file-scanner ((filespec) &body body)
  `(with-prl-open-file (stream ,filespec in)
    (let ((*ml-scanner* (multiple-value-call #'make-ml-scanner (make-file-scanner stream))))
      ,@body)))

(defmacro with-ml-scanner ((next eofp addrf) &body body)
  `(let ((*ml-scanner* (make-ml-scanner ,next ,eofp ,addrf)))
    (setf *ml-scanner-history* nil)
    ,@body))


(defun ml-scanner-history ()
  (do ((acc nil)
       (sacc nil)
       (i 0 (1+ i))
       (l (reverse *ml-scanner-history*) (cdr l)))
      ((null l)
       (when sacc
	    (push (implode-toks-to-string (nreverse sacc)) acc)
	    (setf sacc nil))
       (nreverse acc))
    ;;(setf a (car l)) (break)
    (if (symbolp (car l))
	(push (car l) sacc)
	(progn
	  (when sacc
	    (push (implode-toks-to-string (nreverse sacc)) acc)
	    (setf sacc nil))
	  (push (car l) acc)))))


(defun ml-next (&optional active-escape-p)
 (funcall (car *ml-scanner*) active-escape-p))

(defun ml-eofp ()
  (funcall (cadr *ml-scanner*)))
  
(defun ml-line-count ()
  (funcall (caddr *ml-scanner*)))

(defun ml-escape-p ()
  (funcall (cdddr *ml-scanner*)))
  

(defun nextch (&optional active-escape-p symbolp)
  (with-system-error ('(ml nextch))
    (when (ml-eofp) (breakout eoftag t))
    (let ((next (ml-next active-escape-p)))
      ;;(format t " ~a" next)
      (when *ml-scanner-history-p*
	(push next *ml-scanner-history*))
      (when symbolp
	(unless (symbolp next)
	  (raise-error (error-message '(scan) "ML Scanner not character."))))
      next)))


;;; output

(defvar output-list)

(defun llterpri ()
  (push #\newline output-list))

(defun llcutbreak ()
  (push #\return output-list))

;; s is symbol or string.
(defun llprins (s)
  (push s output-list))

(defun llprinpmsg (pmsg)
  (push pmsg output-list))

(defun llprinc (expr)
  (let ((string (princ-object-to-string expr t)))
    (push string output-list)))

(defun llprint (expr)
  (let ((string (print-object-to-string expr t)))
    (push string output-list)))

;-- prin1 is the same as print.
(defun llprin1 (expr)  
  (llprint expr))  ;llprin1

;;;
;;; ML stuff primarily:
;;;
;;;;	implode-toks(tok-list) : returns token.
;;;;	implode-toks-to-string(tok-list) : returns string.
;;;;	 ** toks in tok-list must be single character tokens.
;;;;
;;;;	concat(&rest tok-list) : returns token.
;;;;

;; expects symbol, string ok too.

;;;;	RLE PERF : do array lookup of commonly escaped chars instead of string intern.
;;;;	RLE ??? can there be escapes in tokens other than unicode? eg \s, look at
;;;;	RLE ??? scanner to see if passed thru.

(defun explode (x)
  (let* ((s (string x))
	 (l (length s)))
    (if (= 1 l)
	(list (character-to-character-tok (char s 0)))
	(let ((acc nil))
	  (dotimes (i l)
	    (let ((ch (char s i)))
	      (push (if (char= #\\ ch)
			#+16bit-chars
		        (character-to-character-tok  h)
			#-16bit-chars
			(if (and (> (- l i) 4)
				 (hex-char-p (char s (+ i 1)))
				 (hex-char-p (char s (+ i 2)))
				 (hex-char-p (char s (+ i 3)))
				 (hex-char-p (char s (+ i 4))))
			    (prog1
			      (intern-system (subseq s i (+ i 5)))
			      (setf i (+ i 4)))
			    (character-to-character-tok  ch))
			(character-to-character-tok ch))
		    acc)))

	  (nreverse acc)))))


(defun implode-toks-to-string (l)
  (let ((len 0))
    (dolist (m l)
	    (incf len (length (string m))))
    (let ((s (make-string len))
	  (i 0))

      (dolist (m l)
        (let ((str (string m)))
	  (dotimes (j (length str))
	    (setf (char s i) (char str j))
	    (incf i))))
      s)))

(defun implode-toks (l)
  (cond
    ((null l)	'||)			; do not expect null list.
    ;;(cerror "continue results in ml failure"
    ;;"implode-toks called with null string")
    ;;(process-err '|implode|)
    ((null (cdr l))
     (car l))
    (t (intern-system (implode-toks-to-string l)))))



;; for some reason this is more efficient then the following.
;; someday it will break due to length of list being to long for apply.
(defun concat (&rest x)
  (intern-system (apply #'concatenate 'string (mapcar #'string x))))
	         

;; dolist is less efficient than mapc in this case (and probably many others).
;;(defun concat (&rest x)
;;   (intern (with-output-to-string (s)
;;          (mapc #'(lambda (sym) (princ sym s))
;;                x))
;;        *system-package*))



