;;; unicode.el --- for UNICODE special features

;; Copyright (C) 1997-1999 Miyashita Hisashi

;; Keywords: mule, multilingual, 
;;           character set, coding-system, ISO10646, Unicode

;; This file is part of MULE-UCS

;; MULE-UCS is free software; you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation; either version 2, or (at your option)
;; any later version.

;; MULE-UCS is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;; GNU General Public License for more details.

;; You should have received a copy of the GNU General Public License
;; along with this program; see the file COPYING.  If not, write to the
;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
;; Boston, MA 02111-1307, USA.

;; Comment:
;;  This module supports unicode translations.

(require 'mucs)
(require 'trans-util)
(require 'tae)

(require 'un-data)

(defconst utf-8-encode-buffer-magnification 2)
(defconst utf-8-decode-buffer-magnification 2)
(defconst utf-16-encode-buffer-magnification 2)
(defconst utf-16-decode-buffer-magnification 2)

(defvar mucs-unicode-default-decode-replacement ??)
(defvar mucs-unicode-default-encode-replacement ?\xFFFD)
(defvar mucs-unicode-charset-translation-max-gap t)

(defvar mucs-ccl-unicode-translation-table-number 1)

(defun mucs-unicode-default-encoding (x)
  (cons (char-codepoint (car x)) (cdr x)))

(defun utf-16-ccl-surrogate-pair-p (reg)
  `((,reg & ?\xf800) == ?\xd800))

;;;
;;; UCS generic type definition.
;;;
(mucs-define-type
 'ucs-generic
 'identity
 'identity)

;;;
;;; UCS replacement or ignore translation rule.
;;;

(defvar unicode-not-found-assoc
  `(assoc (char-1 . ucs-generic)
	  ( ,@(mapcar
	       (lambda (x)
		 (cons 'invalid x))
	       unicode-ignore-characters)
	      (all . ,mucs-unicode-default-encode-replacement)
	      (,mucs-unicode-default-decode-replacement . all)))
  "Translate any values to replacement character or invalid code.
If you want to deal with untranslated character, use this translation rule.")

;;
;; Dealing with line separator problem.
;;

(defun convert-unicode-lf-2-crlf (cr-output)
  `((if (r0 == ,unicode-lf)
	,(append cr-output))))

(defvar lf-vs-cr-assoc
  `(assoc (char-1 . ucs-generic)
	  ((?\xa . ,unicode-cr))))

(defvar lf-vs-unicode-line-separator-assoc
  `(assoc (char-1 . ucs-generic)
	  ((?\xa . ,unicode-line-separator))))

;;
;; WRITE SIGNATURE, CHECK AND READ SIGNATURE
;; READ, WRITE
;;

(defvar ucs4-be-ccl-encode
  mucs-ccl-write-ex-be-4-octet)

(defvar ucs4-le-ccl-encode
  mucs-ccl-write-ex-le-4-octet)

;UTF 8 ------------------------------------------------

(defvar utf-8-ccl-encode
  `((if (r0 < ?\x80)
	((write r0))
      (if (r0 < ?\x800)
	  ((write ((r0 >> 6) | ?\xc0))
	   (write ((r0 & ?\x3f) | ?\x80)))
	(if (r0 < ?\x10000)
	    ((write ((r0 >> 12) | ?\xe0))
	     (write (((r0 >> 6) & ?\x3f) | ?\x80))
	     (write ((r0 & ?\x3f) | ?\x80)))
	  (if (r0 < ?\x200000)
	      ((write ((r0 >> 18) | ?\xf0))
	       (write (((r0 >> 12) & ?\3f) | ?\x80))
	       (write (((r0 >> 6) & ?\x3f) | ?\x80))
	       (write ((r0 & ?\x3f) | ?\x80)))
	    (if (r0 < ?\x4000000)
		((write ((r0 >> 24) | ?\xf8))
		 (write (((r0 >> 18) & ?\x3f) | ?\x80))
		 (write (((r0 >> 12) & ?\x3f) | ?\x80))
		 (write (((r0 >> 6) & ?\x3f) | ?\x80))
		 (write ((r0 & ?\x3f) | ?\x80)))
	      ((write ((r0 >> 30) | ?\xfc))
	       (write (((r0 >> 24) & ?\x3f) | ?\x80))
	       (write (((r0 >> 18) & ?\x3f) | ?\x80))
	       (write (((r0 >> 12) & ?\x3f) | ?\x80))
	       (write (((r0 >> 6) & ?\x3f) | ?\x80))
	       (write ((r0 & ?\x3f) | ?\x80))))))))))

(defvar utf-8-ccl-decode
  `((read-if (r0 >= ?\x80)
	((if (r0 < ?\xe0)
	     ((read r4)
	      (r4 &= ?\x3f)
	      (r0 = (((r0 & ?\x1f) << 6) | r4)))
	   (if (r0 < ?\xf0)
	       ((read r4 r6)
		(r4 = ((r4  & ?\x3f) << 6))
		(r6 &= ?\x3f)
		(r0 = ((((r0 & ?\xf) << 12) | r4) | r6)))
	     (if (r0 < ?\xf8)
		 ((read r1 r4 r6)
		  (r1 = ((r1  & ?\x3f) << 12))
		  (r4 = ((r4  & ?\x3f) << 6))
		  (r6 &= ?\x3f)
		  (r0 = (((((r0 & ?\x7) << 18) | r1) | r4) | r6)))
	       (if (r0 < ?\xfc)
;;;; MUCS can't read any numbers lager than 24bit
		   ((read r0 r1 r4 r6)
		    (r1 = ((r1  & ?\x3f) << 12))
		    (r4 = ((r4  & ?\x3f) << 6))
		    (r6 &= ?\x3f)
		    (r0 = (((((r0 & ?\x3f) << 18) | r1) | r4) | r6)))
		 (r0 = 0)))))))))

(mucs-type-register-serialization
 'ucs-generic
 'utf-8
 utf-8-ccl-encode
 utf-8-ccl-decode)

(mucs-type-register-serialization
 'ucs-generic
 'utf-8-dos
 (append
  (convert-unicode-lf-2-crlf '((write ?\xd)))
  utf-8-ccl-encode)
 'none)

(defun mucs-ccl-write-utf-8-signature ()
  '((write "\xEF\xBB\xBF")))

(defun mucs-ccl-utf-8-check-signature-read ()
  (append
   utf-8-ccl-decode
   `((if (r0 == ,unicode-signature)
	 ,utf-8-ccl-decode))))

;UTF 16 -----------------------------------------------
;;;;
;;;; If register (r5 & ?\x1) is 1, current mode is little endian.
;;;;

(defun mucs-ccl-utf-16-little-endian-p ()
  '(r5 & ?\x1))

(defun mucs-ccl-set-utf-16-endian (littlep)
  (if littlep
      '((r5 |= ?\x1))
    `((r5 &= ,(logxor (mucs-number-mask) ?\x1)))))

(defvar utf-16-ccl-decode
  `((if ,(mucs-ccl-utf-16-little-endian-p)
	,mucs-ccl-read-ex-le-2-octet
      ,mucs-ccl-read-ex-be-2-octet)
    (if ,(utf-16-ccl-surrogate-pair-p 'r0)
	((if ,(mucs-ccl-utf-16-little-endian-p)
	     ((read r6 r4))
	   ((read r4 r6)))
	 (r0 = (((r0 & ?\x3ff) + ?\x40) << 10))
	 (r6 &= ?\x3f)
	 (r4 = ((r4 & ?\x3) << 6) | r6)
	 (r0 |=  r4)))))

(defun mucs-ccl-utf-16-check-signature-read ()
  (append mucs-ccl-read-ex-le-2-octet
	  `((if (r0 == ,unicode-signature)
		,(append (mucs-ccl-set-utf-16-endian t)
			 mucs-ccl-read-ex-le-2-octet)
	      (if (r0 == ,unicode-reverse-signature)
		  ,(append (mucs-ccl-set-utf-16-endian nil)
			   mucs-ccl-read-ex-be-2-octet)))
	    (if ,(utf-16-ccl-surrogate-pair-p 'r0)
		((if ,(mucs-ccl-utf-16-little-endian-p)
		     ((read r6 r4))
		   ((read r4 r6)))
		 (r0 = (((r0 & ?\x3ff) + ?\x40) << 10))
		 (r6 &= ?\x3f)
		 (r4 = ((r4 & ?\x3) << 6) | r6)
		 (r0 |=  r4))))))

(defun mucs-ccl-read-utf-16 ()
  utf-16-ccl-decode)

;UTF 16 Little Endian----------------------------------

(defvar utf-16-le-ccl-encode
  `((if (r0 < ?\xffff)
	,mucs-ccl-write-ex-le-2-octet
      ((r4 = (((r0 >> 16) - 1) & ?\xf))
       (r6 = ((r0 >> 10) & ?\x3f))
       (write ((r4 & ?\x3) | r6))
       (write ((r4 >> 2) | ?\xd8))
       (write (r0 & ?\7f))
       (write (((r0 >> 8) & ?\x3) | ?\xdc))))))

(defvar utf-16-le-ccl-decode
  (append mucs-ccl-read-ex-le-2-octet
    `((if ,(utf-16-ccl-surrogate-pair-p 'r0)
	  ((read r6 r4)
	   (r0 = (((r0 & ?\x3ff) + ?\x40) << 10))
	   (r6 &= ?\x3f)
	   (r4 = ((r4 & ?\x3) << 6) | r6)
	   (r0 |=  r4))))))

(mucs-type-register-serialization
 'ucs-generic
 'utf-16-le
 utf-16-le-ccl-encode
 utf-16-le-ccl-decode)

(mucs-type-register-serialization
 'ucs-generic
 'utf-16-le-dos
 (append
  (convert-unicode-lf-2-crlf '((write "\x0D\x00")))
  utf-16-le-ccl-encode)
 'none)

(defun mucs-ccl-write-utf-16-le-signature ()
  '((write "\xFF\xFE")))

;UTF 16 Big Endian-------------------------------------

(defvar utf-16-be-ccl-decode
  (append mucs-ccl-read-ex-be-2-octet
    `((if ,(utf-16-ccl-surrogate-pair-p 'r0)
	  ((read r4 r6)
	   (r0 = (((r0 & ?\x3ff) + ?\x40) << 10))
	   (r6 &= ?\x3f)
	   (r4 = ((r4 & ?\x3) << 6) | r6)
	   (r0 |=  r4))))))

(defvar utf-16-be-ccl-encode
  `((if (r0 < ?\xffff)
	,mucs-ccl-write-ex-be-2-octet
      ((r4 = (((r0 >> 16) - 1) & ?\xf))
       (r6 = ((r0 >> 10) & ?\x3f))
       (write ((r4 >> 2) | ?\xd8))
       (write ((r4 & ?\x3) | r6))
       (write (((r0 >> 8) & ?\x3) | ?\xdc))
       (write (r0 & ?\7f))))))

(mucs-type-register-serialization
 'ucs-generic
 'utf-16-be
 utf-16-be-ccl-encode
 utf-16-be-ccl-decode)

(mucs-type-register-serialization
 'ucs-generic
 'utf-16-be-dos
 (append
  (convert-unicode-lf-2-crlf '((write "\x00\x0D")))
  utf-16-be-ccl-encode)
 'none)

(defun mucs-ccl-write-utf-16-be-signature ()
  '((write ?\xfe) (write ?\xff)))

;------------------------------------------------------

(defvar unicode-charset-library-alist
  '((ascii . uascii)
    (latin-iso8859-1 . uiso8859-1)
    (latin-iso8859-2 . uiso8859-2)
    (latin-iso8859-3 . uiso8859-3)
    (latin-iso8859-4 . uiso8859-4)
    (cyrillic-iso8859-5 . uiso8859-5)
    (arabic-iso8859-6 . uiso8859-6)
    (greek-iso8859-7 . uiso8859-7)
    (hebrew-iso8859-8 . uiso8859-8)
    (latin-iso8859-9 . uiso8859-9)
    (latin-jisx0201 . ujisx0201)
    (katakana-jisx0201 . ujisx0201)
    (japanese-jisx0208 . ujisx0208)
    (japanese-jisx0212 . ujisx0212)
    (chinese-gb2312 . ugb2312)
    (chinese-cns11643-1 . u-cns-1)
    (chinese-cns11643-2 . u-cns-2)
    (chinese-cns11643-3 . u-cns-3)
    (chinese-cns11643-4 . u-cns-4)
    (chinese-cns11643-5 . u-cns-5)
    (chinese-cns11643-6 . u-cns-6)
    (chinese-cns11643-7 . u-cns-7)
    (korean-ksc5601 . uksc5601)
    (ipa . uipa)))

(defun require-unicode-charset-data (charset)
  (let ((package (cdr (assq charset unicode-charset-library-alist))))
    (or (featurep package)
	(load (expand-file-name (symbol-name package)
				mucs-data-path)
	      t)
	(require package))))

(provide 'unicode)

