NaiveBayes in Lisp

LispでNaiveBayesを実装してみた.ハッシュの力に頼らざるを得なかったし,不必要なところでもハッシュを多用している.明らかにPerl病なコードが完成した.
コメント空行込みで130行程度.勉強時間,ハマった時間を含めて3時間程度かかった.


書き終えて思ったのだけれど,Lispコードって別に見づらくなくない?ということ.自分で書いたからかもしれないけれど,イマドキのエディタは自動インデントや括弧の対応をやってくれるので,関数やマクロの命名法さえ身につけば,見やすいコードになるのでは,と思った.思うだけはタダ.


ちょっとだけ自分をほめたいポイント

  • なんにせよ書き終えた
  • nilの場合,ほげほげするという処理で,自然とor関数を使えた
  • 長ったらしい処理部分を自前関数で書いた
; create classifier
(defun make-classifier ()
  (setq freq-hash-table (make-hash-table))
  (setq feature-hash (make-hash-table))
  (setq instance-count-hash (make-hash-table))
  (setq total-word-count-hash (make-hash-table))
  (setq total-instance-count 0))


;; add instance
(defun add-instance (label attributes)
  ; If input lable is new-face, create new hash for the label
  (if (null (gethash label freq-hash-table))
      (and (setf (gethash label freq-hash-table) (make-hash-table))
	   ; set total word count as 0
	   (setf (gethash label total-word-count-hash) 0)))

  ; for each word, add frequency
  (dolist (word attributes)
    (if (gethash word (gethash label freq-hash-table))
	(incf (gethash word (gethash label freq-hash-table)))
      (setf (gethash word (gethash label freq-hash-table)) 1))
    
    ; add to feature hash
    (if (null (gethash word feature-hash))
	(setf (gethash word feature-hash) t)))

  ; add instance count
  (if (null (gethash label instance-count-hash))
      (setf (gethash label instance-count-hash) 1)
    (incf (gethash label instance-count-hash)))
  (setq total-instance-count (1+ total-instance-count))

  ; add total word count
  (setf (gethash label total-word-count-hash)
	(+ (gethash label total-word-count-hash)
	   (length attributes)))
  )


;; train classifier
(defun train ()
  (setq word-likelihood-hash (make-hash-table))

  ; for each labels
  (dolist (label (hash-keys freq-hash-table))
    ; create new hash for the label
    (or (gethash label word-likelihood-hash)
	(setf (gethash label word-likelihood-hash) (make-hash-table)))
    
    (dolist (feature (hash-keys feature-hash))
      
      ; set word likelihood estimated by laplace smoothing
      (setf (gethash feature (gethash label word-likelihood-hash))
	    (/
	     (1+ (or (gethash feature (gethash label freq-hash-table)) 0))
	     (+ (hash-table-count feature-hash)
	(gethash label total-word-count-hash))))
      )))


;; calculate prior
(defun calc-prior (label)
  (/ (gethash label instance-count-hash)
     total-instance-count))



;; calc posteriors
(defun calc-posteriors (attributes)
  ; create attributes hash
  (let ((attribute-hash (make-hash-table))
	(result-hash (make-hash-table))
	(evidence 0)
	(result-lis nil))

    ; create attributes hash
    (dolist (att attributes)
      (hash++ att attribute-hash))

    ; calculate posterior for each label
    (dolist (label (hash-keys freq-hash-table))
      (let ((tmp-posterior 0))
	(maphash #'(lambda (att freq)
	   (setq tmp-posterior
	 (+ tmp-posterior
	    (* (log (gethash att (gethash label word-likelihood-hash)))
	       freq)))) attribute-hash)
	; save posterior without normalization
	(setf (gethash label result-hash)
	      (* (exp tmp-posterior)
	 (calc-prior label)))
	; add to evidence
	(setq evidence (+ evidence
	  (gethash label result-hash)))
	))

    ; normalize posterior
    (maphash #'(lambda (label posterior)
	       (setf (gethash label result-hash) (/ posterior evidence)))
	     result-hash)

    result-hash))


;; predict
(defun predict (attributes)
  (let ((att-hash (calc-posteriors attributes))
	(result-lis nil))
    (maphash #'(lambda (label prob)
	 (setq result-lis (cons (list label prob) result-lis)))
	     att-hash)
    result-lis))


;; utility functions
;; get hash key list
(defun hash-keys (hash)
  (let ((lis nil))
    (maphash #'(lambda (key value)
	 (setq lis (cons key lis)))
	     hash)
    lis))

;; increment hash value 1
(defun hash++ (key hash)
  (if (null (gethash key hash))
      (setf (gethash key hash) 1)
    (setf (gethash key hash) (1+ (gethash key hash)))))

;; increment hash value n
(defun hash+num (key num hash)
  (if (null (gethash key hash))
      (setf (gethash key hash) num)
    (setf (gethash key hash) (+ (gethash key hash) num))))

多分,ちゃんと動いているはず

; for test
(make-classifier)
(add-instance 'pos '(hoge foo bar hoge foo bar))
(add-instance 'pos '(hoge hoge foo bar))
(add-instance 'neg '(piyo hoge bar piyo hoge))
(add-instance 'neg '(piyo bar foo hoge))
(train)

(predict '(hoge foo))
;=> ((neg 0.3168957) (pos 0.6831043))

(predict '(piyo bar))
;=> ((neg 0.7767503) (pos 0.2232497))

(predict '(foo piyo))
;=> ((neg 0.6987523) (pos 0.3012478))