Implementing Algorithm::NaiveBayes::Simple

On a sudden whim I sat down for 30 minutes of programming (in the end it took far more than 30 minutes). I wrote it with Perl-style object orientation and it came out reasonably clean, though I'm not happy with the data-storage part. I implemented the simplest variant, the multivariate Bernoulli model: it models only whether each word appears in a document or not, ignoring per-document frequency.
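
For reference, here is the model in equation form (my notation, not something from the original post). Writing x_w = 1 if word w from the vocabulary V appears in document d and x_w = 0 otherwise, the classifier scores each class c as

  P(c \mid d) \propto P(c) \prod_{w \in V} P(w \mid c)^{x_w} \, \bigl(1 - P(w \mid c)\bigr)^{1 - x_w}

and the code below estimates each word probability with add-one smoothing as

  P(w \mid c) = \frac{n_{w,c} + 1}{N_c + |V|}

where n_{w,c} is the number of class-c instances containing w and N_c is the number of class-c instances.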


Simple code that's good for studying machine learning methods like Naive Bayes is surprisingly hard to find, so I'm shamelessly publishing mine, even if nobody asked. Please don't tell me to go read the Weka source code. Since NB only needs to store frequency tables, it can probably be written in under 100 lines in any language; in a lightweight language someone has probably done it as a one-liner…. I'd like to implement the other NB models too.


I think this would make a good introductory exercise for new lab members, but I haven't worked up the nerve to suggest it. I should also try implementing it in Java or Ruby, not just Perl.

Source code

package Algorithm::NaiveBayes::Simple;

use strict;
use warnings;

{
  sub new{
    my ($class) = @_;
    bless {
           _freqtable => {},  # frequency table for each class
           _word => {},       # word feature
           _count => {},      # instance count for each class
           _total_count => 0, # total instance count
          }, $class;
  }

  sub add_instance{
    my ($self, %argh) = @_;

    my $label = $argh{label};
    my $attributes_ref = $argh{attributes};

    foreach my $word (keys %{ $attributes_ref }){
      # count each word at most once per instance (multivariate Bernoulli model)
      $self->{_freqtable}->{$label}->{$word}++;

      # register the word in the vocabulary (a set, so overwriting an existing entry is harmless)
      $self->{_word}->{$word} = 1;
    }

    # increment instance count
    $self->{_count}->{$label}++;
    $self->{_total_count}++;
  }

  sub predict{
    my ($self, %argh) = @_;
    my $attributes_ref = $argh{attributes};
    my %posterior_of;
    my $evidence = 0;

    foreach my $label (keys %{ $self->{_count} }){

      # calculate prior
      $posterior_of{$label} = $self->{_count}->{$label} / $self->{_total_count};

      foreach my $word (keys %{ $self->{_word} }){

        # word likelihood with add-one (Laplace) smoothing:
        # (class instances containing the word + 1) / (class instance count + vocabulary size)
        my $word_frequency = 1;
        $word_frequency += $self->{_freqtable}->{$label}->{$word}
                                   if (exists $self->{_freqtable}->{$label}->{$word});
        my $word_likelihood = $word_frequency
                              / ($self->{_count}->{$label} + scalar keys %{ $self->{_word} });

        # If the word appeared
        if(exists $attributes_ref->{$word}){
          $posterior_of{$label} *= $word_likelihood;
        }

        # If NOT
        else{
          $posterior_of{$label} *= (1 - $word_likelihood);
        }
      }

      $evidence += $posterior_of{$label};
    }

    # normalize by the evidence so the posteriors sum to 1
    foreach my $label (keys %posterior_of){
      $posterior_of{$label} /= $evidence;
    }

    return \%posterior_of;
  }

}
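
One caveat, as a side note of mine rather than something from the original post: predict() multiplies one probability per vocabulary word, so with a realistically sized vocabulary the product underflows to 0 in floating point. The usual fix is to sum logarithms instead and normalize with the log-sum-exp trick. A minimal sketch below, assuming the same internal fields as the module above (predict_log is a hypothetical name, not part of the module):

use List::Util qw(max);

sub predict_log {
  my ($self, %argh) = @_;
  my $attributes_ref = $argh{attributes};
  my %logscore_of;

  foreach my $label (keys %{ $self->{_count} }){
    # log prior instead of prior
    $logscore_of{$label} = log($self->{_count}->{$label} / $self->{_total_count});

    foreach my $word (keys %{ $self->{_word} }){
      # same smoothed estimate as in predict() above
      my $word_frequency = 1;
      $word_frequency += $self->{_freqtable}->{$label}->{$word}
        if exists $self->{_freqtable}->{$label}->{$word};
      my $p = $word_frequency
              / ($self->{_count}->{$label} + scalar keys %{ $self->{_word} });

      # add log probabilities instead of multiplying probabilities
      $logscore_of{$label} += exists $attributes_ref->{$word} ? log($p) : log(1 - $p);
    }
  }

  # convert back to normalized posteriors via log-sum-exp
  my $max_score = max(values %logscore_of);
  my $evidence = 0;
  $evidence += exp($_ - $max_score) for values %logscore_of;
  $logscore_of{$_} = exp($logscore_of{$_} - $max_score) / $evidence
    for keys %logscore_of;

  return \%logscore_of;
}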

Usage example

# MAIN
package main;
{

  my $nb = Algorithm::NaiveBayes::Simple->new();


  my %attribute1 = (
                    hoge => 2,
                    piyo => 1,
                   );

  my %attribute2 = (
                    foo => 2,
                    bar => 2,
                   );

  # add instance to train classifier
  $nb->add_instance(label => "positive", attribuets => \%attribute1);
  $nb->add_instance(label => "negative", attributes => \%attribute2);


  # predict posterior probabilities
  my $result_ref1 = $nb->predict(attributes => {hoge => 1, piyo => 1});
  my $result_ref2 = $nb->predict(attributes => {foo => 1, bar => 1});


  # print (labels sorted and values rounded so the output is reproducible)
  foreach my $result_ref ($result_ref1, $result_ref2){
    print "Result:\n";
    foreach my $label (sort keys %{ $result_ref }){
      printf "%s => %.4f\n", $label, $result_ref->{$label};
    }
    print "---\n";
  }

}

Output:
Result:
negative => 0.1233
positive => 0.8767
---
Result:
negative => 0.8767
positive => 0.1233
---
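
As a sanity check (my own arithmetic, not from the original post), the first prediction can be verified by hand. After training, the vocabulary is {hoge, piyo, foo, bar}, each class holds one instance, and both priors are 1/2. The smoothed likelihoods are P(hoge | positive) = (1+1)/(1+4) = 0.4 and P(foo | positive) = (0+1)/(1+4) = 0.2, and symmetrically for the negative class, so for the document d = {hoge, piyo}:

  P(positive) \cdot P(d \mid positive) = 0.5 \cdot 0.4 \cdot 0.4 \cdot (1-0.2) \cdot (1-0.2) = 0.0512
  P(negative) \cdot P(d \mid negative) = 0.5 \cdot 0.2 \cdot 0.2 \cdot (1-0.4) \cdot (1-0.4) = 0.0072
  P(positive \mid d) = 0.0512 / (0.0512 + 0.0072) \approx 0.8767

which matches the printed result.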