Algorithm::NaiveBayes::Simpleの実装
いきなり思い立って30分プログラミング(終わってみれば30分どころではなかった).Perlらしいオブジェクト指向で,それなりに綺麗に書けた.ただ,データ保持のところが気に食わない.一番簡単なmultivariate bernoulli modelを実装した(単語が文書に現れるか否かをモデル化するもの.文書あたりの出現頻度は考慮しない).
Naive Bayesをはじめ,機械学習手法を勉強するのによさげな簡単なコードってそんなに見つからないのでKY公開.Wekaのソースコード嫁とかいわないでください.NBは頻度表を保存するだけで良いので,おそらくどんな言語でも100行未満で書ける.LLだとワンライナーとかいそう….他のNB手法も実装したいなー
研究室の導入研修にちょうど良いと思っているのだけれど,言い出せずにいる.PerlじゃなくてJavaとかRubyでも実装してみよう.
そーすこーど
package Algorithm::NaiveBayes::Simple;
use strict;
use warnings;

{
    # Multivariate Bernoulli naive Bayes classifier.
    # Models only whether a word appears in a document or not;
    # per-document frequency counts are deliberately ignored.

    # new()
    # Constructor. Takes no arguments; returns an empty, untrained classifier.
    sub new {
        my ($class) = @_;
        return bless {
            _freqtable   => {},  # per-label document frequency of each word
            _word        => {},  # vocabulary: set of every word seen in training
            _count       => {},  # number of training instances per label
            _total_count => 0,   # total number of training instances
        }, $class;
    }

    # add_instance(label => $label, attributes => \%words)
    # Trains on one document. Each key of %words is a word feature;
    # values are ignored (Bernoulli model: presence only).
    # Dies if 'label' is missing or 'attributes' is not a hash reference,
    # instead of silently mistraining (e.g. on a misspelled key name).
    sub add_instance {
        my ($self, %args) = @_;
        my $label          = $args{label};
        my $attributes_ref = $args{attributes};

        die "add_instance: 'label' and 'attributes' (hashref) are required\n"
            unless defined $label && ref $attributes_ref eq 'HASH';

        foreach my $word (keys %{$attributes_ref}) {
            # Increment by only 1 per document (multivariate Bernoulli model).
            $self->{_freqtable}{$label}{$word}++;
            # Record the word in the vocabulary (idempotent set insert).
            $self->{_word}{$word} = 1;
        }

        # Increment instance counts.
        $self->{_count}{$label}++;
        $self->{_total_count}++;
        return;
    }

    # predict(attributes => \%words)
    # Returns a hashref mapping each known label to its posterior
    # probability P(label | words); the values sum to 1.
    # Dies if the classifier has not been trained yet (would otherwise
    # divide by zero).
    sub predict {
        my ($self, %args) = @_;
        my $attributes_ref = $args{attributes};

        die "predict: classifier has no training instances\n"
            if $self->{_total_count} == 0;

        # Loop-invariant: vocabulary size for Laplace smoothing denominator.
        my $vocabulary_size = scalar keys %{ $self->{_word} };

        my %posterior_of;
        my $evidence = 0;

        foreach my $label (keys %{ $self->{_count} }) {
            # Start from the prior P(label).
            $posterior_of{$label} =
                $self->{_count}{$label} / $self->{_total_count};

            foreach my $word (keys %{ $self->{_word} }) {
                # Word likelihood P(word | label) with Laplace correction.
                my $word_frequency = 1;
                $word_frequency += $self->{_freqtable}{$label}{$word}
                    if exists $self->{_freqtable}{$label}{$word};
                my $word_likelihood =
                    $word_frequency / ($self->{_count}{$label} + $vocabulary_size);

                # Bernoulli model: multiply by P(word|label) when the word
                # is present in the query, by 1 - P(word|label) when absent.
                if (exists $attributes_ref->{$word}) {
                    $posterior_of{$label} *= $word_likelihood;
                }
                else {
                    $posterior_of{$label} *= (1 - $word_likelihood);
                }
            }

            $evidence += $posterior_of{$label};
        }

        # Normalize so the posteriors sum to 1.
        # $evidence > 0 is guaranteed: smoothed likelihoods lie strictly
        # in (0, 1) and priors are positive.
        foreach my $label (keys %posterior_of) {
            $posterior_of{$label} /= $evidence;
        }

        return \%posterior_of;
    }
}

1;
実行例
# MAIN
package main;
{
    my $nb = Algorithm::NaiveBayes::Simple->new();

    my %attribute1 = (
        hoge => 2,
        piyo => 1,
    );
    my %attribute2 = (
        foo => 2,
        bar => 2,
    );

    # Add instances to train the classifier.
    # (Fixed: the first call originally passed the misspelled key
    # "attribuets", so the positive class was trained with no words.)
    $nb->add_instance(label => "positive", attributes => \%attribute1);
    $nb->add_instance(label => "negative", attributes => \%attribute2);

    # Predict posterior probabilities for two queries.
    my $result_ref1 = $nb->predict(attributes => {hoge => 1, piyo => 1});
    my $result_ref2 = $nb->predict(attributes => {foo => 1, bar => 1});

    # Print each result set.
    foreach my $result_ref ($result_ref1, $result_ref2){
        print "Result:\n";
        foreach my $label (keys %{ $result_ref }){
            print "$label => $result_ref->{$label}\n";
        }
        print "---\n";
    }
}

実行結果:
Result:
positive => 0.876712328767123
negative => 0.123287671232877
---
Result:
positive => 0.123287671232877
negative => 0.876712328767123
---