|
| 1 | +# -*- mode: cperl -*- |
| 2 | +package PDL::Kohonen; |
| 3 | + |
| 4 | +use PDL; |
| 5 | +use PDL::IO::FastRaw; |
| 6 | + |
| 7 | +@ISA = qw(PDL); |
| 8 | + |
| 9 | +### TO DO: load+save methods (simply with fraw) |
| 10 | + |
| 11 | + |
| 12 | +# data is always D x N |
| 13 | +# where D is the dimensionality of the data |
| 14 | +# and there are N data points |
| 15 | +# maps are always D x W x H |
| 16 | + |
| 17 | +sub initialize { |
| 18 | + my $class = shift; |
| 19 | + my $self = { |
| 20 | + PDL => null, # used to store PDL object |
| 21 | + }; |
| 22 | + bless $self, $class; |
| 23 | +} |
| 24 | + |
| 25 | +## |
| 26 | +# init method |
| 27 | +## |
| 28 | +# usage: $map->init($data, 10, 6, 4); |
| 29 | +# initialises a 10x6x4 map randomly based on the min/max |
| 30 | +# of the data in $data piddle |
| 31 | +# or: $map->init($perlarrayref, 8, 7) |
| 32 | +# initialises a 8x7 map as above but uses a reference |
| 33 | +# to a perl array of individual data point piddles |
| 34 | +# [note: arrays of many piddles seem to take up memory] |
| 35 | +## |
| 36 | +sub init { |
| 37 | + my ($self, $data, @mdims) = @_; |
| 38 | + die "init() arguments: data, map_dimensions\n" |
| 39 | + unless (defined $data && @mdims > 0); |
| 40 | + |
| 41 | + if (ref($data) eq 'ARRAY') { |
| 42 | + $data = cat(@$data); |
| 43 | + } |
| 44 | + |
| 45 | + my $max = $data->mv(-1,0)->maximum; |
| 46 | + my $min = $data->mv(-1,0)->minimum; |
| 47 | + |
| 48 | + my @ddims = $data->dims; |
| 49 | + my $n = pop @ddims; |
| 50 | + $self->{mapdims} = \@mdims; # map dimensions, e.g. 6 x 4 |
| 51 | + $self->{nmapdims} = scalar @mdims; |
| 52 | + $self->{mapvolume} = ones(@mdims)->nelem; |
| 53 | + $self->{datadims} = \@ddims; # data dimensions, e.g. 20 x 15 |
| 54 | + $self->{ndatadims} = scalar @ddims; |
| 55 | + ($self->{sdim}) = sort { $a<=>$b } @mdims; # smallest map dimension size |
| 56 | + $self->{PDL} = random(@ddims, @mdims); |
| 57 | + $self->{PDL} *= ($max-$min)+$min; |
| 58 | + return $self; |
| 59 | +} |
| 60 | + |
| 61 | +sub train { |
| 62 | + my ($self, $data, $p) = @_; |
| 63 | + |
| 64 | + if (ref($data) eq 'ARRAY') { |
| 65 | + $data = cat(@$data); |
| 66 | + } |
| 67 | + |
| 68 | + my $n = $data->dim(-1); |
| 69 | + my $dta = $data->mv(-1, 0); |
| 70 | + |
| 71 | + die "you must init() the map first\n" unless ($self->{mapdims}); |
| 72 | + |
| 73 | + my $alpha = defined $p->{alpha} ? $p->{alpha} : 0.1; |
| 74 | + my $radius = defined $p->{radius} ? $p->{radius} : $self->{sdim}/2; |
| 75 | + |
| 76 | + $p->{epochs} = 1 unless (defined $p->{epochs}); |
| 77 | + $p->{order} = 'random' unless (defined $p->{order}); # also: linear |
| 78 | + |
| 79 | + $p->{ramp} = 'linear' unless (defined $p->{ramp}); # also: off |
| 80 | + |
| 81 | + my $winnerfunc = $p->{winnerfunc} || \&euclidean_winner; |
| 82 | + $p->{progress} = 'on' unless (defined $p->{progress}); |
| 83 | + |
| 84 | + my ($dalpha, $dradius) = (0, 0); |
| 85 | + if ($p->{ramp} =~ /linear/i) { |
| 86 | + $dalpha = $alpha/$p->{epochs}; |
| 87 | + $dradius = $radius/$p->{epochs}; |
| 88 | + } |
| 89 | + |
| 90 | + my $progformat = $p->{progress} =~ /on/i ? |
| 91 | + "\rradius %2d alpha %6.4f data %5d of %5d epoch %3d of %3d" : ''; |
| 92 | + |
| 93 | + my $ordertype = 0; |
| 94 | + $ordertype = 1 if ($p->{order} =~ /linear/i); |
| 95 | + |
| 96 | + if ($p->{progress} =~ /on/) { |
| 97 | + printf "training map (%s) with %d data points (%s) for %d epochs...\n", |
| 98 | + join("x", @{$self->{mapdims}}), $n, |
| 99 | + join("x", @{$self->{datadims}}), $p->{epochs}; |
| 100 | + } |
| 101 | + local $| = 1; |
| 102 | + |
| 103 | + for (my $e=0; $e<$p->{epochs}; $e++) { |
| 104 | + for (my $i=0; $i<$n; $i++) { |
| 105 | + printf $progformat, $radius, $alpha, $i+1, $n, $e+1, $p->{epochs}; |
| 106 | + my $d = $ordertype ? $i : int(rand($n)); |
| 107 | + |
| 108 | + my $vec = $dta->slice("($d)"); |
| 109 | + |
| 110 | + my @w = $self->$winnerfunc($vec); |
| 111 | + |
| 112 | + for (my $r=$radius; $r>=0; $r--) { |
| 113 | + my $hood = $self->hood($r, @w); |
| 114 | + $hood -= ($hood-$vec)*$alpha; |
| 115 | + } |
| 116 | + } |
| 117 | + $alpha -= $dalpha; |
| 118 | + $radius -= $dradius; |
| 119 | + } |
| 120 | + print "\n" if ($progformat); |
| 121 | +} |
| 122 | + |
| 123 | + |
| 124 | +# runs your data through a trained map |
| 125 | +# returns two piddles |
| 126 | +# - winning node coordinates (ushort M x N) |
| 127 | +# - quantisation error (double N) |
| 128 | +sub apply { |
| 129 | + my ($self, $data, $p) = @_; |
| 130 | + my $winnerfunc = $p->{winnerfunc} || \&euclidean_winner; |
| 131 | + $p->{progress} = 'on' unless (defined $p->{progress}); |
| 132 | + |
| 133 | + if (ref($data) eq 'ARRAY') { |
| 134 | + $data = cat(@$data); |
| 135 | + } |
| 136 | + |
| 137 | + my $n = $data->dim(-1); |
| 138 | + my $dta = $data->mv(-1, 0); |
| 139 | + |
| 140 | + my $mds = join("x", @{$self->{mapdims}}); |
| 141 | + my $progformat = $p->{progress} =~ /on/i ? |
| 142 | + "\rapplying map ($mds) to data point %5d of %5d" : ''; |
| 143 | + |
| 144 | + local $| = 1; |
| 145 | + |
| 146 | + my $winvecs = zeroes ushort, $n, $self->{nmapdims}; |
| 147 | + my $errors = zeroes $n; |
| 148 | + |
| 149 | + my $error = 0; |
| 150 | + for (my $i=0; $i<$n; $i++) { |
| 151 | + printf $progformat, $i+1, $n; |
| 152 | + my $vec = $dta->slice("($i)"); |
| 153 | + $winvecs->slice("($i)") .= ushort($self->$winnerfunc($vec, \$error)); |
| 154 | + set($errors, $i, $error); |
| 155 | + } |
| 156 | + print "\n" if ($progformat); |
| 157 | + |
| 158 | + return ($winvecs->mv(0, -1), $errors); |
| 159 | +} |
| 160 | + |
| 161 | +sub euclidean_winner { |
| 162 | + my ($self, $vec, $qref) = @_; |
| 163 | + |
| 164 | + my $d = $vec - $self; |
| 165 | + $d *= $d; |
| 166 | + while ($d->ndims > $self->{nmapdims}) { |
| 167 | + $d = $d->sumover(); |
| 168 | + } |
| 169 | + my @d = $d->dims(); |
| 170 | + my ($i) = $d->flat->qsorti->list; |
| 171 | + |
| 172 | + # pass the error back through a reference, if given |
| 173 | + if ($qref && ref($qref)) { |
| 174 | + $$qref = sqrt($d->flat->at($i)); |
| 175 | + } |
| 176 | + |
| 177 | + return $self->unflattenindex($i); |
| 178 | +} |
| 179 | + |
| 180 | +sub unflattenindex { |
| 181 | + my ($self, $i) = @_; |
| 182 | + my @result; |
| 183 | + my $volume = $self->{mapvolume}; |
| 184 | + foreach my $dim (reverse @{$self->{mapdims}}) { |
| 185 | + $volume = $volume/$dim; |
| 186 | + my $index = int($i/$volume); |
| 187 | + unshift @result, $index; |
| 188 | + $i -= $index*$volume; |
| 189 | + } |
| 190 | + return @result; |
| 191 | +} |
| 192 | + |
| 193 | +sub hood { |
| 194 | + my ($self, $radius, @coords) = @_; |
| 195 | + $radius = int($radius); |
| 196 | + |
| 197 | + my $slice = ',' x ($self->{ndatadims}-1); |
| 198 | + |
| 199 | + if ($radius == 0) { |
| 200 | + return $self->slice(join ',', $slice, @coords); |
| 201 | + } |
| 202 | + |
| 203 | + for (my $i=0; $i<@coords; $i++) { |
| 204 | + my ($left, $right) = ($coords[$i]-$radius, $coords[$i]+$radius); |
| 205 | + $left = 0 if ($left < 0); |
| 206 | + $right = $self->{mapdims}->[$i] - 1 if ($right >= $self->{mapdims}->[$i]); |
| 207 | + $slice .= ",$left:$right"; |
| 208 | + } |
| 209 | + return $self->slice($slice); |
| 210 | +} |
| 211 | + |
| 212 | +sub save { |
| 213 | + my ($self, $filename) = @_; |
| 214 | + $self->writefraw($filename); |
| 215 | + open(HDR, ">>$filename.hdr") || die "can't append to $filename.hdr"; |
| 216 | + foreach $key (qw(nmapdims ndatadims sdim mapvolume)) { |
| 217 | + print HDR "PDL::Kohonen $key $self->{$key}\n"; |
| 218 | + } |
| 219 | + close(HDR); |
| 220 | +} |
| 221 | + |
| 222 | +sub load { |
| 223 | + my ($self, $filename) = @_; |
| 224 | + $self->{PDL} = readfraw($filename); |
| 225 | + |
| 226 | + open(HDR, "$filename.hdr") || die "can't open $filename.hdr"; |
| 227 | + while (<HDR>) { |
| 228 | + if (/^PDL::Kohonen/) { |
| 229 | + chomp; |
| 230 | + my ($dum, $key, $val) = split ' ', $_, 3; |
| 231 | + $self->{$key} = $val; |
| 232 | + } |
| 233 | + } |
| 234 | + close(HDR); |
| 235 | + |
| 236 | + die "couldn't find header information while loading map\n" |
| 237 | + unless ($self->{nmapdims} && $self->{ndatadims} && $self->{sdim}); |
| 238 | + |
| 239 | + my @dims = $self->dims; |
| 240 | + my @mdims = splice @dims, -$self->{nmapdims}; |
| 241 | + $self->{mapdims} = \@mdims; |
| 242 | + $self->{datadims} = \@dims; |
| 243 | +} |
| 244 | + |
| 245 | + |
| 246 | + |
| 247 | +sub quantiseOLDCODE { |
| 248 | + my ($map, $data, $distfunc) = @_; |
| 249 | + $map = $map->clump(1,2); |
| 250 | + my $mapsize = $map->dim(1); |
| 251 | + my $dupdata = $data->dummy(1, $mapsize); |
| 252 | + my $d = $dupdata - $map; |
| 253 | + $d *= $d; |
| 254 | + $d = $d->sumover(); |
| 255 | + my $i = $d->qsorti->slice("(0),:"); # get the map index of smallest dists |
| 256 | + for (my $dim = 0; $dim<$map->dim(0); $dim++) { |
| 257 | + $data->slice("($dim)") .= $map->slice("($dim)")->index($i); |
| 258 | + } |
| 259 | +} |
| 260 | + |
| 261 | + |
| 262 | +1; |
0 commit comments