diff --git a/src/Python/doc/source/reference.rst b/src/Python/doc/source/reference.rst index dc4e48a..66431b7 100644 --- a/src/Python/doc/source/reference.rst +++ b/src/Python/doc/source/reference.rst @@ -2,7 +2,7 @@ Function Reference ****************** -.. py:class:: Somoclu(n_columns, n_rows, initialcodebook=None, kerneltype=0, maptype='planar', gridtype='rectangular', compactsupport=False, neighborhood='gaussian', std_coeff=0.5, initialization=None) +.. py:class:: Somoclu(n_columns, n_rows, initialcodebook=None, kerneltype=0, maptype='planar', gridtype='rectangular', compactsupport=False, neighborhood='gaussian', vect_distance='euclidean', std_coeff=0.5, initialization=None) Class for training and visualizing a self-organizing map. @@ -40,6 +40,11 @@ Function Reference * "gaussian": Gaussian neighborhood (default) * "bubble": bubble neighborhood function + :param vect_distance: Optional parameter to specify the vector distance: + + * "euclidean": Euclidean distance (default) + * "norm-inf": Infinity-norm distance (max among components) + * "norm-p": p-norm (p-th root of summed differences raised to the power of p), works only if kerneltype is 0 :type neighborhood: str. :param std_coeff: Optional parameter to set the coefficient in the Gaussian neighborhood function exp(-||x-y||^2/(2*(coeff*radius)^2)) diff --git a/src/Python/somoclu/somoclu.i b/src/Python/somoclu/somoclu.i index c1d8848..8ea9c9d 100644 --- a/src/Python/somoclu/somoclu.i +++ b/src/Python/somoclu/somoclu.i @@ -37,4 +37,4 @@ void train(float *data, int data_length, float std_coeff, unsigned int verbose, float* codebook, int codebook_size, int* globalBmus, int globalBmus_size, - float* uMatrix, int uMatrix_size); + float* uMatrix, int uMatrix_size, string vect_distance); diff --git a/src/Python/somoclu/train.py b/src/Python/somoclu/train.py index 4dd8c67..6fa32d8 100644 --- a/src/Python/somoclu/train.py +++ b/src/Python/somoclu/train.py @@ -41,6 +41,13 @@ "when you are trying to import it. Please refer to the " "documentation to see your options.") +def is_pos_real(s): + """ Returns True if s is a positive real. + """ + try: + return (float(s) > 0) + except ValueError: + return False class Somoclu(object): """Class for training and visualizing a self-organizing map. @@ -82,6 +89,12 @@ class Somoclu(object): * "gaussian": Gaussian neighborhood (default) * "bubble": bubble neighborhood function :type neighborhood: str. + :param vect_distance: Optional parameter to specify the vector distance function: + + * "euclidean": Euclidean (default) + * "norm-inf": infinite norm (max absolute distance among components) + * "norm-p": p-th root of sum of absolute differences ^ p (only supported by kerneltype 0) + :type vect_distance: str. :param std_coeff: Optional parameter to set the coefficient in the Gaussian neighborhood function exp(-||x-y||^2/(2*(coeff*radius)^2)) Default: 0.5 @@ -100,7 +113,7 @@ class Somoclu(object): def __init__(self, n_columns, n_rows, initialcodebook=None, kerneltype=0, maptype="planar", gridtype="rectangular", compactsupport=True, neighborhood="gaussian", std_coeff=0.5, - initialization=None, data=None, verbose=0): + initialization=None, data=None, verbose=0, vect_distance="euclidean"): """Constructor for the class. """ self._n_columns, self._n_rows = n_columns, n_rows @@ -109,6 +122,7 @@ def __init__(self, n_columns, n_rows, initialcodebook=None, self._grid_type = gridtype self._compact_support = compactsupport self._neighborhood = neighborhood + self._vect_distance = vect_distance self._std_coeff = std_coeff self._verbose = verbose self._check_parameters() @@ -225,7 +239,7 @@ def train(self, data=None, epochs=10, radius0=0, radiusN=1, self._kernel_type, self._map_type, self._grid_type, self._compact_support, self._neighborhood == "gaussian", self._std_coeff, self._verbose, self.codebook, self.bmus, - self.umatrix) + self.umatrix, self._vect_distance) self.umatrix.shape = (self._n_rows, self._n_columns) self.bmus.shape = (self.n_vectors, 2) self.codebook.shape = (self._n_rows, self._n_columns, self.n_dim) @@ -475,6 +489,13 @@ def _check_parameters(self): if self._neighborhood != "gaussian" and self._neighborhood != "bubble": raise Exception("Invalid parameter for neighborhood: " + self._neighborhood) + if not (self._vect_distance == "euclidean" or self._vect_distance == "norm-inf" + or (self._vect_distance[:5] == "norm-" and is_pos_real(self._vect_distance[5:]))): + raise Exception("Invalid parameter for vect_distance: " + + self._vect_distance) + if (self._vect_distance[:5] == "norm-" and self._kernel_type != 0): + raise Exception("Invalid parameter for vect_distance: " + + self._vect_distance + " when using kernel_type: " + self._kernel_type) if self._kernel_type != 0 and self._kernel_type != 1: raise Exception("Invalid parameter for kernelTye: " + self._kernel_type) diff --git a/src/R/R/Rsomoclu.R b/src/R/R/Rsomoclu.R index 704639b..986b4f3 100644 --- a/src/R/R/Rsomoclu.R +++ b/src/R/R/Rsomoclu.R @@ -10,7 +10,7 @@ Rsomoclu.train <- scaleCooling, kernelType=0, mapType="planar", gridType="rectangular", compactSupport=TRUE, neighborhood="gaussian", stdCoeff=0.5, - codebook=NULL) { + codebook=NULL, vectDistance="euclidean") { if (is.null(codebook)) { codebook <- matrix(data = 0, nrow = nSomX * nSomY, ncol = dim(input_data)[2]) codebook[1, 1] <- 1000 @@ -21,7 +21,7 @@ Rsomoclu.train <- radiusCooling, scale0, scaleN, scaleCooling, kernelType, mapType, gridType, compactSupport, neighborhood, stdCoeff, - codebook) + codebook, vectDistance) res } diff --git a/src/R/src/Rsomoclu.cpp b/src/R/src/Rsomoclu.cpp index 2a6a02a..cd3d85a 100644 --- a/src/R/src/Rsomoclu.cpp +++ b/src/R/src/Rsomoclu.cpp @@ -17,7 +17,8 @@ RcppExport SEXP Rtrain(SEXP data_p, SEXP gridType_p, SEXP compactSupport_p, SEXP neighborhood_p, SEXP stdCoeff_p, - SEXP codebook_p) { + SEXP codebook_p, + SEXP vectDistance_p) { Rcpp::NumericMatrix dataMatrix(data_p); Rcpp::NumericMatrix codebookMatrix(codebook_p); int nVectors = dataMatrix.rows(); @@ -36,6 +37,7 @@ RcppExport SEXP Rtrain(SEXP data_p, bool compactSupport = as(compactSupport_p); string mapType = as(mapType_p); string gridType = as(gridType_p); + string vectDistance = as(vectDistance_p); string neighborhood = as(neighborhood_p); int data_length = nVectors * nDimensions; float* data = new float[data_length]; @@ -63,7 +65,7 @@ RcppExport SEXP Rtrain(SEXP data_p, gridType, compactSupport, neighborhood == "gaussian", std_coeff, 0, codebook, codebook_size, globalBmus, globalBmus_size, - uMatrix, uMatrix_size); + uMatrix, uMatrix_size, vectDistance); Rcpp::NumericMatrix globalBmusMatrix(nVectors, 2); Rcpp::NumericMatrix uMatrixMatrix(nSomX, nSomY); if(codebook != NULL) { diff --git a/src/R/src/Rsomoclu_init.c b/src/R/src/Rsomoclu_init.c index 6fce317..d953879 100644 --- a/src/R/src/Rsomoclu_init.c +++ b/src/R/src/Rsomoclu_init.c @@ -8,10 +8,10 @@ */ /* .Call calls */ -extern SEXP Rtrain(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); +extern SEXP Rtrain(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); static const R_CallMethodDef CallEntries[] = { - {"Rtrain", (DL_FUNC) &Rtrain, 17}, + {"Rtrain", (DL_FUNC) &Rtrain, 18}, {NULL, NULL, 0} }; diff --git a/src/denseCpuKernels.cpp b/src/denseCpuKernels.cpp index 95733a6..86bd21a 100644 --- a/src/denseCpuKernels.cpp +++ b/src/denseCpuKernels.cpp @@ -34,6 +34,26 @@ float EuclideanDistance::operator()(float* vec1, float* vec2) const{ return sqrt(distance); } +float NormPDistance::operator()(float* vec1, float* vec2) const{ + unsigned int nDimensions = Dim(); + double distance = 0.0f; + for (unsigned int d = 0; d < nDimensions; ++d) { + distance += pow(fabs(vec1[d] - vec2[d]), p); + } + return pow(distance, 1.0/p); +} + +float NormInfDistance::operator()(float* vec1, float* vec2) const{ + unsigned int nDimensions = Dim(); + double distance = 0.0f; + for (unsigned int d = 0; d < nDimensions; ++d) { + float tmp = fabs(vec1[d] - vec2[d]); + if (tmp > distance) + distance = tmp; + } + return distance; +} + /** Get node coords for the best matching unit (BMU) * @param coords - BMU coords * @param n - row num in the input feature file diff --git a/src/somoclu.cpp b/src/somoclu.cpp index ff4cf6f..152272f 100644 --- a/src/somoclu.cpp +++ b/src/somoclu.cpp @@ -54,6 +54,7 @@ void printUsage() { "Arguments:\n" \ " -c FILENAME Specify an initial map.codebook for the map.\n" \ " -d NUMBER Coefficient in the Gaussian neighborhood function exp(-||x-y||^2/(2*(coeff*radius)^2)) (default: 0.5)\n" \ + " -D NORM Norm to be used among vectors: euclidean, norm-inf or norm-p with any real p > 0 (default: euclidean)\n" \ " -e NUMBER Maximum number of epochs (default: " << N_EPOCH << ")\n" \ " -g TYPE Grid type: rectangular or hexagonal (default: rectangular)\n"\ " -h, --help This help text\n" \ @@ -92,7 +93,7 @@ void processCommandLine(int argc, char** argv, string *inFilename, unsigned int *kernelType, string *mapType, unsigned int *snapshots, string *gridType, unsigned int *compactSupport, - unsigned int *gaussian, float *std_coeff, + unsigned int *gaussian, string *vect_distance, float *std_coeff, unsigned int *verbose, string *initialCodebookFilename) { @@ -112,6 +113,7 @@ void processCommandLine(int argc, char** argv, string *inFilename, *gridType = "rectangular"; *compactSupport = 1; *gaussian = 1; + *vect_distance = "euclidean"; *std_coeff = 0.5; *verbose = 0; string neighborhood_function = "gaussian"; @@ -124,7 +126,7 @@ void processCommandLine(int argc, char** argv, string *inFilename, int c; extern int optind, optopt; int option_index = 0; - while ((c = getopt_long (argc, argv, "hx:y:d:e:g:k:l:m:n:p:r:s:t:v:c:L:R:T:", + while ((c = getopt_long (argc, argv, "hx:y:d:D:e:g:k:l:m:n:p:r:s:t:v:c:L:R:T:", long_options, &option_index)) != -1) { switch (c) { case 'c': @@ -162,6 +164,13 @@ void processCommandLine(int argc, char** argv, string *inFilename, cli_abort("The argument of option -n should be either bubble or Gaussian."); } break; + case 'D': + *vect_distance = optarg; + float p; + if (!(*vect_distance == "euclidean" || *vect_distance == "norm-inf" || (vect_distance->substr(0, 5) == "norm-" && sscanf(vect_distance->c_str() + 5, "%f", &p) == 1 && p > 0))) { + cli_abort("The argument of option -D should be either euclidean or norm-p (with p being a float > 0) or norm-inf."); + } + break; case 'p': *compactSupport = atoi(optarg); if (*compactSupport != 0 && *compactSupport != 1) { @@ -294,6 +303,7 @@ int main(int argc, char** argv) string gridType; unsigned int compactSupport; unsigned int gaussian; + string vect_distance; float radius0 = 0; float radiusN = 0; string radiusCooling; @@ -312,7 +322,7 @@ int main(int argc, char** argv) &scale0, &scaleN, &scaleCooling, &nSomX, &nSomY, &kernelType, &mapType, &snapshots, - &gridType, &compactSupport, &gaussian, &std_coeff, + &gridType, &compactSupport, &gaussian, &vect_distance, &std_coeff, &verbose, &initialCodebookFilename); #ifndef CUDA @@ -419,14 +429,27 @@ int main(int argc, char** argv) cout << endl; } } - som map = { + Distance *d = 0; + if (vect_distance == "norm-inf") { + d = new NormInfDistance(nDimensions); + } else if (strncmp(vect_distance.c_str(), "norm-", 5) == 0) { + float p; + if (sscanf("%f", vect_distance.c_str() + 5, &p) == 1 && p > 0) { + d = new NormPDistance(nDimensions, p); + } else { + cerr << "norm-p needs a positive p value (falling back to Euclidean)" << endl; + } + } + if (d == 0) + d = new EuclideanDistance(nDimensions); + som map = { nSomX, nSomY, nDimensions, nVectors, mapType, gridType, - *(new EuclideanDistance(nDimensions)), + *d, NULL, new float[nSomY * nSomX * nDimensions], NULL }; diff --git a/src/somoclu.h b/src/somoclu.h index b9548b4..4539db6 100644 --- a/src/somoclu.h +++ b/src/somoclu.h @@ -75,6 +75,22 @@ class EuclideanDistance: public Distance{ virtual float operator()(float* vec1, float* vec2) const; }; +class NormPDistance: public Distance{ +private: + float p; +public: + NormPDistance(unsigned int d, float p):Distance(d), p(p){} + virtual ~NormPDistance(){} + virtual float operator()(float* vec1, float* vec2) const; +}; + +class NormInfDistance: public Distance{ +public: + NormInfDistance(unsigned int d):Distance(d){} + virtual ~NormInfDistance(){} + virtual float operator()(float* vec1, float* vec2) const; +}; + /// Som parameters struct som { unsigned int nSomX; @@ -130,7 +146,7 @@ void train(float *data, int data_length, float std_coeff, unsigned int verbose, float* codebook, int codebook_size, int* globalBmus, int globalBmus_size, - float* uMatrix, int uMatrix_size); + float* uMatrix, int uMatrix_size, string vect_distance = "euclidean"); void train(int itask, float *data, svm_node **sparseData, som map, unsigned int nVectorsPerRank, unsigned int nEpoch, float radius0, float radiusN, diff --git a/src/training.cpp b/src/training.cpp index 09b98bf..b868c7c 100644 --- a/src/training.cpp +++ b/src/training.cpp @@ -20,6 +20,7 @@ #include #include +#include #ifndef HAVE_MPI #include #endif @@ -89,7 +90,7 @@ void train(float *data, int data_length, unsigned int nEpoch, float std_coeff, unsigned int verbose, float *codebook, int codebook_size, int *globalBmus, int globalBmus_size, - float *uMatrix, int uMatrix_size) { + float *uMatrix, int uMatrix_size, string vect_distance) { #ifdef HAVE_R #ifndef CUDA if(kernelType == DENSE_GPU){ @@ -98,6 +99,16 @@ void train(float *data, int data_length, unsigned int nEpoch, } #endif // CUDA #endif // HAVE_R + Distance *d = 0; + float p; + if (vect_distance == "norm-inf") + d = new NormInfDistance(nDimensions); + else if (sscanf(vect_distance.c_str(), "norm-%f", &p) == 1 && p > 0) + d = new NormPDistance(nDimensions, p); + else if (vect_distance != "euclidean") + fprintf(stderr, "Warning: incorrect vect_distance: %s (falling back to euclidean)\n", vect_distance.c_str()); + if (d == 0) + d = new EuclideanDistance(nDimensions); som map = { nSomX, nSomY, @@ -105,7 +116,7 @@ void train(float *data, int data_length, unsigned int nEpoch, nVectors, mapType, gridType, - *(new EuclideanDistance(nDimensions)), + *d, uMatrix, codebook, globalBmus};