Ignore:
Timestamp:
12/10/20 22:33:22 (3 years ago)
Author:
oriona
Message:

Similarity measures code refactored. Distribution-based similarity measure added.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • cpp/frams/_demos/simil_test.cpp

    r1005 r1044  
    55
    66#include <vector>
     7#include <string>
    78#include "common/loggers/loggertostdout.h"
    89#include "frams/_demos/genotypeloader.h"
    910#include "frams/genetics/preconfigured.h"
    1011#include "common/virtfile/stdiofile.h"
    11 #include "frams/model/similarity/simil_model.h"
    12 
    13 
     12#include "frams/model/similarity/measure-distribution.h"
     13#include "frams/model/similarity/measure-greedy.h"
     14#include "frams/model/similarity/measure-hungarian.h"
     15
     16using namespace std;
     17
     18int add_double_param(std::vector<string> *args, int pos, std::vector<double> *params, std::vector<string> *params_names)
     19{
     20    for (unsigned int i = 0; i < params_names->size(); i++)
     21    {
     22        try
     23        {
     24            params->push_back(std::stod(args->at(pos)));
     25            pos++;
     26        }
     27        catch (const std::invalid_argument&)
     28        {
     29            printf("%s should be a number\n", params_names->at(i).c_str());
     30            return -1;
     31        }
     32        catch (const std::out_of_range&)
     33        {
     34            printf("%s should be inside double range\n", params_names->at(i).c_str());
     35            return -1;
     36        }
     37    }
     38    return 0;
     39}
    1440
    1541/** Computes a matrix of distances between all genotypes in the specified
     
    1844int main(int argc, char *argv[])
    1945{
    20         LoggerToStdout messages_to_stdout(LoggerBase::Enable);
    21         int iCurrParam = 0; // index of the currently processed parameter
    22         char *szCurrParam = NULL;
    23         ModelSimil M; // similarity computing object
    24         bool bPrintNames = false; // specifies if names of genotypes are to be printed
    25         int nResult = 0; // a temporary result
    26 
    27         if (argc < 8)
    28         {
    29                 printf("Too few parameters!\n");
    30                 printf("Command line: [-names] <genotypesFile> <measure> <w_dP> <w_dDEG> <w_dNEU> <w_dGEO> <fixZaxis?>\n\n");
    31 
    32                 printf("Parameters:\n");
    33                 printf("  <genotypesFile> name of a file with genotypes\n");
    34                 printf("  <measure> similarity measure\n");
    35                 printf("  <w_dP> weight of the difference in the number of parts\n");
    36                 printf("  <w_dDEG> weight of the difference in degrees of matched parts\n");
    37                 printf("  <w_dNEU> weight of the difference in neurons of matched parts\n");
    38                 printf("  <w_dGEO> weight of the distance of matched parts\n");
    39                 printf("  <fixZaxis?> should the 'z' (vertical) coordinate be fixed during the alignment? (0 or 1)\n\n");
    40 
    41                 printf("Switches:\n");
    42                 printf("  -names specifies that the number and names of genotypes are to be printed to output\n");
    43                 printf("  before the distance matrix; by default the number and names are not printed\n\n");
    44 
    45                 printf("Outputs a symmetric distance matrix in the format:\n");
    46                 printf("  <row_1> (columns in a row are separated by TABs)\n");
    47                 printf("  ...\n");
    48                 printf("  <row_n>\n");
    49 
    50                 return -1;
    51         }
    52 
    53         // prepare output parameters from .gen file
    54         vector<Geno *> pvGenos;
    55         vector<char *> pvNames;
    56 
    57         // check if there is a switch
    58         iCurrParam = 1;
    59         szCurrParam = argv[iCurrParam];
    60         if (strcmp(szCurrParam, "-names") == 0)
    61         {
    62                 // switch "-names" was given; print names also
    63                 bPrintNames = true;
    64                 // pass to the next parameter
    65                 iCurrParam++;
    66         }
    67 
    68         // check the parameters
    69         // get <genotypesFile> name from command line
    70         char *szFileName = argv[iCurrParam];
    71 
    72         // initially set measure components' weights to invalid values (negative)
    73         for (int i = 0; i < M.GetNOFactors(); i++)
    74         {
    75                 M.m_adFactors[i] = -1.0;
    76         }
    77 
    78         iCurrParam++;
    79         szCurrParam = argv[iCurrParam];
    80         int measure_type = -1;
    81         nResult = sscanf(szCurrParam, "%d", &measure_type);
    82         if (nResult != 1)
    83         {
    84                 printf("Measure type should be a number!\n");
    85                 return -1;
    86         }
    87 
    88         if (measure_type != 0 && measure_type != 1)
    89         {
    90                 printf("Measure type should be 0 (flexible criteria order and optimal matching) or 1 (vertex degree order and greedy matching)!\n");
    91                 return -1;
    92         }
    93 
    94         M.matching_method = measure_type;
    95 
    96         const char *params[] = { "<w_dP>", "<w_dDEG>", "<w_dNEU>", "<w_dGEO>" };
    97         for (int i = 0; i < M.GetNOFactors(); i++)
    98         {
    99                 iCurrParam++;
    100                 szCurrParam = argv[iCurrParam];
    101                 nResult = sscanf(szCurrParam, "%lf", &M.m_adFactors[i]);
    102                 if (nResult != 1)
    103                 {
    104                         // <w_dX> is not a number -- error
    105                         printf("%s", params[i]);
    106                         printf(" should be a number\n");
    107                         return -1;
    108                 }
    109                 else
    110                 {
    111                         // <w_dX> is a number; check if nonnegative
    112                         if (M.m_adFactors[i] < 0.0)
    113                         {
    114                                 printf("%s", params[i]);
    115                                 printf(" should be a nonnegative number\n");
    116                                 return -1;
    117                         }
    118                 }
    119         }
    120 
    121         iCurrParam++;
    122         szCurrParam = argv[iCurrParam];
    123         nResult = sscanf(szCurrParam, "%d", &M.fixedZaxis);
    124         if (nResult != 1)
    125         {
    126                 // <isZFixed> is not a number -- error
    127                 printf("<isZFixed> should be a number\n");
    128                 return -1;
    129         }
    130         else if (M.fixedZaxis != 0 && M.fixedZaxis != 1)
    131         {
    132                 printf("<isZFixed>=%d. <isZFixed> should be equal to 0 or 1\n", M.fixedZaxis);
    133                 return -1;
    134         }
    135 
    136         // read the input file
    137         // prepare loading of genotypes from a .gen file
    138         // create some basic genotype converters
    139         PreconfiguredGenetics genetics;
    140         StdioFileSystem_autoselect stdiofilesys;
    141 
    142         long count = 0, totalsize = 0;
    143         GenotypeMiniLoader loader(szFileName);
    144         GenotypeMini *loaded;
    145         while (loaded = loader.loadNextGenotype())
    146         {
    147                 // while a valid genotype was loaded
    148                 count++;
    149                 totalsize += loaded->genotype.length();
    150                 // create a Geno object based on the MiniGenotype
    151                 Geno *pNextGenotype = new Geno(loaded->genotype);
    152                 if ((pNextGenotype != NULL) && (pNextGenotype->isValid()))
    153                 {
    154                         pvGenos.push_back(pNextGenotype);
    155                         char *szNewName = new char[loaded->name.length() + 1];
    156                         strcpy(szNewName, loaded->name.c_str());
    157                         pvNames.push_back(szNewName);
    158                 }
    159                 else
    160                 {
    161                         printf("Genotype %2li is not valid\n", count);
    162                         if (pNextGenotype != NULL) delete pNextGenotype;
    163                 }
    164         }
    165         if (loader.getStatus() == GenotypeMiniLoader::OnError)
    166         {
    167                 printf("Error: %s", loader.getError().c_str());
    168         }
    169 
    170         double dSimilarity = 0.0;
    171         double **aaSimil = NULL; // array of similarities
    172 
    173         // create an empty array of similarities
    174         aaSimil = new double*[pvGenos.size()];
    175         for (unsigned int k = 0; k < pvGenos.size(); k++)
    176         {
    177                 aaSimil[k] = new double[pvGenos.size()];
    178                 for (unsigned int l = 0; l < pvGenos.size(); l++)
    179                         aaSimil[k][l] = 0.0;
    180         }
    181 
    182         // compute and store similarities
    183         for (unsigned int i = 0; i < pvGenos.size(); i++)
    184         {
    185                 for (unsigned int j = 0; j < pvGenos.size(); j++)
    186                 {
    187                         dSimilarity = M.EvaluateDistance(pvGenos.operator[](i), pvGenos.operator[](j));
    188                         aaSimil[i][j] = dSimilarity;
    189                 }
    190         }
    191 
    192         if (bPrintNames)
    193         {
    194                 // if the "-names" switch was given, print the number of genotypes and their names
    195                 printf("%li\n", pvGenos.size());
    196                 for (unsigned int iGen = 0; iGen < pvNames.size(); iGen++)
    197                 {
    198                         printf("%s\n", pvNames.at(iGen));
    199                 }
    200         }
    201 
    202         // print out the matrix of similarities
    203         for (unsigned int i = 0; i < pvGenos.size(); i++)
    204         {
    205                 for (unsigned int j = 0; j < pvGenos.size(); j++)
    206                 {
    207                         printf("%.2lf\t", aaSimil[i][j]);
    208                 }
    209                 printf("\n");
    210         }
    211 
    212         // delete vectors and arrays
    213         for (unsigned int i = 0; i < pvGenos.size(); i++)
    214         {
    215                 delete pvGenos.operator[](i);
    216                 delete[] pvNames.operator[](i);
    217                 delete[] aaSimil[i];
    218         }
    219 
    220         delete[] aaSimil;
    221 
    222         return 0;
     46    typedef double *pDouble;
     47    LoggerToStdout messages_to_stdout(LoggerBase::Enable);
     48    SimilMeasure *simil_measure = nullptr;
     49    if (argc < 5)
     50    {
     51        printf("Too few parameters!\n");
     52        printf("Command line: [-names] <genotypesFile> <measure (greedy/hungarian)> <w_dP> <w_dDEG> <w_dNEU> <w_dGEO> <fixZaxis?>\n\n");
     53        printf("Command line: [-names] <genotypesFile> <measure (distribution)> <desc> <simil> <dens> <bins> <samp_num>\n\n");
     54        printf("Parameters:\n");
     55        printf("  <genotypesFile> name of a file with genotypes\n");
     56        printf("  <measure> similarity measure name (greedy/hungarian/distribution)\n");
     57        printf("\n");
     58        printf("Parameters of greedy and hungarian measures:\n");
     59        printf("  <w_dP> weight of the difference in the number of parts\n");
     60        printf("  <w_dDEG> weight of the difference in degrees of matched parts\n");
     61        printf("  <w_dNEU> weight of the difference in neurons of matched parts\n");
     62        printf("  <w_dGEO> weight of the distance of matched parts\n");
     63        printf("  <fixZaxis?> should the 'z' (vertical) coordinate be fixed during the alignment? (0 or 1)\n\n");
     64        printf("Parameters of distribution measure:\n");
     65        printf("  <dens> sampling density\n");
     66        printf("  <bins> number of histogram bins\n");
     67        printf("  <samp_num> number of samples taken\n\n");
     68
     69        printf("Switches:\n");
     70        printf("  -names specifies that the number and names of genotypes are to be printed to output\n");
     71        printf("  before the distance matrix; by default the number and names are not printed\n\n");
     72
     73        printf("Outputs a symmetric distance matrix in the format:\n");
     74        printf("  <row_1> (columns in a row are separated by TABs)\n");
     75        printf("  ...\n");
     76        printf("  <row_n>\n");
     77
     78        return -1;
     79    }
     80
     81    std::vector<string> args;
     82    for (int i = 1; i < argc; i++)
     83        args.push_back(std::string(argv[i]));
     84
     85    bool print_names = false;
     86
     87    int pos = 1;
     88    if (args.at(0).compare("-names")==0)
     89    {
     90        print_names = true;
     91        pos = 2;
     92    }
     93
     94    string measure_name = args.at(pos);
     95    pos++;
     96    std::vector<double> params;
     97
     98    if (measure_name.compare("greedy")==0 || measure_name.compare("hungarian")==0)
     99    {
     100        std::vector<string> params_names{ "<w_dP>", "<w_dDEG>", "<w_dNEU>", "<w_dGEO>",  "<fixZaxis?>" };
     101
     102        if (add_double_param(&args, pos, &params, &params_names) == -1)
     103            return -1;
     104
     105        if (measure_name.compare("greedy")==0)
     106            simil_measure = new SimilMeasureGreedy();
     107        else
     108            simil_measure = new SimilMeasureHungarian();
     109    }
     110
     111
     112    else if (measure_name.compare("distribution")==0)
     113    {
     114        std::vector<string> params_names{ "<dens>", "<bins>", "<samp_num>" };
     115
     116        if (add_double_param(&args, pos, &params, &params_names)==-1)
     117            return -1;
     118
     119        simil_measure = new SimilMeasureDistribution();
     120    }
     121
     122    else
     123    {
     124        printf("Measure type should be greedy (flexible criteria order and optimal matching), hungarian (vertex degree order and greedy matching) or distribution!\n");
     125        return -1;
     126    }
     127
     128    simil_measure->setParams(params);
     129
     130    // read the input file
     131    // prepare loading of genotypes from a .gen file
     132    // create some basic genotype converters
     133    PreconfiguredGenetics genetics;
     134    StdioFileSystem_autoselect stdiofilesys;
     135
     136    // prepare output parameters from .gen file
     137    vector<Geno *> pvGenos;
     138    vector<char *> pvNames;
     139
     140    long count = 0, totalsize = 0;
     141    GenotypeMiniLoader loader(args.at(0).c_str());
     142    GenotypeMini *loaded;
     143    while (loaded = loader.loadNextGenotype())
     144    {
     145        // while a valid genotype was loaded
     146        count++;
     147        totalsize += loaded->genotype.length();
     148        // create a Geno object based on the MiniGenotype
     149        Geno *pNextGenotype = new Geno(loaded->genotype);
     150        if ((pNextGenotype != NULL) && (pNextGenotype->isValid()))
     151        {
     152            pvGenos.push_back(pNextGenotype);
     153            char *szNewName = new char[loaded->name.length() + 1];
     154            strcpy(szNewName, loaded->name.c_str());
     155            pvNames.push_back(szNewName);
     156        }
     157        else
     158        {
     159            printf("Genotype %2li is not valid\n", count);
     160            if (pNextGenotype != NULL) delete pNextGenotype;
     161        }
     162    }
     163    if (loader.getStatus() == GenotypeMiniLoader::OnError)
     164    {
     165        printf("Error: %s", loader.getError().c_str());
     166    }
     167
     168    double dSimilarity = 0.0;
     169    double **aaSimil = NULL; // array of similarities
     170
     171    // create the empty array of similarities
     172    aaSimil = new pDouble[pvGenos.size()];
     173    for (unsigned int k = 0; k < pvGenos.size(); k++)
     174    {
     175        aaSimil[k] = new double[pvGenos.size()];
     176        for (unsigned int l = 0; l < pvGenos.size(); l++)
     177            aaSimil[k][l] = 0.0;
     178    }
     179
     180
     181
     182    // compute and remember similarities
     183    for (unsigned int i = 0; i < pvGenos.size(); i++)
     184    {
     185        for (unsigned int j = 0; j < pvGenos.size(); j++)
     186        {
     187            dSimilarity = simil_measure->evaluateDistance(pvGenos.operator[](i), pvGenos.operator[](j));
     188            aaSimil[i][j] = dSimilarity;
     189        }
     190    }
     191
     192    if (print_names)
     193    {
     194        // if "-names" switch was given, print the number of genotypes and their names
     195        printf("%li\n", pvGenos.size());
     196        for (unsigned int iGen = 0; iGen < pvNames.size(); iGen++)
     197        {
     198            printf("%s\n", pvNames.at(iGen));
     199        }
     200    }
     201
     202    // print out the matrix of similarities
     203    for (unsigned int i = 0; i < pvGenos.size(); i++)
     204    {
     205        for (unsigned int j = 0; j < pvGenos.size(); j++)
     206        {
     207            printf("%.2lf\t", aaSimil[i][j]);
     208        }
     209        printf("\n");
     210    }
     211
     212    // delete vectors and arrays
     213    for (unsigned int i = 0; i < pvGenos.size(); i++)
     214    {
     215        delete pvGenos.operator[](i);
     216        delete[] pvNames.operator[](i);
     217        delete[] aaSimil[i];
     218    }
     219
     220    delete[] aaSimil;
     221    delete simil_measure;
     222
     223    return 0;
    223224}
Note: See TracChangeset for help on using the changeset viewer.