Changeset 1230 for cpp/frams/genetics


Ignore:
Timestamp:
05/01/23 02:14:27 (19 months ago)
Author:
Maciej Komosinski
Message:

Got rid of the (buggy) look-ahead function, made parsing stricter and simpler

Location:
cpp/frams/genetics/f4
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • cpp/frams/genetics/f4/f4_general.cpp

    r1229 r1230  
    3131{
    3232        *v += 0.7853;  // 0.7853981  45 degrees
    33 }
    34 
    35 int scanRecur(const char* s, int slen, char stopchar)
    36 {
    37         int i = 0;
    38         //DB( printf("    scan('%s', '%c')\n", s, stopchar); )
    39         while (1)
    40         {
    41                 if (i >= slen)  // ran out the string, should never happen with a correct string
    42                         return 1; //TODO MacKo 2023-04: interesting: why was this situation made undistinguishable from s[1]==stopchar ? does this have any bad consequences or is "1" just used to tell "advance as little as possible"? Anyway, this function can be eliminated when parsing is simplified.
    43                 if (stopchar == s[i])  // bumped into stopchar
    44                         return int(i);
    45                 if (i < slen - 1) // s[i] is not the last char
    46                 {
    47                         if (s[i] == '(') //not an allowed char in f4, perhaps a remnant of old experiments with code
    48                         {
    49                                 i += 2 + scanRecur(s + i + 1, slen - i - 1, ')');
    50                                 continue;
    51                         }
    52                         if (s[i] == '<')
    53                         {
    54                                 i += 2 + scanRecur(s + i + 1, slen - i - 1, '>');
    55                                 continue;
    56                         }
    57                         if (s[i] == '#')
    58                         {
    59                                 i += 2 + scanRecur(s + i + 1, slen - i - 1, '>');
    60                                 continue;
    61                         }
    62                 }
    63                 // s[i] is a non-special character
    64                 i++;
    65         }
    66         return i;
    6733}
    6834
     
    703669        // transform geno from string to nodes
    704670        f4rootnode = new f4_Node();
    705         int res = f4_processRecur(genome.c_str(), 0, f4rootnode);
     671        int _ = 0;
     672        int res = f4_processRecur(genome.c_str(), _, f4rootnode);
    706673        if (res || (f4rootnode->childCount() != 1))
    707674        {
     
    13291296// scan genotype string and build tree
    13301297// return 0 on success, >0 for error (errorpos)
    1331 int f4_processRecur(const char* genot, unsigned int pos0, f4_Node *parent)
    1332 {
    1333         unsigned int gpos = pos0; //MacKo 2023-04 (TODO): these two variables are often updated before return which has no effect since they are local. Seems like a half step towards making them (or just gpos) in/out parameter which would solve many issues and simplify parsing (getting rid of scanRecur()) while making it more strict.
     1298int f4_processRecur(const char* genot, int &pos_inout, f4_Node *parent)
     1299{
    13341300        f4_Node *par = parent;
    13351301
    1336         if (gpos >= strlen(genot))
     1302        if (pos_inout >= (int)strlen(genot))
    13371303                return (int)strlen(genot) + 1;
    13381304
    1339         while (gpos < strlen(genot))
    1340         {
    1341                 // first switch across cell dividers and old semantics
    1342                 switch (genot[gpos])
     1305        while (pos_inout < (int)strlen(genot))
     1306        {
     1307                //#define PRINT_PARSING_LOCATION
     1308#ifdef PRINT_PARSING_LOCATION
     1309                printf("%s\n", genot);
     1310                for (int i = 0; i < pos_inout; i++) printf(" ");
     1311                printf("^\n");
     1312#endif
     1313                switch (genot[pos_inout])
    13431314                {
    13441315                case '<':
    13451316                {
    1346                         // find out genotype start for child
    1347                         int stopchar_offset = scanRecur(genot + gpos + 1, (int)strlen(genot + gpos + 1), '>');
    1348 
    1349                         f4_Node *node = new f4_Node("<", par, gpos);
     1317                        f4_Node *node = new f4_Node("<", par, pos_inout);
    13501318                        par = node;
    1351                         int res = f4_processRecur(genot, gpos + 1, par);
     1319                        pos_inout++; //move after '<'
     1320                        int res = f4_processRecur(genot, pos_inout, par);
    13521321                        if (res) return res;
    1353                         if (gpos + stopchar_offset + 2 < strlen(genot))
    1354                         {
    1355                                 res = f4_processRecur(genot, gpos + stopchar_offset + 2, par);
     1322                        if (pos_inout < (int)strlen(genot))
     1323                        {
     1324                                res = f4_processRecur(genot, pos_inout, par);
    13561325                                if (res) return res;
    13571326                        }
     
    13591328                        {
    13601329                                //MacKo 2023-04, more strict behavior: instead of silent repair (no visible effect to the user, genotype stays invalid but is interpreted and reported as valid), we now point out where the error is. For example <X> or <X><X or <X><N:N>
    1361                                 return gpos + 1; //the problem starts here, occurs because second child (branch) <1..>2..> is not completed
     1330                                return (int)strlen(genot) + 1;
    13621331                                //old silent repair:
    13631332                                //node = new f4_Node(">", par, int(strlen(genot)) - 1);
    1364                                 //par = node;
    1365                         }
    1366                         gpos++;
     1333                        }
    13671334                        return 0;  // OK
    13681335                }
    13691336                case '>':
    13701337                {
    1371                         f4_Node *node = new f4_Node(">", par, gpos);
    1372                         par = node;
    1373                         //gpos = (unsigned int)strlen(genot); //MacKo 2023-04: first of all, 'gpos' is a local variable so no effect; second, '>' may be internal (i.e., not the last one in the genotype), so it is a bad hint to assign strlen(). 'par' above is also local...
     1338                        new f4_Node(">", par, pos_inout);
     1339                        pos_inout++; //move after '>'
    13741340                        return 0;  // OK
    13751341                }
     
    13781344                        // repetition marker, 1 by default
    13791345                        ExtValue val;
    1380                         const char* end = val.parseNumber(genot + gpos + 1, ExtPType::TInt);
     1346                        const char* end = val.parseNumber(genot + pos_inout + 1, ExtPType::TInt);
     1347                        //TODO end==NULL? -> error!
    13811348                        int reps = (end == NULL) ? 1 : val.getInt();
    1382                         // find out genotype start for continuation
    1383                         int stopchar_offset = scanRecur(genot + gpos + 1, (int)strlen(genot + gpos + 1), '>');
     1349                        f4_Node *node = new f4_Node("#", par, pos_inout);
     1350                        node->reps = reps;
    13841351                        // skip number
    1385                         unsigned int oldpos = gpos;
    1386                         gpos += end - (genot + gpos);
    1387                         //gpos++;
    1388                         //while ((genot[gpos] >= '0') && (genot[gpos] <= '9')) gpos++; node1 = new f4_Node("#", par, oldpos);
    1389                         f4_Node *node = new f4_Node("#", par, oldpos);
    1390                         node->reps = reps;
    1391                         par = node;
    1392                         int res = f4_processRecur(genot, gpos, node);
     1352                        pos_inout += end - (genot + pos_inout);
     1353                        int res = f4_processRecur(genot, pos_inout, node);
    13931354                        if (res) return res;
    1394                         if (oldpos + stopchar_offset + 2 < strlen(genot))
    1395                         {
    1396                                 res = f4_processRecur(genot, oldpos + stopchar_offset + 2, node);
     1355                        if (pos_inout < (int)strlen(genot))
     1356                        {
     1357                                res = f4_processRecur(genot, pos_inout, node);
    13971358                                if (res) return res;
    13981359                        }
    13991360                        else // ran out
    14001361                        {
    1401                                 return gpos; //MacKo 2023-04: report an error, better to be more strict instead of a silent repair (genotype stays invalid but is interpreted and reported as valid) with non-obvious consequences?
     1362                                return (int)strlen(genot) + 1; //MacKo 2023-04: report an error, better to be more strict instead of a silent repair (genotype stays invalid but is interpreted and reported as valid) with non-obvious consequences?
    14021363                                //earlier approach - silently treating this problem (we don't ever see where the error is because it gets corrected in some way here, while parsing the genotype, and error location in the genotype is never reported):
    14031364                                //node = new f4_Node(">", par, int(strlen(genot)) - 1); // check if needed and if this is really the best repair operation; seemed to happen too many times in succession for some genotypes even though they were only a result of f4 operators, not manually created... and the operators should not generate invalid genotypes, right? Or maybe crossover does? Seems like too many #N's for closing >'s; removing #N or adding > helped. Operators somehow don't do it properly sometimes? But F4_ADD_REP adds '>'... (TODO)
     
    14111372                {
    14121373                        // whitespace: ignore
    1413                         gpos++;
     1374                        pos_inout++;
    14141375                        break;
    14151376                }
    14161377                case 'N':
    14171378                {
    1418                         int forgenorange = gpos;
    1419                         if (genot[gpos + 1] != ':')
    1420                                 return gpos + 1; //error
    1421                         gpos += 2; //skipping "N:"
    1422                         unsigned int begin_index = gpos;
    1423                         char* end = (char*)genot + begin_index;
    1424                         NeuroClass *neuclass = GenoOperators::parseNeuroClass(end, ModelEnum::SHAPETYPE_BALL_AND_STICK);
     1379                        int forgenorange = pos_inout;
     1380                        if (genot[pos_inout + 1] != ':')
     1381                                return pos_inout + 1; //error
     1382                        pos_inout += 2; //skipping "N:"
     1383                        unsigned int neuroclass_begin = pos_inout;
     1384                        char* neuroclass_end = (char*)genot + neuroclass_begin;
     1385                        NeuroClass *neuclass = GenoOperators::parseNeuroClass(neuroclass_end, ModelEnum::SHAPETYPE_BALL_AND_STICK); //advances neuroclass_end
    14251386                        if (neuclass == NULL)
    1426                                 return gpos + 1; //error
    1427                         gpos += end - genot - begin_index;
    1428                         string neutype = string(genot + begin_index, genot + gpos);
     1387                                return pos_inout + 1; //error
     1388                        pos_inout += neuroclass_end - genot - neuroclass_begin;
     1389                        string neutype = string(genot + neuroclass_begin, genot + pos_inout);
    14291390                        f4_Node *node = new f4_Node(neutype, par, forgenorange);
    14301391                        node->neuclass = neuclass;
     
    14381399                        // in the future this could be generalized to all neuron properties, for example N:|:power:0.6:range:1.4, or can even use '=' or ',' instead of ':' if no ambiguity
    14391400                        char prop_dir, prop_symbol, prop_end[2]; // prop_end is only to ensure that neuron parameter definition is completed
    1440                         if (sscanf(genot + gpos, ":%c%c%1[:]", &prop_dir, &prop_symbol, &prop_end) != 3)
     1401                        if (sscanf(genot + pos_inout, ":%c%c%1[:]", &prop_dir, &prop_symbol, &prop_end) != 3)
    14411402                                // error: incorrect format
    1442                                 return gpos + 1 + 1;
     1403                                return pos_inout + 1 + 1;
    14431404                        if (prop_dir != '-' && prop_dir != '+')
    1444                                 return gpos + 1 + 1; //error
     1405                                return pos_inout + 1 + 1; //error
    14451406                        switch (prop_symbol)
    14461407                        {
    14471408                        case '!':  case '=':  case '/':  break;
    14481409                        default:
    1449                                 return gpos + 1 + 1; //error
    1450                         }
    1451                         f4_Node *node = new f4_Node(":", par, gpos);
     1410                                return pos_inout + 1 + 1; //error
     1411                        }
     1412                        f4_Node *node = new f4_Node(":", par, pos_inout);
    14521413                        node->prop_symbol = prop_symbol;
    14531414                        node->prop_increase = prop_dir == '+' ? true : false; // + or -
    14541415                        par = node;
    1455                         int stopchar_offset = scanRecur(genot + gpos + 1, (int)strlen(genot + gpos + 1), ':');
    1456                         gpos += stopchar_offset + 2;
     1416                        pos_inout += 4; //skipping :ds:
    14571417                        break;
    14581418                }
     
    14611421                        double weight = 0;
    14621422                        int relfrom;
    1463                         const char *end = parseConnection(genot + gpos, relfrom, weight);
     1423                        const char *end = parseConnection(genot + pos_inout, relfrom, weight);
    14641424                        if (end == NULL)
    1465                                 return gpos + 1; //error
    1466 
    1467                         f4_Node *node = new f4_Node("[", par, gpos);
     1425                                return pos_inout + 1; //error
     1426
     1427                        f4_Node *node = new f4_Node("[", par, pos_inout);
    14681428                        node->conn_from = relfrom;
    14691429                        node->conn_weight = weight;
    14701430                        par = node;
    1471                         int stopchar_offset = scanRecur(genot + gpos + 1, (int)strlen(genot + gpos + 1), ']');
    1472                         gpos += stopchar_offset + 2;
     1431                        pos_inout += end - (genot + pos_inout);
    14731432                        break;
    14741433                }
    14751434                default: // 'X' and ',' and all modifiers and also invalid symbols - add a node, for invalid symbols build will give the error or repair
    14761435                {
    1477                         //printf("any regular character '%c'\n", genot[gpos]);
    1478                         f4_Node *node = new f4_Node(genot[gpos], par, gpos);
     1436                        //printf("any regular character '%c'\n", genot[pos_inout]);
     1437                        //TODO here: read a continuous sequence of modifiers, sort and optimize ("collapse") it like in f1, then add to tree
     1438                        f4_Node *node = new f4_Node(genot[pos_inout], par, pos_inout);
    14791439                        par = node;
    1480                         gpos++;
     1440                        pos_inout++;
    14811441                        break;
    14821442                }
     
    14871447        if (par && par->name != ">")
    14881448        {
    1489                 //happens when gpos == strlen(genot)
    1490                 //return gpos; //MacKo 2023-04: could report an error instead of silent repair, but repair operators only work in Cells (i.e., after the f4_Node tree has been parsed without errors and Cells can start developing) so we don't want to make a fatal error because of missing '>' here. Also after conversions from Cells to text, trailing '>' is deliberately removed... and also the simplest genotype is officially X, not X>.
    1491                 f4_Node *node = new f4_Node('>', par, int(strlen(genot)) - 1);
    1492                 par = node;
    1493         }
    1494 
    1495         return 0;
     1449                //happens when pos_inout == strlen(genot)
     1450                //return pos_inout; //MacKo 2023-04: could report an error instead of silent repair, but repair operators only work in Cells (i.e., after the f4_Node tree has been parsed without errors and Cells can start developing) so we don't want to make a fatal error because of missing '>' here. Also after conversions from Cells to text, trailing '>' is deliberately removed... and also the simplest genotype is officially X, not X>.
     1451                new f4_Node('>', par, int(strlen(genot)) - 1);
     1452        }
     1453
     1454        return 0;  // OK
    14961455}
    14971456
  • cpp/frams/genetics/f4/f4_general.h

    r1229 r1230  
    4040#define CELL_NEURON 42 ///<differentiated to neuron, can divide
    4141//@}
    42 
    43 /**
    44  * TODO MacKo 2023-04: not sure if this function is needed and if f4_processRecur() would not suffice
    45  * if it advanced the string pointer (in/out parameter) while processing. Its returned value is always used after
    46  * f4_processRecur() anyway, and in two cases likely incorrectly (for [...] to detect closing ']'
    47  * and for :...: to detect closing ':') - we don't need recursion in these cases, a simple linear
    48  * scan would suffice, but even this would not be needed - since we are parsing the actual characters in these cases,
    49  * we do scanning anyway. So looks like this function doubles the work already done more thoroughly by f4_processRecur().
    50  *
    51  * Scans f4 genotype string for a stopping character and returns the position of
    52  * this stopping character or 1 if the end of string was reached. This method is used
    53  * for closing braces, like ), >, ]. It runs recursively when opening braces
    54  * like (, <, # are found.
    55  * @param s string with the f4 genotype
    56  * @param slen length of a given string
    57  * @param stopchar character to be found
    58  * @return 1 if end of string was reached, or position of found character in sequence
    59  */
    60 int scanRecur(const char* s, int slen, char stopchar);
    6142
    6243
     
    501482 * @return 0 if processing was successful, otherwise returns the position of an error in the genotype
    502483 */
    503 int f4_processRecur(const char *genot, unsigned int pos0, f4_Node *parent);
     484int f4_processRecur(const char *genot, int &pos_inout, f4_Node *parent);
    504485
    505486/**
  • cpp/frams/genetics/f4/f4_oper.cpp

    r1229 r1230  
    1616//
    1717// TODO the behavior of neuron input indexes during mutation seems badly implemented (see also TREAT_BAD_CONNECTIONS_AS_INVALID_GENO). Are they kept properly maintained when nodes are added and removed? This could be done well because during mutation we operate on the tree structure with cross-references between nodes (so they should not be affected by local changes in the tree), and then convert the tree back to string. Yet, the f4_Node.conn_from is an integer and these fields in nodes do not seem to be maintained on tree node adding/removal... change these integer offsets to references to node objects? But actually, do the offsets that constitute relative connection references concern the f4_Node tree structure (and all these sophisticated calculations of offsets during mutation are useful) or rather they concern the f4_Cells development? verify all situations in f4_Cell::oneStep(), case '['.
    18 // TODO add simplifying sequences of modifiers (so capital and small letter cancel out, like in f1) - but seems like each single modifier is a separate f4_Node? and perhaps we don't want to use the repair mechanism for this... maybe mutations, when they add/modify/remove a modifier node, should be "cleaning" the tree by removing nodes when they encounter contradictory modifiers on the same subpath, and also limit the number of modifiers of each type just like in f1? To avoid squences like ...<X>llmlIilImmimiimmimifmfl<fifmmimilimmmiimiliffmfliIfififlliflimfliffififmiffmfliflifmIlimimiflimfiffmllliflmimifllifliliflifmIlimimiflimfiffmllliflmimifllfmIlimimiflimfiffmllliflmimiflliflimimmiflimfliffmiflifmfiffllIlififliffififmiffmfliflifIliflimimflimflfflimimifllfflifllfflimlififfiiffifIr<r<...
     18// TODO add simplifying sequences of modifiers (so capital and small letter cancel out, like in f1) - but seems like each single modifier is a separate f4_Node? and perhaps we don't want to use the repair mechanism for this... maybe mutations, when they add/modify/remove a modifier node, should be "cleaning" the tree by removing nodes when they encounter contradictory modifiers on the same subpath, and also limit the number of modifiers of each type just like in f1? To avoid sequences like ...<X>llmlIilImmimiimmimifmfl<fifmmimilimmmiimiliffmfliIfififlliflimfliffififmiffmfliflifmIlimimiflimfiffmllliflmimifllifliliflifmIlimimiflimfiffmllliflmimifllfmIlimimiflimfiffmllliflmimiflliflimimmiflimfliffmiflifmfiffllIlififliffififmiffmfliflifIliflimimflimflfflimimifllfflifllfflimlififfiiffifIr<r<...
    1919// TODO add support for properties of (any class of) neurons - not just sigmoid/force/inertia (':' syntax) for N
    2020// TODO add mapping genotype character ranges for neural [connections]
     21// TODO change the default branching plane (to match f1) so they do not grow perfectly vertical (cheating vertpos) so easily? (so they require Rr or other modifiers)
    2122
    2223
     
    122123        // convert geno to a tree, then try to validate
    123124        f4_Node root;
    124         if (f4_processRecur(geno, 0, &root) || root.childCount() != 1) return GENOPER_OK; // cannot repair
     125        int _ = 0;
     126        if (f4_processRecur(geno, _, &root) || root.childCount() != 1) return GENOPER_OK; // cannot repair
    125127
    126128        const int VALIDATE_TRIALS = 20;
     
    137139{
    138140        f4_Node root;
    139         int res = f4_processRecur(geno, 0, &root);
     141        int _ = 0;
     142        int res = f4_processRecur(geno, _, &root);
    140143        if (res) return res;  // errorpos, >0
    141144        if (root.childCount() != 1) return 1; //earlier: GENOPER_OPFAIL
     
    549552{
    550553        f4_Node *root = new f4_Node;
    551         if (f4_processRecur(g, 0, root) || root->childCount() != 1)
     554        int _ = 0;
     555        if (f4_processRecur(g, _, root) || root->childCount() != 1)
    552556        {
    553557                delete root;
     
    673677
    674678        // convert genotype strings into tree structures
    675         if (f4_processRecur(g1, 0, &root1) || (root1.childCount() != 1)) return GENOPER_OPFAIL;
    676         if (f4_processRecur(g2, 0, &root2) || (root2.childCount() != 1)) return GENOPER_OPFAIL;
     679        int _1 = 0, _2 = 0;
     680        if (f4_processRecur(g1, _1, &root1) || (root1.childCount() != 1)) return GENOPER_OPFAIL;
     681        if (f4_processRecur(g2, _2, &root2) || (root2.childCount() != 1)) return GENOPER_OPFAIL;
    677682
    678683        // decide amounts of crossover, 0.1-0.9
Note: See TracChangeset for help on using the changeset viewer.