Changeset 1234 for cpp


Ignore:
Timestamp:
05/04/23 01:45:37 (20 months ago)
Author:
Maciej Komosinski
Message:

Simplify sequences of modifier genes, cancelling out antagonistic ones and limiting the number of identical genes

Location:
cpp/frams/genetics/f4
Files:
4 edited

Legend:

Unmodified
Added
Removed
  • cpp/frams/genetics/f4/f4_general.cpp

    r1232 r1234  
    77
    88#include "f4_general.h"
    9 #include "../genooperators.h" //for GENOPER_ constants
     9#include "../genooperators.h" // for GENOPER_ constants
    1010#include <common/nonstd_stl.h>
    1111#include <common/log.h>
     
    306306                                break;
    307307                        }
    308                         case 'r':  case 'R':
     308                        case 'r':
     309                        case 'R':
    309310                        {
    310311                                // error: if neuron
     
    340341                                // error: if neuron
    341342                                if (type == CELL_NEURON) //some neurons have the same single-letter names as modifiers (for example G,S,D), but they are supposed to have is_neuroclass==true so they should indeed not be handled here
    342                                 {//however, what we see here is actually modifiers such as IdqEbWL (so not valid neuroclasses) that occurred within an already differentiated cell type==CELL_NEURON.
     343                                {//however, what we see here is actually modifiers such as IdqEbWL (so not valid neuroclasses) that occurred within an already differentiated cell of type==CELL_NEURON.
    343344                                        //printf("Handled as a modifier, but type==CELL_NEURON: '%c'\n", name);
    344345                                        // fix: delete it
     
    691692int f4_Cells::simulate()
    692693{
    693         constexpr bool print_debugging = false; //print the state of cells during development
     694        const bool PRINT_CELLS_DEVELOPMENT = false; //print the state of cells
    694695        errorcode = GENOPER_OK;
    695696
    696697        for (int i = 0; i < cell_count; i++)  C[i]->active = true;
    697698
    698         if (print_debugging) f4_Node::print_tree(C[0]->genot, 0);
    699         if (print_debugging) print_cells("Initialization");
     699        if (PRINT_CELLS_DEVELOPMENT) f4_Node::print_tree(C[0]->genot, 0);
     700        if (PRINT_CELLS_DEVELOPMENT) print_cells("Initialization");
    700701
    701702        // execute oneStep() in a cycle
    702         while (oneStep()) if (print_debugging) print_cells("Development step");
    703         if (print_debugging) print_cells("After last development step");
     703        while (oneStep()) if (PRINT_CELLS_DEVELOPMENT) print_cells("Development step");
     704        if (PRINT_CELLS_DEVELOPMENT) print_cells("After last development step");
    704705
    705706#ifdef EXTRA_STEP_CELL_DEVELOPMENT
    706707        if (errorcode == GENOPER_OK)
    707708        {
    708                 oneStep(); if (print_debugging) print_cells("After extra step"); //for these "halted" (yielding) cells (they have active==false) that wait for other cells to develop. Without this step, the last, recently halted one(s) may miss the "retrying" step if all active==true cells became active==false in the last step.
     709                oneStep(); if (PRINT_CELLS_DEVELOPMENT) print_cells("After extra step"); //for these "halted" (yielding) cells (they have active==false) that wait for other cells to develop. Without this step, the last, recently halted one(s) may miss the "retrying" step if all active==true cells became active==false in the last step.
    709710        }
    710711#endif
     
    745746        //DB( printf("Cell simulation done, %d cells. \n", nc); )
    746747
    747         if (print_debugging) print_cells("Final");
     748        if (PRINT_CELLS_DEVELOPMENT) print_cells("Final");
    748749
    749750        return errorcode;
     
    10581059{
    10591060        for (int i = 0; i < indent; i++) printf(" ");
    1060         printf("%s (%d)", root->name.c_str(), root->count());
     1061        printf("%s%s%s (%d)", root->neuclass != NULL ? "N:" : "", root->name.c_str(), root->name == "#" ? std::to_string(root->reps).c_str() : "", root->count() - 1);
    10611062        if (root->name == "[")
    10621063                printf("     from=%-3d  weight=%g", root->conn_from, root->conn_weight);
     
    12531254}
    12541255
    1255 // scan genotype string and build tree
     1256// scan genotype string and build a tree
    12561257// return >1 for error (errorpos)
    1257 int f4_processRecur(const char* genot, int &pos_inout, f4_Node *parent)
    1258 {
     1258int f4_processRecur(const char* genot, const int genot_len, int &pos_inout, f4_Node *parent)
     1259{
     1260        static const char *all_modifiers_no_comma = F14_MODIFIERS; //I did experiments with added comma (see all_modifiers_for_simplify below) which had the advantage of commas not breaking sequences of modifiers, thus longer sequences of modifiers (including commas) could be simplified and genetic bloat was further reduced. But since we impose a limit on the number of modifier chars in GenoOperators::simplifiedModifiers(), it would also influence commas (e.g. no more than 8 commas per sequence), so in order to leave commas entirely unlimited let's exclude them from simplification. Note that currently 'X' or any other non-F14_MODIFIERS char also separates the sequence to be simplified, so if we wanted a really intensive simplification, it should occur during development, when we know precisely which genes influence each f4_Cell.
     1261        //const char *Geno_f4::all_modifiers_for_simplify = F14_MODIFIERS ",\1"; //'\1' added to keep the number of chars even, avoid exceptions in logic and save the simple rule that the sequence is made of pairs (gene,contradictory gene), where a comma has no contradictory gene and \1 is unlikely to occur in the f4 genotype (and not allowed), so no risk it will cancel out a comma during simplification.
     1262
     1263
    12591264        f4_Node *par = parent;
    12601265
    1261         if (pos_inout >= (int)strlen(genot))
    1262                 return (int)strlen(genot) + 1;
    1263 
    1264         while (pos_inout < (int)strlen(genot))
    1265         {
    1266 //#define PRINT_PARSING_LOCATION
    1267 #ifdef PRINT_PARSING_LOCATION
    1268                 printf("%s\n", genot);
    1269                 for (int i = 0; i < pos_inout; i++) printf(" ");
    1270                 printf("^\n");
    1271 #endif
     1266        if (pos_inout >= genot_len)
     1267                return genot_len + 1;
     1268
     1269        while (pos_inout < genot_len)
     1270        {
     1271                const bool PRINT_PARSING_LOCATION = false;
     1272                if (PRINT_PARSING_LOCATION)
     1273                {
     1274                        printf("%s\n", genot);
     1275                        for (int i = 0; i < pos_inout; i++) printf(" ");
     1276                        printf("^\n");
     1277                }
    12721278                switch (genot[pos_inout])
    12731279                {
     
    12771283                        par = node;
    12781284                        pos_inout++; //move after '<'
    1279                         int res = f4_processRecur(genot, pos_inout, par);
     1285                        int res = f4_processRecur(genot, genot_len, pos_inout, par);
    12801286                        if (res) return res;
    1281                         if (pos_inout < (int)strlen(genot))
    1282                         {
    1283                                 res = f4_processRecur(genot, pos_inout, par);
     1287                        if (pos_inout < genot_len)
     1288                        {
     1289                                res = f4_processRecur(genot, genot_len, pos_inout, par);
    12841290                                if (res) return res;
    12851291                        }
     
    12871293                        {
    12881294                                //MacKo 2023-04, more strict behavior: instead of silent repair (no visible effect to the user, genotype stays invalid but is interpreted and reported as valid), we now point out where the error is. For example <X> or <X><X or <X><N:N>
    1289                                 return (int)strlen(genot) + 1;
     1295                                return genot_len + 1;
    12901296                                //old silent repair:
    1291                                 //node = new f4_Node(">", par, int(strlen(genot)) - 1);
     1297                                //node = new f4_Node(">", par, genot_len - 1);
    12921298                        }
    12931299                        return 0;  // OK
     
    13101316                        // skip number
    13111317                        pos_inout += end - (genot + pos_inout);
    1312                         int res = f4_processRecur(genot, pos_inout, node);
     1318                        int res = f4_processRecur(genot, genot_len, pos_inout, node);
    13131319                        if (res) return res;
    1314                         if (pos_inout < (int)strlen(genot))
    1315                         {
    1316                                 res = f4_processRecur(genot, pos_inout, node);
     1320                        if (pos_inout < genot_len)
     1321                        {
     1322                                res = f4_processRecur(genot, genot_len, pos_inout, node);
    13171323                                if (res) return res;
    13181324                        }
    13191325                        else // ran out
    13201326                        {
    1321                                 return (int)strlen(genot) + 1; //MacKo 2023-04: report an error, better to be more strict instead of a silent repair (genotype stays invalid but is interpreted and reported as valid) with non-obvious consequences?
     1327                                return genot_len + 1; //MacKo 2023-04: report an error, better to be more strict instead of a silent repair (genotype stays invalid but is interpreted and reported as valid) with non-obvious consequences?
    13221328                                //earlier approach - silently treating this problem (we don't ever see where the error is because it gets corrected in some way here, while parsing the genotype, and error location in the genotype is never reported):
    1323                                 //node = new f4_Node(">", par, int(strlen(genot)) - 1); // check if needed and if this is really the best repair operation; seemed to happen too many times in succession for some genotypes even though they were only a result of f4 operators, not manually created... and the operators should not generate invalid genotypes, right? Or maybe crossover does? Seems like too many #N's for closing >'s; removing #N or adding > helped. Operators somehow don't do it properly sometimes? But F4_ADD_REP adds '>'... (TODO)
     1329                                //node = new f4_Node(">", par, genot_len - 1); // check if needed and if this is really the best repair operation; seemed to happen too many times in succession for some genotypes even though they were only a result of f4 operators, not manually created... and the operators should not generate invalid genotypes, right? Or maybe crossover does? Seems like too many #N's for closing >'s; removing #N or adding > helped. Operators somehow don't do it properly sometimes? But F4_ADD_REP adds '>'... (TODO)
    13241330                        }
    13251331                        return 0;  // OK
     
    13911397                        break;
    13921398                }
    1393                 default: // 'X' and ',' and all modifiers and also invalid symbols - add a node, for invalid symbols build will give the error or repair
     1399                default: // 'X' and ',' and all modifiers and also invalid symbols - add a node. For symbols that are not valid in f4, the cell development process will give the error or repair
    13941400                {
    13951401                        //printf("any regular character '%c'\n", genot[pos_inout]);
    1396                         //TODO here: read a continuous sequence of modifiers, sort and optimize ("collapse") it like in f1, then add to tree
     1402#define F4_SIMPLIFY_MODIFIERS //avoid long sequences like ...<X>llmlIilImmimiimmimifmfl<fifmmimilimmmiimiliffmfliIfififlliflimfliffififmiffmflllfflimlififfiiffifIr<r<... - another option, instead of simplifying while parsing here, would be mutations: when they add/modify/remove a modifier node, they could "clean" the tree by removing nodes when they encounter contradictory modifiers on the same subpath, and also limit the number of modifiers just as GenoOperators::simplifiedModifiers() does.
     1403#ifdef F4_SIMPLIFY_MODIFIERS
     1404                        char *ptr = (char*)(genot + pos_inout);
     1405
     1406#ifdef __BORLANDC__ // "[bcc32c Error] cannot compile this non-trivial TLS destruction yet" (C++B 10.4u2)
     1407                        static
     1408#else
     1409                        thread_local
     1410#endif
     1411                                vector<int> modifs_counts(strlen(all_modifiers_no_comma)); ///<an array with a known constant size storing counters of each modifier symbol from all_modifiers_no_comma, created once to avoid reallocation every time when modifier genes are simplified during parsing. Initialization of required size; it will never be resized.
     1412                        std::fill(modifs_counts.begin(), modifs_counts.end(), 0); //zeroing only needed if we encountered a char from all_modifiers_no_comma and enter the 'while' loop below
     1413
     1414                        while (char *m = GenoOperators::strchrn0(all_modifiers_no_comma, *ptr)) //only processes a section of chars known in all_modifiers_no_comma, other characters will exit the loop
     1415                        {
     1416                                modifs_counts[m - all_modifiers_no_comma]++;
     1417                                GenoOperators::skipWS(++ptr); //advance and ignore whitespace
     1418                        }
     1419                        int advanced = ptr - (genot + pos_inout);
     1420                        if (advanced > 0) //found modifiers
     1421                        {
     1422                                string simplified = GenoOperators::simplifiedModifiers(all_modifiers_no_comma, modifs_counts);
     1423                                // add a node for each char in "simplified"
     1424                                for (size_t i = 0; i < simplified.length(); i++)
     1425                                {
     1426                                        int pos = GenoOperators::strchrn0(genot + pos_inout, simplified[i]) - genot; //unnecessarily finding the same char, if it occurs multiple times in simplified
     1427                                        f4_Node *node = new f4_Node(simplified[i], par, pos); //location is approximate. In the simplification process we don't trace where the origin(s) of the simplified[i] gene were. We provide 'pos' as the first occurrence of simplified[i] (for example, all 'L' will have the same location assigned, but at least this is where 'L' occurred in the genotype, so in case of any modification of a node (repair, removal, whatever... even mapping of genes) the indicated gene will be one of the responsible ones)
     1428                                        par = node;
     1429                                }
     1430                                pos_inout += advanced;
     1431                        }
     1432                        else // genot[pos_inout] is a character not present in all_modifiers_no_comma, so treat it as a regular individual char just as it would be without simplification
     1433                        {
     1434                                f4_Node *node = new f4_Node(genot[pos_inout], par, pos_inout);
     1435                                par = node;
     1436                                pos_inout++;
     1437                        }
     1438#else
    13971439                        f4_Node *node = new f4_Node(genot[pos_inout], par, pos_inout);
    13981440                        par = node;
    13991441                        pos_inout++;
     1442#endif // F4_SIMPLIFY_MODIFIERS
    14001443                        break;
    14011444                }
     
    14061449        if (par && par->name != ">")
    14071450        {
    1408                 //happens when pos_inout == strlen(genot)
     1451                //happens when pos_inout == genot_len
    14091452                //return pos_inout; //MacKo 2023-04: could report an error instead of silent repair, but repair operators only work in Cells (i.e., after the f4_Node tree has been parsed without errors and Cells can start developing) so we don't want to make a fatal error because of missing '>' here. Also after conversions from Cells to text, trailing '>' is deliberately removed... and also the simplest genotype is officially X, not X>.
    1410                 new f4_Node('>', par, int(strlen(genot)) - 1);
     1453                new f4_Node('>', par, genot_len - 1);
    14111454        }
    14121455
     
    14171460{
    14181461        int pos = 0;
    1419         int res = f4_processRecur(genot, pos, root);
     1462        int res = f4_processRecur(genot, (int)strlen(genot), pos, root);
    14201463        if (res > 0)
    14211464                return res; //parsing error
  • cpp/frams/genetics/f4/f4_general.h

    r1232 r1234  
    471471 * This is an internal function; for regular cases, use f4_process().
    472472 * @param genot the string with the entire genotype
     473 * @param genot_len length of genot (precomputed for efficiency)
    473474 * @param pos_inout the current position of processing in string (advanced by the function)
    474475 * @param parent current parent of the analysed branch of the genotype
    475476 * @return 0 if processing was successful, otherwise returns the position of an error in the genotype
    476477 */
    477 int f4_processRecur(const char *genot, int &pos_inout, f4_Node *parent);
     478int f4_processRecur(const char *genot, const int genot_len, int &pos_inout, f4_Node *parent);
    478479
    479480/**
  • cpp/frams/genetics/f4/f4_oper.cpp

    r1232 r1234  
    1111// may help, but it would be better to improve the source code to make genetic operators neutral in terms of genotype length. Adding such a penalty
    1212// removes "work in progress" changes in genotypes thus promoting immediate, straightforward improvements while hindering slower, multifaceted progress.
    13 // TODO getting rid of redundancy (valid genotypes with a lot of "junk code") in this representation looks like a good idea; many improvements to this end have already been done in April & May 2023.
     13// TODO getting rid of redundancy (valid genotypes with a lot of "junk code") in this representation looks like a good idea; many improvements to this end have already been done in April & May 2023, so maybe it is not a big problem now?
    1414//
    1515//
    1616// TODO the behavior of neuron input indexes during mutation seems badly implemented (see also TREAT_BAD_CONNECTIONS_AS_INVALID_GENO). Are they kept properly maintained when nodes are added and removed? This could be done well because during mutation we operate on the tree structure with cross-references between nodes (so they should not be affected by local changes in the tree), and then convert the tree back to string. Yet, the f4_Node.conn_from is an integer and these fields in nodes do not seem to be maintained on tree node adding/removal... change these integer offsets to references to node objects? But actually, do the offsets that constitute relative connection references concern the f4_Node tree structure (and all these sophisticated calculations of offsets during mutation are useful) or rather they concern the f4_Cells development? verify all situations in f4_Cell::oneStep(), case '['.
    17 // TODO add simplifying sequences of modifiers (so capital and small letter cancel out, like in f1) - but seems like each single modifier is a separate f4_Node? and perhaps we don't want to use the repair mechanism for this... maybe mutations, when they add/modify/remove a modifier node, should be "cleaning" the tree by removing nodes when they encounter contradictory modifiers on the same subpath, and also limit the number of modifiers of each type just like in f1? To avoid sequences like ...<X>llmlIilImmimiimmimifmfl<fifmmimilimmmiimiliffmfliIfififlliflimfliffififmiffmfliflifmIlimimiflimfiffmllliflmimifllifliliflifmIlimimiflimfiffmllliflmimifllfmIlimimiflimfiffmllliflmimiflliflimimmiflimfliffmiflifmfiffllIlififliffififmiffmfliflifIliflimimflimflfflimimifllfflifllfflimlififfiiffifIr<r<...
    1817// TODO in mutation, adding the '#' gene does not seem to be effective. The gene is added and genotypes are valid, but hardly ever #n is effective, i.e., it hardly ever multiplies body or brain parts... investigate!
    1918// TODO add support for properties of (any class of) neurons - not just sigmoid/force/inertia (':' syntax) for N
    2019// TODO add mapping genotype character ranges for neural [connections]
    2120// TODO change the default branching plane (to match f1) so they do not grow perfectly vertical (cheating vertpos) so easily? (so they require Rr or other modifiers)
     21// TODO for some genotypes, #defining/undefining F4_SIMPLIFY_MODIFIERS produces significantly different phenotypes (e.g. length of some Joint changes from 1.25 to 1.499, coordinates of Parts change, friction of some part changes from 1.28 to 0.32). Comparing f4_Node trees, the simplification works as intended, there are no huge changes apart from removing contradicting modifiers like 'R' and 'r' or 'L' and 'l', and dispersing the modifiers (changed order). There is no reason for such a significant influence of this. A hypothesis is that something may be wrong with calculating the influence of individual modifiers, e.g. some strong nonlinearity is introduced where it should not be, or some compensation between modifiers that should not influence each other (like L and R), or some modifier f4_Nodes are skipped/ignored when applying? Investigate. Example genotype: /*4*/,i<qlM,C<X>N:*#1>>,r<MRF<Xcm>N:Gpart>#5#1#2MLL#1>#1>>>>#5ML#2L#1>>>Lf,r<#1>rM<CqmLlCfqiFLqXFfl><F,<<XI>iN:|[-1:4.346]><XF><<XrRQ>N:G#3>>QiXFMR>fXM#2MfcR>R#3>>X
     22
    2223
    2324
     
    3233
    3334
    34 const char *Geno_f4::all_modifiers = F14_MODIFIERS ","; //comma in f4 is handled the same way (simple node, F4_ADD_SIMP) as modifiers
     35const char *Geno_f4::all_modifiers = F14_MODIFIERS ","; //comma in f4 is handled the same way (simple node, F4_ADD_SIMP) as modifiers. See also all_modifiers_no_comma in f4_general.cpp.
    3536
    3637// codes that can be changed (apart from being added/deleted)
     
    294295                {
    295296                        // add simple node
    296                         // choose a simple node from ADD_SIMPLE_CODES
     297                        int modifier_index = GenoOperators::getRandomChar(all_modifiers, excluded_modifiers.c_str());
     298                        if (modifier_index < 0)
     299                                return GENOPER_OPFAIL;
    297300                        node_mutated->parent->removeChild(node_mutated);
    298                         //f4_Node *n2 = new f4_Node(ADD_SIMPLE_CODES[rndUint(strlen(ADD_SIMPLE_CODES))], n1->parent, n1->parent->pos);
    299                         int modifierid = GenoOperators::getRandomChar(all_modifiers, excluded_modifiers.c_str());
    300                         f4_Node *n2 = new f4_Node(all_modifiers[modifierid], node_mutated->parent, node_mutated->parent->pos);
     301                        // old source: choose a simple node from ADD_SIMPLE_CODES
     302                        //f4_Node *n2 = new f4_Node(ADD_SIMPLE_CODES[rndUint(strlen(ADD_SIMPLE_CODES))], node_mutated->parent, node_mutated->parent->pos);
     303                        f4_Node *n2 = new f4_Node(all_modifiers[modifier_index], node_mutated->parent, node_mutated->parent->pos);
    301304                        n2->addChild(node_mutated);
    302305                        node_mutated->parent = n2;
  • cpp/frams/genetics/f4/f4_oper.h

    r1231 r1234  
    6262        static const char *all_modifiers;
    6363
    64 protected:
     64private:
    6565
    6666        /**
Note: See TracChangeset for help on using the changeset viewer.