[1208] | 1 | import numpy as np |
---|
| 2 | from pyemd import emd |
---|
| 3 | from ctypes import cdll |
---|
| 4 | from ctypes.util import find_library |
---|
[1295] | 5 | from .alignmodel import align |
---|
[1208] | 6 | |
---|
| 7 | class DensityDistribution: |
---|
[1224] | 8 | """Two dissimilarity measures based on the spatial distribution of two Models. The Model bounding box is divided into a grid of equally-sized cuboids, the number of which is the 'resolution' parameter cubed. Then the Model surface is covered with points; the density of the surface sampling is determined by the 'density' parameter. There are two versions of the measure. In the default version ('frequency'=False), a signature of each cuboid is the centroid and the number of samples. In the 'frequency'=True version, FFT is computed from the vector containing the number of samples in each cuboid. The final result of the dissimilarity measure is the distance between the signatures and it can be computed using EMD, L1, or L2 norms (the 'metric' parameter). |
---|
[1223] | 9 | """ |
---|
[1295] | 10 | |
---|
[1322] | 11 | libm = find_library('m') # for disabling/enabling floating point exceptions (division by zero occurs in the pyemd library) |
---|
| 12 | if libm is not None: # libm.so (the mathematical library) is a part of Linux ecosystem - always present |
---|
| 13 | libm = cdll.LoadLibrary(libm) |
---|
| 14 | else: |
---|
| 15 | print('\nWarning: The "m" library not found - floating point exceptions in pyemd may occur...\n') # but on Windows, pyemd does not seem to cause floating point exceptions |
---|
[1208] | 16 | EPSILON = 0.0001 |
---|
[1224] | 17 | |
---|
[1225] | 18 | def __init__(self, frams_module=None, density = 10, resolution = 8, reduce_empty=True, frequency=False, metric = 'emd', fixedZaxis=False, verbose=False): |
---|
[1208] | 19 | """ __init__ |
---|
| 20 | Args: |
---|
[1224] | 21 | density (int, optional): density of samplings for frams.ModelGeometry. Defaults to 10. |
---|
| 22 | resolution (int, optional): How many intervals are used in each dimension to partition surface samples of Models in the 3D space. |
---|
| 23 | The higher the value, the more detailed the comparison and the longer the calculations. Defaults to 3. |
---|
| 24 | reduce_empty (bool, optional): If we should use reduction to remove blank samples. Defaults to True. |
---|
[1208] | 25 | frequency (bool, optional): If we should use frequency distribution. Defaults to False. |
---|
| 26 | metric (string, optional): The distance metric that should be used ('emd', 'l1', or 'l2'). Defaults to 'emd'. |
---|
| 27 | fixedZaxis (bool, optional): If the z axis should be fixed during alignment. Defaults to False. |
---|
| 28 | verbose (bool, optional): Turning on logging, works only for calculateEMDforGeno. Defaults to False. |
---|
| 29 | """ |
---|
[1219] | 30 | if frams_module is None: |
---|
| 31 | raise ValueError('Framsticks module not provided!') |
---|
| 32 | self.frams = frams_module |
---|
[1208] | 33 | |
---|
| 34 | self.density = density |
---|
[1224] | 35 | self.resolution = resolution |
---|
[1208] | 36 | self.verbose = verbose |
---|
[1224] | 37 | self.reduce_empty = reduce_empty |
---|
[1208] | 38 | self.frequency = frequency |
---|
| 39 | self.metric = metric |
---|
| 40 | self.fixedZaxis = fixedZaxis |
---|
| 41 | |
---|
| 42 | |
---|
[1325] | 43 | def calculateNeighborhood(self,array,mean_coords): |
---|
[1208] | 44 | """ Calculates number of elements for given sample and set ups the center of this sample |
---|
| 45 | to the center of mass (calculated by mean of every coordinate) |
---|
| 46 | Args: |
---|
| 47 | array ([[float,float,float],...,[float,float,float]]): array of voxels that belong to given sample. |
---|
| 48 | mean_coords ([float,float,float]): default coordinates that are the |
---|
| 49 | middle of the sample (used when number of voxels in sample is equal to 0) |
---|
| 50 | |
---|
| 51 | Returns: |
---|
| 52 | weight [int]: number of voxels in a sample |
---|
| 53 | coordinates [float,float,float]: center of mass for a sample |
---|
| 54 | """ |
---|
| 55 | weight = len(array) |
---|
| 56 | if weight > 0: |
---|
[1325] | 57 | point = np.mean(array, axis=0) # equivalent to [np.mean(array[:,0]),np.mean(array[:,1]),np.mean(array[:,2])] |
---|
[1208] | 58 | return weight, point |
---|
| 59 | else: |
---|
| 60 | return 0, mean_coords |
---|
| 61 | |
---|
| 62 | |
---|
| 63 | def calculateDistPoints(self,point1, point2): |
---|
[1325] | 64 | """ Returns Euclidean distance between two points |
---|
[1208] | 65 | Args (distribution): |
---|
| 66 | point1 ([float,float,float]) - coordinates of first point |
---|
| 67 | point2 ([float,float,float]) - coordinates of second point |
---|
| 68 | Args (frequency): |
---|
| 69 | point1 (float) - value of the first sample |
---|
| 70 | point2 (float) - value of the second sample |
---|
| 71 | |
---|
| 72 | Returns: |
---|
[1325] | 73 | [float]: Euclidean distance |
---|
[1208] | 74 | """ |
---|
| 75 | if self.frequency: |
---|
[1325] | 76 | return abs(point1-point2) # TODO vector instead of scalar returned? |
---|
[1208] | 77 | else: |
---|
[1325] | 78 | return np.linalg.norm(point1-point2, ord=2) |
---|
[1208] | 79 | |
---|
| 80 | |
---|
| 81 | def calculateDistanceMatrix(self,array1, array2): |
---|
| 82 | """ |
---|
| 83 | Args: |
---|
[1224] | 84 | array1 ([type]): array of size n with points representing the first Model |
---|
| 85 | array2 ([type]): array of size n with points representing the second Model |
---|
[1208] | 86 | |
---|
| 87 | Returns: |
---|
[1224] | 88 | np.array(np.array(,dtype=float)): distance matrix n*n |
---|
[1208] | 89 | """ |
---|
| 90 | n = len(array1) |
---|
| 91 | distMatrix = np.zeros((n,n)) |
---|
| 92 | for i in range(n): |
---|
| 93 | for j in range(n): |
---|
| 94 | distMatrix[i][j] = self.calculateDistPoints(array1[i], array2[j]) |
---|
| 95 | return np.array(distMatrix) |
---|
| 96 | |
---|
| 97 | |
---|
[1224] | 98 | def reduceEmptySignatures_Frequency(self,s1,s2): |
---|
[1208] | 99 | """Removes samples from signatures if corresponding samples for both models have weight 0. |
---|
| 100 | Args: |
---|
| 101 | s1 (np.array(,dtype=np.float64)): values of samples |
---|
| 102 | s2 (np.array(,dtype=np.float64)): values of samples |
---|
| 103 | |
---|
| 104 | Returns: |
---|
| 105 | s1new (np.array(,dtype=np.float64)): coordinates of samples after reduction |
---|
| 106 | s2new (np.array(,dtype=np.float64)): coordinates of samples after reduction |
---|
| 107 | """ |
---|
| 108 | lens = len(s1) |
---|
[1325] | 109 | indices = [i for i in range(lens) if s1[i]==0 and s2[i]==0] |
---|
[1208] | 110 | |
---|
| 111 | return np.delete(s1, indices), np.delete(s2, indices) |
---|
| 112 | |
---|
| 113 | |
---|
[1224] | 114 | def reduceEmptySignatures_Density(self,s1,s2): |
---|
[1208] | 115 | """Removes samples from signatures if corresponding samples for both models have weight 0. |
---|
| 116 | Args: |
---|
| 117 | s1 ([np.array(,dtype=np.float64),np.array(,dtype=np.float64)]): [coordinates of samples, weights] |
---|
| 118 | s2 ([np.array(,dtype=np.float64),np.array(,dtype=np.float64)]): [coordinates of samples, weights] |
---|
| 119 | |
---|
| 120 | Returns: |
---|
| 121 | s1new ([np.array(,dtype=np.float64),np.array(,dtype=np.float64)]): [coordinates of samples, weights] after reduction |
---|
| 122 | s2new ([np.array(,dtype=np.float64),np.array(,dtype=np.float64)]): [coordinates of samples, weights] after reduction |
---|
| 123 | """ |
---|
| 124 | lens = len(s1[0]) |
---|
[1325] | 125 | indices = [i for i in range(lens) if s1[1][i]==0 and s2[1][i]==0] |
---|
[1208] | 126 | |
---|
| 127 | s1 = [np.delete(s1[0], indices, axis=0), np.delete(s1[1], indices, axis=0)] |
---|
| 128 | s2 = [np.delete(s2[0], indices, axis=0), np.delete(s2[1], indices, axis=0)] |
---|
| 129 | return s1, s2 |
---|
| 130 | |
---|
| 131 | |
---|
[1224] | 132 | def getSignatures(self,array,edges3,steps3): |
---|
| 133 | """Generates signature for array representing the Model. Signature is composed of list of points [x,y,z] (float) and list of weights (int). |
---|
[1208] | 134 | |
---|
| 135 | Args: |
---|
[1224] | 136 | array (np.array(np.array(,dtype=float))): array with voxels representing the Model |
---|
| 137 | edges3 ([np.array(,dtype=float),np.array(,dtype=float),np.array(,dtype=float)]): lists with edges for each step for each axis in order x,y,z |
---|
| 138 | steps3 ([float,float,float]): [size of interval for x axis, size of interval for y axis, size of interval for y axis] |
---|
[1208] | 139 | |
---|
| 140 | Returns (distribution): |
---|
| 141 | signature [np.array(,dtype=np.float64),np.array(,dtype=np.float64)]: returns signatuere [np.array of points, np.array of weights] |
---|
| 142 | Returns (frequency): |
---|
| 143 | signature np.array(,dtype=np.float64): returns signatuere np.array of coefficients |
---|
| 144 | """ |
---|
[1224] | 145 | edges_x,edges_y,edges_z = edges3 |
---|
| 146 | step_x,step_y,step_z=steps3 |
---|
[1208] | 147 | feature_array = [] |
---|
| 148 | weight_array = [] |
---|
[1224] | 149 | step_x_half = step_x/2 |
---|
| 150 | step_y_half = step_y/2 |
---|
| 151 | step_z_half = step_z/2 |
---|
| 152 | for x in range(len(edges_x[:-1])): |
---|
| 153 | for y in range(len(edges_y[:-1])) : |
---|
| 154 | for z in range(len(edges_z[:-1])): |
---|
| 155 | rows=np.where((array[:,0]> edges_x[x]) & |
---|
| 156 | (array[:,0]<= edges_x[x+1]) & |
---|
| 157 | (array[:,1]> edges_y[y]) & |
---|
| 158 | (array[:,1]<= edges_y[y+1]) & |
---|
| 159 | (array[:,2]> edges_z[z]) & |
---|
| 160 | (array[:,2]<= edges_z[z+1])) |
---|
[1208] | 161 | if self.frequency: |
---|
| 162 | feature_array.append(len(array[rows])) |
---|
| 163 | else: |
---|
[1325] | 164 | weight, point = self.calculateNeighborhood(array[rows],[edges_x[x]+step_x_half,edges_y[y]+step_y_half,edges_z[z]+step_z_half]) |
---|
[1208] | 165 | feature_array.append(point) |
---|
| 166 | weight_array.append(weight) |
---|
| 167 | |
---|
| 168 | if self.frequency: |
---|
| 169 | samples = np.array(feature_array,dtype=np.float64) |
---|
[1269] | 170 | return samples |
---|
[1208] | 171 | else: |
---|
| 172 | return [np.array(feature_array,dtype=np.float64), np.array(weight_array,dtype=np.float64)] |
---|
| 173 | |
---|
| 174 | |
---|
| 175 | def getSignaturesForPair(self,array1,array2): |
---|
[1325] | 176 | """Generates signatures for a given pair of models represented by array of voxels. |
---|
[1224] | 177 | We calculate space for given models by taking the extremas for each axis and dividing the space by the resolution. |
---|
| 178 | This divided space generate us samples which contains points. Each sample will have new coordinates which are mean of all points from it and weight which equals to the number of points. |
---|
[1208] | 179 | |
---|
| 180 | Args: |
---|
| 181 | array1 (np.array(np.array(,dtype=float))): array with voxels representing model1 |
---|
| 182 | array2 (np.array(np.array(,dtype=float))): array with voxels representing model2 |
---|
[1224] | 183 | |
---|
[1208] | 184 | Returns: |
---|
| 185 | s1 ([np.array(,dtype=np.float64),np.array(,dtype=np.float64)]): [coordinates of samples, weights] |
---|
| 186 | s2 ([np.array(,dtype=np.float64),np.array(,dtype=np.float64)]): [coordinates of samples, weights] |
---|
| 187 | """ |
---|
| 188 | |
---|
[1221] | 189 | min_x = np.min([np.min(array1[:,0]),np.min(array2[:,0])]) |
---|
[1210] | 190 | max_x = np.max([np.max(array1[:,0]),np.max(array2[:,0])]) |
---|
| 191 | min_y = np.min([np.min(array1[:,1]),np.min(array2[:,1])]) |
---|
| 192 | max_y = np.max([np.max(array1[:,1]),np.max(array2[:,1])]) |
---|
| 193 | min_z = np.min([np.min(array1[:,2]),np.min(array2[:,2])]) |
---|
| 194 | max_z = np.max([np.max(array1[:,2]),np.max(array2[:,2])]) |
---|
[1208] | 195 | |
---|
[1224] | 196 | # We request self.resolution+1 samples since we need self.resolution intervals |
---|
| 197 | edges_x,step_x = np.linspace(min_x,max_x,self.resolution+1,retstep=True) |
---|
| 198 | edges_y,step_y = np.linspace(min_y,max_y,self.resolution+1,retstep=True) |
---|
| 199 | edges_z,step_z = np.linspace(min_z,max_z,self.resolution+1,retstep=True) |
---|
[1208] | 200 | |
---|
[1224] | 201 | for edges in (edges_x, edges_y, edges_z): # EPSILON subtracted to deal with boundary voxels (one-sided open intervals and comparisons in loops in function getSignatures()) |
---|
| 202 | edges[0] -= self.EPSILON |
---|
[1208] | 203 | |
---|
[1224] | 204 | edges3 = (edges_x,edges_y,edges_z) |
---|
| 205 | steps3 = (step_x,step_y,step_z) |
---|
[1208] | 206 | |
---|
[1224] | 207 | s1 = self.getSignatures(array1,edges3,steps3) |
---|
| 208 | s2 = self.getSignatures(array2,edges3,steps3) |
---|
[1208] | 209 | |
---|
| 210 | return s1,s2 |
---|
| 211 | |
---|
| 212 | |
---|
| 213 | def getVoxels(self,geno): |
---|
[1214] | 214 | """Generates voxels for genotype using frams.ModelGeometry |
---|
[1208] | 215 | |
---|
| 216 | Args: |
---|
[1224] | 217 | geno (string): representation of Model in one of the formats supported by Framsticks, http://www.framsticks.com/a/al_genotype.html |
---|
[1208] | 218 | |
---|
| 219 | Returns: |
---|
[1224] | 220 | np.array([np.array(,dtype=float)]: list of voxels representing the Model. |
---|
[1208] | 221 | """ |
---|
[1219] | 222 | model = self.frams.Model.newFromString(geno) |
---|
[1208] | 223 | align(model, self.fixedZaxis) |
---|
[1219] | 224 | model_geometry = self.frams.ModelGeometry.forModel(model) |
---|
[1208] | 225 | |
---|
| 226 | model_geometry.geom_density = self.density |
---|
| 227 | voxels = np.array([np.array([p.x._value(),p.y._value(),p.z._value()]) for p in model_geometry.voxels()]) |
---|
| 228 | return voxels |
---|
| 229 | |
---|
| 230 | |
---|
[1266] | 231 | def normalize(self, signature): |
---|
[1267] | 232 | """Normalizes the signature values by dividing each element by the sum of all elements. |
---|
[1266] | 233 | Args: |
---|
[1267] | 234 | signature np.array(,dtype=float): A one-dimensional array of signature values. |
---|
[1266] | 235 | |
---|
| 236 | Returns: |
---|
[1267] | 237 | np.array(,dtype=float): A one-dimensional array of normalized signature values. |
---|
[1266] | 238 | """ |
---|
| 239 | total = np.sum(signature) |
---|
| 240 | return np.divide(signature, total) |
---|
| 241 | |
---|
| 242 | |
---|
[1208] | 243 | def calculateDissimforVoxels(self, voxels1, voxels2): |
---|
[1325] | 244 | """Calculates EMD for a pair of voxels representing models. |
---|
[1208] | 245 | Args: |
---|
| 246 | voxels1 np.array([np.array(,dtype=float)]: list of voxels representing model1. |
---|
| 247 | voxels2 np.array([np.array(,dtype=float)]: list of voxels representing model2. |
---|
| 248 | |
---|
| 249 | Returns: |
---|
[1325] | 250 | float: dissim for a pair of list of voxels |
---|
[1208] | 251 | """ |
---|
| 252 | numvox1 = len(voxels1) |
---|
| 253 | numvox2 = len(voxels2) |
---|
| 254 | |
---|
| 255 | s1, s2 = self.getSignaturesForPair(voxels1, voxels2) |
---|
| 256 | |
---|
[1224] | 257 | reduce_fun = self.reduceEmptySignatures_Frequency if self.frequency else self.reduceEmptySignatures_Density |
---|
| 258 | if self.reduce_empty: |
---|
[1208] | 259 | s1, s2 = reduce_fun(s1,s2) |
---|
| 260 | |
---|
| 261 | if not self.frequency: |
---|
| 262 | if numvox1 != sum(s1[1]) or numvox2 != sum(s2[1]): |
---|
[1224] | 263 | print("Voxel reduction didn't work properly") |
---|
[1208] | 264 | print("Base voxels fig1: ", numvox1, " fig2: ", numvox2) |
---|
| 265 | print("After reduction voxels fig1: ", sum(s1[1]), " fig2: ", sum(s2[1])) |
---|
[1224] | 266 | raise RuntimeError("Voxel reduction error!") |
---|
[1208] | 267 | |
---|
[1269] | 268 | if self.frequency: |
---|
| 269 | s1 = abs(np.fft.fft(s1)) |
---|
| 270 | s2 = abs(np.fft.fft(s2)) |
---|
| 271 | |
---|
[1208] | 272 | if self.metric == 'l1': |
---|
| 273 | if self.frequency: |
---|
[1325] | 274 | out = np.linalg.norm(s1-s2, ord=1) |
---|
[1208] | 275 | else: |
---|
[1325] | 276 | out = np.linalg.norm(s1[1]-s2[1], ord=1) |
---|
[1208] | 277 | |
---|
| 278 | elif self.metric == 'l2': |
---|
| 279 | if self.frequency: |
---|
[1325] | 280 | out = np.linalg.norm(s1-s2) |
---|
[1208] | 281 | else: |
---|
[1325] | 282 | out = np.linalg.norm(s1[1]-s2[1]) |
---|
[1208] | 283 | |
---|
| 284 | elif self.metric == 'emd': |
---|
| 285 | if self.frequency: |
---|
[1268] | 286 | num_points = np.linspace(0, 1, len(s1), True) |
---|
| 287 | dist_matrix = self.calculateDistanceMatrix(num_points,num_points) |
---|
[1208] | 288 | else: |
---|
| 289 | dist_matrix = self.calculateDistanceMatrix(s1[0],s2[0]) |
---|
| 290 | |
---|
[1322] | 291 | if self.libm is not None: |
---|
| 292 | self.libm.fedisableexcept(0x04) # change default flag value - don't cause "Floating point exception" when dividing by 0 (pyemd does that, for example when comparing two identical histograms, i.e., two identical signatures, for example from two identical phenotypes) |
---|
[1208] | 293 | |
---|
| 294 | if self.frequency: |
---|
[1266] | 295 | out = emd(self.normalize(s1),self.normalize(s2),np.array(dist_matrix,dtype=np.float64)) |
---|
[1208] | 296 | else: |
---|
[1266] | 297 | out = emd(self.normalize(s1[1]),self.normalize(s2[1]),dist_matrix) |
---|
[1208] | 298 | |
---|
[1322] | 299 | if self.libm is not None: |
---|
| 300 | self.libm.feclearexcept(0x04) # restoring default flag values... |
---|
[1295] | 301 | self.libm.feenableexcept(0x04) |
---|
[1208] | 302 | |
---|
| 303 | else: |
---|
| 304 | raise ValueError("Wrong metric '%s'"%self.metric) |
---|
| 305 | |
---|
| 306 | return out |
---|
| 307 | |
---|
| 308 | |
---|
| 309 | def calculateDissimforGeno(self, geno1, geno2): |
---|
[1219] | 310 | """Calculates EMD for a pair of genotypes. |
---|
[1208] | 311 | Args: |
---|
[1224] | 312 | geno1 (string): representation of model1 in one of the formats supported by Framsticks, http://www.framsticks.com/a/al_genotype.html |
---|
| 313 | geno2 (string): representation of model2 in one of the formats supported by Framsticks, http://www.framsticks.com/a/al_genotype.html |
---|
[1208] | 314 | |
---|
| 315 | Returns: |
---|
[1325] | 316 | float: dissim for a pair of strings representing models. |
---|
[1208] | 317 | """ |
---|
| 318 | |
---|
| 319 | voxels1 = self.getVoxels(geno1) |
---|
| 320 | voxels2 = self.getVoxels(geno2) |
---|
| 321 | |
---|
| 322 | out = self.calculateDissimforVoxels(voxels1, voxels2) |
---|
| 323 | |
---|
| 324 | if self.verbose == True: |
---|
[1224] | 325 | print("Intervals: ", self.resolution) |
---|
[1208] | 326 | print("Geno1:\n",geno1) |
---|
| 327 | print("Geno2:\n",geno2) |
---|
| 328 | print("EMD:\n",out) |
---|
| 329 | |
---|
| 330 | return out |
---|
| 331 | |
---|
| 332 | |
---|
| 333 | def getDissimilarityMatrix(self,listOfGeno): |
---|
| 334 | """ |
---|
| 335 | Args: |
---|
[1224] | 336 | listOfGeno ([string]): list of strings representing genotypes in one of the formats supported by Framsticks, http://www.framsticks.com/a/al_genotype.html |
---|
[1208] | 337 | |
---|
| 338 | Returns: |
---|
| 339 | np.array(np.array(,dtype=float)): dissimilarity matrix of EMD for given list of genotypes |
---|
| 340 | """ |
---|
| 341 | numOfGeno = len(listOfGeno) |
---|
| 342 | dissimMatrix = np.zeros(shape=[numOfGeno,numOfGeno]) |
---|
| 343 | listOfVoxels = [self.getVoxels(g) for g in listOfGeno] |
---|
| 344 | for i in range(numOfGeno): |
---|
[1325] | 345 | for j in range(numOfGeno): # could only calculate a triangle if the definition of similarity and its calculation guarantees symmetry |
---|
[1208] | 346 | dissimMatrix[i,j] = self.calculateDissimforVoxels(listOfVoxels[i], listOfVoxels[j]) |
---|
| 347 | return dissimMatrix |
---|