1 | import numpy as np |
---|
2 | from pyemd import emd |
---|
3 | from ctypes import cdll |
---|
4 | from ctypes.util import find_library |
---|
5 | from alignmodel import align |
---|
6 | |
---|
class DensityDistribution:
    """Two dissimilarity measures based on the spatial distribution of two Models.

    The Model bounding box is divided into a grid of equally-sized cuboids, the
    number of which is the 'resolution' parameter cubed. Then the Model surface
    is covered with points; the density of the surface sampling is determined
    by the 'density' parameter.

    There are two versions of the measure. In the default version
    ('frequency'=False), a signature of each cuboid is the centroid and the
    number of samples. In the 'frequency'=True version, FFT is computed from
    the vector containing the number of samples in each cuboid.

    The final result of the dissimilarity measure is the distance between the
    signatures and it can be computed using EMD, L1, or L2 norms (the 'metric'
    parameter).
    """

    # libm is used for disabling/enabling floating point exceptions (division
    # by zero occurs in the EMD library). Loading is best-effort so that the
    # class can still be defined when no math library can be loaded; in that
    # case the exception mask is simply left untouched.
    try:
        libm = cdll.LoadLibrary(find_library('m'))
    except (OSError, TypeError):
        libm = None

    EPSILON = 0.0001

    # Mask passed to fedisableexcept()/feclearexcept()/feenableexcept().
    # NOTE(review): 0x04 is the division-by-zero flag in glibc on x86; other
    # architectures may use a different value - confirm before porting.
    _FE_DIVBYZERO_MASK = 0x04

    def __init__(self, frams_module=None, density = 10, resolution = 8, reduce_empty=True, frequency=False, metric = 'emd', fixedZaxis=False, verbose=False):
        """ __init__
        Args:
            frams_module (module): the Framsticks module used to build Models and their geometry. Required.
            density (int, optional): density of samplings for frams.ModelGeometry. Defaults to 10.
            resolution (int, optional): How many intervals are used in each dimension to partition surface samples of Models in the 3D space.
                The higher the value, the more detailed the comparison and the longer the calculations. Defaults to 8.
            reduce_empty (bool, optional): If we should use reduction to remove blank samples. Defaults to True.
            frequency (bool, optional): If we should use frequency distribution. Defaults to False.
            metric (string, optional): The distance metric that should be used ('emd', 'l1', or 'l2'). Defaults to 'emd'.
            fixedZaxis (bool, optional): If the z axis should be fixed during alignment. Defaults to False.
            verbose (bool, optional): Turning on logging, works only for calculateDissimforGeno. Defaults to False.

        Raises:
            ValueError: if frams_module is not provided.
        """
        if frams_module is None:
            raise ValueError('Framsticks module not provided!')
        self.frams = frams_module

        self.density = density
        self.resolution = resolution
        self.verbose = verbose
        self.reduce_empty = reduce_empty
        self.frequency = frequency
        self.metric = metric
        self.fixedZaxis = fixedZaxis


    def calculateNeighberhood(self,array,mean_coords):
        """ Calculates number of elements for given sample and sets up the center of this sample
        to the center of mass (calculated by mean of every coordinate)
        Args:
            array ([[float,float,float],...,[float,float,float]]): array of voxels that belong to given sample.
            mean_coords ([float,float,float]): default coordinates that are the
                middle of the sample (used when number of voxels in sample is equal to 0)

        Returns:
            weight [int]: number of voxels in a sample
            coordinates [float,float,float]: center of mass for a sample
        """
        weight = len(array)
        if weight == 0:
            return 0, mean_coords
        point = [np.mean(array[:,0]), np.mean(array[:,1]), np.mean(array[:,2])]
        return weight, point


    def calculateDistPoints(self,point1, point2):
        """ Returns euclidean distance between two points
        Args (distribution):
            point1 ([float,float,float]) - coordinates of first point
            point2 ([float,float,float]) - coordinates of second point
        Args (frequency):
            point1 (float) - value of the first sample
            point2 (float) - value of the second sample

        Returns:
            [float]: euclidean distance
        """
        if self.frequency:
            return abs(point1-point2)
        # np.asarray lets plain lists/tuples be used as points, too
        difference = np.asarray(point1) - np.asarray(point2)
        return np.sqrt(np.sum(np.square(difference)))


    def calculateDistanceMatrix(self,array1, array2):
        """Computes all pairwise distances between the elements of two arrays.

        Element [i][j] is the distance between array1[i] and array2[j]:
        the absolute difference for one-dimensional input (used in the
        'frequency' mode) and the euclidean distance between points otherwise.
        The whole matrix is computed with numpy broadcasting instead of one
        Python-level call per pair.

        Args:
            array1 ([type]): array of size n with points representing the first Model
            array2 ([type]): array of size n with points representing the second Model

        Returns:
            np.array(np.array(,dtype=float)): distance matrix n*n

        Raises:
            IndexError: if array2 has fewer elements than array1.
        """
        first = np.asarray(array1, dtype=np.float64)
        n = len(first)
        if len(array2) < n:
            raise IndexError("array2 has fewer elements (%d) than array1 (%d)" % (len(array2), n))
        # only the first n elements of array2 are used, so the matrix is always n*n
        second = np.asarray(array2, dtype=np.float64)[:n]

        if self.frequency or first.ndim == 1:
            return np.abs(first[:, np.newaxis] - second[np.newaxis, :])

        distMatrix = np.zeros((n, n))
        if n == 0:
            return distMatrix
        # Accumulate squared differences one coordinate at a time: this needs
        # only one temporary n*n array instead of an n*n*3 one.
        for axis in range(first.shape[1]):
            delta = first[:, axis, np.newaxis] - second[np.newaxis, :, axis]
            distMatrix += np.square(delta)
        return np.sqrt(distMatrix)


    def reduceEmptySignatures_Frequency(self,s1,s2):
        """Removes samples from signatures if corresponding samples for both models have weight 0.
        Args:
            s1 (np.array(,dtype=np.float64)): values of samples
            s2 (np.array(,dtype=np.float64)): values of samples

        Returns:
            s1new (np.array(,dtype=np.float64)): values of samples after reduction
            s2new (np.array(,dtype=np.float64)): values of samples after reduction
        """
        s1 = np.asarray(s1)
        s2 = np.asarray(s2)
        # a sample is kept when it is non-zero in at least one of the signatures
        keep = (s1 != 0) | (s2 != 0)
        return s1[keep], s2[keep]


    def reduceEmptySignatures_Density(self,s1,s2):
        """Removes samples from signatures if corresponding samples for both models have weight 0.
        Args:
            s1 ([np.array(,dtype=np.float64),np.array(,dtype=np.float64)]): [coordinates of samples, weights]
            s2 ([np.array(,dtype=np.float64),np.array(,dtype=np.float64)]): [coordinates of samples, weights]

        Returns:
            s1new ([np.array(,dtype=np.float64),np.array(,dtype=np.float64)]): [coordinates of samples, weights] after reduction
            s2new ([np.array(,dtype=np.float64),np.array(,dtype=np.float64)]): [coordinates of samples, weights] after reduction
        """
        points1, weights1 = np.asarray(s1[0]), np.asarray(s1[1])
        points2, weights2 = np.asarray(s2[0]), np.asarray(s2[1])
        # a sample is kept when its weight is non-zero in at least one of the signatures
        keep = (weights1 != 0) | (weights2 != 0)
        return [points1[keep], weights1[keep]], [points2[keep], weights2[keep]]


    def getSignatures(self,array,edges3,steps3):
        """Generates signature for array representing the Model. Signature is composed of list of points [x,y,z] (float) and list of weights (int).

        Cuboids are visited with x as the outermost and z as the innermost
        index. A voxel belongs to a cuboid when, on every axis, it is strictly
        greater than the lower edge and not greater than the upper edge.

        Args:
            array (np.array(np.array(,dtype=float))): array with voxels representing the Model
            edges3 ([np.array(,dtype=float),np.array(,dtype=float),np.array(,dtype=float)]): lists with edges for each step for each axis in order x,y,z
            steps3 ([float,float,float]): [size of interval for x axis, size of interval for y axis, size of interval for z axis]

        Returns (distribution):
            signature [np.array(,dtype=np.float64),np.array(,dtype=np.float64)]: returns signature [np.array of points, np.array of weights]
        Returns (frequency):
            signature np.array(,dtype=np.float64): returns signature np.array of the numbers of voxels in each cuboid
        """
        edges_x, edges_y, edges_z = edges3
        step_x, step_y, step_z = steps3
        step_x_half = step_x/2
        step_y_half = step_y/2
        step_z_half = step_z/2

        feature_array = []
        weight_array = []
        # Voxels are filtered axis by axis, so the x and x-y selections are
        # computed once per slab/column instead of once per cuboid. Boolean
        # indexing keeps the original voxel order.
        for x in range(len(edges_x)-1):
            in_slab = array[(array[:,0] > edges_x[x]) & (array[:,0] <= edges_x[x+1])]
            for y in range(len(edges_y)-1):
                in_column = in_slab[(in_slab[:,1] > edges_y[y]) & (in_slab[:,1] <= edges_y[y+1])]
                for z in range(len(edges_z)-1):
                    in_cuboid = in_column[(in_column[:,2] > edges_z[z]) & (in_column[:,2] <= edges_z[z+1])]
                    if self.frequency:
                        feature_array.append(len(in_cuboid))
                    else:
                        cuboid_center = [edges_x[x]+step_x_half, edges_y[y]+step_y_half, edges_z[z]+step_z_half]
                        weight, point = self.calculateNeighberhood(in_cuboid, cuboid_center)
                        feature_array.append(point)
                        weight_array.append(weight)

        if self.frequency:
            return np.array(feature_array,dtype=np.float64)
        return [np.array(feature_array,dtype=np.float64), np.array(weight_array,dtype=np.float64)]


    def getSignaturesForPair(self,array1,array2):
        """Generates signatures for given pair of models represented by array of voxels.
        We calculate space for given models by taking the extremas for each axis and dividing the space by the resolution.
        This divided space generates samples which contain points. Each sample will have new coordinates which are the mean of all points from it and a weight which equals the number of points.

        Args:
            array1 (np.array(np.array(,dtype=float))): array with voxels representing model1
            array2 (np.array(np.array(,dtype=float))): array with voxels representing model2

        Returns (distribution):
            s1 ([np.array(,dtype=np.float64),np.array(,dtype=np.float64)]): [coordinates of samples, weights]
            s2 ([np.array(,dtype=np.float64),np.array(,dtype=np.float64)]): [coordinates of samples, weights]
        Returns (frequency):
            s1 (np.array(,dtype=np.float64)): numbers of voxels in each cuboid for model1
            s2 (np.array(,dtype=np.float64)): numbers of voxels in each cuboid for model2
        """
        edges3 = []
        steps3 = []
        for axis in range(3): # x, y, z
            lowest = np.min([np.min(array1[:,axis]), np.min(array2[:,axis])])
            highest = np.max([np.max(array1[:,axis]), np.max(array2[:,axis])])
            # We request self.resolution+1 samples since we need self.resolution intervals
            edges, step = np.linspace(lowest, highest, self.resolution+1, retstep=True)
            # EPSILON subtracted to deal with boundary voxels (one-sided open intervals and comparisons in loops in function getSignatures())
            edges[0] -= self.EPSILON
            edges3.append(edges)
            steps3.append(step)

        edges3 = tuple(edges3)
        steps3 = tuple(steps3)

        s1 = self.getSignatures(array1,edges3,steps3)
        s2 = self.getSignatures(array2,edges3,steps3)

        return s1,s2


    def getVoxels(self,geno):
        """Generates voxels for genotype using frams.ModelGeometry

        The Model is created from the genotype, aligned with align() (passing
        the 'fixedZaxis' setting), and then sampled with the configured density.

        Args:
            geno (string): representation of Model in one of the formats supported by Framsticks, http://www.framsticks.com/a/al_genotype.html

        Returns:
            np.array([np.array(,dtype=float)]: list of voxels representing the Model.
        """
        model = self.frams.Model.newFromString(geno)
        align(model, self.fixedZaxis)
        model_geometry = self.frams.ModelGeometry.forModel(model)

        model_geometry.geom_density = self.density
        voxels = np.array([[p.x._value(), p.y._value(), p.z._value()] for p in model_geometry.voxels()])
        return voxels


    def normalize(self, signature):
        """Normalizes the signature values by dividing each element by the sum of all elements.
        Args:
            signature np.array(,dtype=float): A one-dimensional array of signature values.

        Returns:
            np.array(,dtype=float): A one-dimensional array of normalized signature values.
        """
        total = np.sum(signature)
        return np.divide(signature, total)


    def _emdWithoutDivByZeroTrap(self, values1, values2, dist_matrix):
        """Computes EMD between two normalized signatures with the division-by-zero trap switched off.

        The trap is switched off because pyemd divides by zero internally.
        Afterwards the flag is cleared and the trap is enabled again - also
        when the computation raises an exception, so a failure does not leave
        the process with a modified floating point environment.
        When libm or its fedisableexcept() function is unavailable, the
        floating point environment is not touched at all.

        Args:
            values1 np.array(,dtype=float): non-normalized signature values of the first Model
            values2 np.array(,dtype=float): non-normalized signature values of the second Model
            dist_matrix np.array(np.array(,dtype=float)): ground distance matrix

        Returns:
            float: EMD between the normalized signatures
        """
        disable_trap = getattr(self.libm, 'fedisableexcept', None)
        if disable_trap is not None:
            disable_trap(self._FE_DIVBYZERO_MASK) # change default flag value - don't cause exceptions when dividing by 0 (pyemd does it)
        try:
            # normalization stays inside the protected region, as it divides, too
            return emd(self.normalize(values1), self.normalize(values2), np.array(dist_matrix,dtype=np.float64))
        finally:
            if disable_trap is not None: # restoring default flag values...
                self.libm.feclearexcept(self._FE_DIVBYZERO_MASK)
                self.libm.feenableexcept(self._FE_DIVBYZERO_MASK)


    def calculateDissimforVoxels(self, voxels1, voxels2):
        """Calculates dissimilarity (using the configured metric) for pair of voxels representing models.
        Args:
            voxels1 np.array([np.array(,dtype=float)]: list of voxels representing model1.
            voxels2 np.array([np.array(,dtype=float)]: list of voxels representing model2.

        Returns:
            float: dissim for pair of list of voxels

        Raises:
            RuntimeError: if (in the non-frequency version) the weights of a signature do not sum up to the number of voxels.
            ValueError: if the 'metric' is not one of 'emd', 'l1', 'l2'.
        """
        numvox1 = len(voxels1)
        numvox2 = len(voxels2)

        s1, s2 = self.getSignaturesForPair(voxels1, voxels2)

        if self.reduce_empty:
            if self.frequency:
                s1, s2 = self.reduceEmptySignatures_Frequency(s1,s2)
            else:
                s1, s2 = self.reduceEmptySignatures_Density(s1,s2)

        if self.frequency:
            s1 = abs(np.fft.fft(s1))
            s2 = abs(np.fft.fft(s2))
            values1, values2 = s1, s2
        else:
            # sanity check: every voxel must have been assigned to exactly one sample
            if numvox1 != sum(s1[1]) or numvox2 != sum(s2[1]):
                print("Voxel reduction didn't work properly")
                print("Base voxels fig1: ", numvox1, " fig2: ", numvox2)
                print("After reduction voxels fig1: ", sum(s1[1]), " fig2: ", sum(s2[1]))
                raise RuntimeError("Voxel reduction error!")
            values1, values2 = s1[1], s2[1] # L1, L2 and the EMD histograms use only the weights

        if self.metric == 'l1':
            return np.linalg.norm((values1-values2), ord=1)

        if self.metric == 'l2':
            return np.linalg.norm((values1-values2))

        if self.metric == 'emd':
            if self.frequency:
                # FFT coefficients have no spatial coordinates, so they are spread evenly over [0,1]
                num_points = np.linspace(0, 1, len(s1), True)
                dist_matrix = self.calculateDistanceMatrix(num_points,num_points)
            else:
                dist_matrix = self.calculateDistanceMatrix(s1[0],s2[0])
            return self._emdWithoutDivByZeroTrap(values1, values2, dist_matrix)

        raise ValueError("Wrong metric '%s'"%self.metric)


    def calculateDissimforGeno(self, geno1, geno2):
        """Calculates dissimilarity (using the configured metric) for a pair of genotypes.
        Args:
            geno1 (string): representation of model1 in one of the formats supported by Framsticks, http://www.framsticks.com/a/al_genotype.html
            geno2 (string): representation of model2 in one of the formats supported by Framsticks, http://www.framsticks.com/a/al_genotype.html

        Returns:
            float: dissim for pair of strings representing models.
        """

        voxels1 = self.getVoxels(geno1)
        voxels2 = self.getVoxels(geno2)

        out = self.calculateDissimforVoxels(voxels1, voxels2)

        if self.verbose:
            print("Intervals: ", self.resolution)
            print("Geno1:\n",geno1)
            print("Geno2:\n",geno2)
            print("EMD:\n",out)

        return out


    def getDissimilarityMatrix(self,listOfGeno):
        """Calculates dissimilarity for every ordered pair of the given genotypes.

        Voxels are generated once per genotype and reused for all pairs.

        Args:
            listOfGeno ([string]): list of strings representing genotypes in one of the formats supported by Framsticks, http://www.framsticks.com/a/al_genotype.html

        Returns:
            np.array(np.array(,dtype=float)): dissimilarity matrix for given list of genotypes
        """
        numOfGeno = len(listOfGeno)
        dissimMatrix = np.zeros(shape=[numOfGeno,numOfGeno])
        listOfVoxels = [self.getVoxels(g) for g in listOfGeno]
        for i in range(numOfGeno):
            for j in range(numOfGeno):
                dissimMatrix[i,j] = self.calculateDissimforVoxels(listOfVoxels[i], listOfVoxels[j])
        return dissimMatrix