source: framspy/dissimilarity/density-distribution.py @ 1224

Last change on this file since 1224 was 1224, checked in by Maciej Komosinski, 20 months ago

Less ambiguous names of variables, improved docs

File size: 15.4 KB
Line 
1import numpy as np
2from pyemd import emd
3from ctypes import cdll
4from ctypes.util import find_library
5from alignmodel import align
6
7class DensityDistribution:
8    """Two dissimilarity measures based on the spatial distribution of two Models. The Model bounding box is divided into a grid of equally-sized cuboids, the number of which is the 'resolution' parameter cubed. Then the Model surface is covered with points; the density of the surface sampling is determined by the 'density' parameter. There are two versions of the measure. In the default version ('frequency'=False), a signature of each cuboid is the centroid and the number of samples. In the 'frequency'=True version, FFT is computed from the vector containing the number of samples in each cuboid. The final result of the dissimilarity measure is the distance between the signatures and it can be computed using EMD, L1, or L2 norms (the 'metric' parameter).
9    """
10   
11    libm = cdll.LoadLibrary(find_library('m')) # for disabling/enabling floating point exceptions (division by zero occurs in the EMD library)
12    EPSILON = 0.0001
13   
14    def __init__(self, frams_module=None, density = 10, resolution = 3, reduce_empty=True, frequency=False, metric = 'emd', fixedZaxis=False, verbose=False):
15        """ __init__
16        Args:
17            density (int, optional): density of samplings for frams.ModelGeometry. Defaults to 10.
18            resolution (int, optional): How many intervals are used in each dimension to partition surface samples of Models in the 3D space.
19                The higher the value, the more detailed the comparison and the longer the calculations. Defaults to 3.
20            reduce_empty (bool, optional): If we should use reduction to remove blank samples. Defaults to True.
21            frequency (bool, optional): If we should use frequency distribution. Defaults to False.
22            metric (string, optional): The distance metric that should be used ('emd', 'l1', or 'l2'). Defaults to 'emd'.
23            fixedZaxis (bool, optional): If the z axis should be fixed during alignment. Defaults to False.
24            verbose (bool, optional): Turning on logging, works only for calculateEMDforGeno. Defaults to False.           
25        """
26        if frams_module is None:
27            raise ValueError('Framsticks module not provided!')
28        self.frams = frams_module
29
30        self.density = density
31        self.resolution = resolution
32        self.verbose = verbose
33        self.reduce_empty = reduce_empty
34        self.frequency = frequency
35        self.metric = metric
36        self.fixedZaxis = fixedZaxis
37
38
39    def calculateNeighberhood(self,array,mean_coords):
40        """ Calculates number of elements for given sample and set ups the center of this sample
41        to the center of mass (calculated by mean of every coordinate)
42        Args:
43            array ([[float,float,float],...,[float,float,float]]): array of voxels that belong to given sample.
44            mean_coords ([float,float,float]): default coordinates that are the
45                middle of the sample (used when number of voxels in sample is equal to 0)
46
47        Returns:
48            weight [int]: number of voxels in a sample
49            coordinates [float,float,float]: center of mass for a sample
50        """
51        weight = len(array)
52        if weight > 0:
53            point = [np.mean(array[:,0]),np.mean(array[:,1]),np.mean(array[:,2])]
54            return weight, point
55        else:
56            return 0, mean_coords
57
58
59    def calculateDistPoints(self,point1, point2):
60        """ Returns euclidean distance between two points
61        Args (distribution):
62            point1 ([float,float,float]) - coordinates of first point
63            point2 ([float,float,float]) - coordinates of second point
64        Args (frequency):
65            point1 (float) - value of the first sample
66            point2 (float) - value of the second sample
67
68        Returns:
69            [float]: euclidean distance
70        """
71        if self.frequency:
72            return abs(point1-point2)
73        else:
74            return np.sqrt(np.sum(np.square(point1-point2)))
75
76
77    def calculateDistanceMatrix(self,array1, array2):
78        """
79        Args:
80            array1 ([type]): array of size n with points representing the first Model
81            array2 ([type]): array of size n with points representing the second Model
82
83        Returns:
84            np.array(np.array(,dtype=float)): distance matrix n*n
85        """
86        n = len(array1)
87        distMatrix = np.zeros((n,n))
88        for i in range(n):
89            for j in range(n):
90                distMatrix[i][j] = self.calculateDistPoints(array1[i], array2[j])
91        return np.array(distMatrix)
92
93
94    def reduceEmptySignatures_Frequency(self,s1,s2):
95        """Removes samples from signatures if corresponding samples for both models have weight 0.
96        Args:
97            s1 (np.array(,dtype=np.float64)): values of samples
98            s2 (np.array(,dtype=np.float64)): values of samples
99
100        Returns:
101            s1new (np.array(,dtype=np.float64)): coordinates of samples after reduction
102            s2new (np.array(,dtype=np.float64)): coordinates of samples after reduction
103        """
104        lens = len(s1)
105        indices = []
106        for i in range(lens):
107            if s1[i]==0 and s2[i]==0:
108                    indices.append(i)
109
110        return np.delete(s1, indices), np.delete(s2, indices)
111
112
113    def reduceEmptySignatures_Density(self,s1,s2):
114        """Removes samples from signatures if corresponding samples for both models have weight 0.
115        Args:
116            s1 ([np.array(,dtype=np.float64),np.array(,dtype=np.float64)]): [coordinates of samples, weights]
117            s2 ([np.array(,dtype=np.float64),np.array(,dtype=np.float64)]): [coordinates of samples, weights]
118
119        Returns:
120            s1new ([np.array(,dtype=np.float64),np.array(,dtype=np.float64)]): [coordinates of samples, weights] after reduction
121            s2new ([np.array(,dtype=np.float64),np.array(,dtype=np.float64)]): [coordinates of samples, weights] after reduction
122        """
123        lens = len(s1[0])
124        indices = []
125        for i in range(lens):
126            if s1[1][i]==0 and s2[1][i]==0:
127                indices.append(i)
128
129        s1 = [np.delete(s1[0], indices, axis=0), np.delete(s1[1], indices, axis=0)]
130        s2 = [np.delete(s2[0], indices, axis=0), np.delete(s2[1], indices, axis=0)]
131        return s1, s2
132
133
134    def getSignatures(self,array,edges3,steps3):
135        """Generates signature for array representing the Model. Signature is composed of list of points [x,y,z] (float) and list of weights (int).
136
137        Args:
138            array (np.array(np.array(,dtype=float))): array with voxels representing the Model
139            edges3 ([np.array(,dtype=float),np.array(,dtype=float),np.array(,dtype=float)]): lists with edges for each step for each axis in order x,y,z
140            steps3 ([float,float,float]): [size of interval for x axis, size of interval for y axis, size of interval for y axis]
141
142        Returns (distribution):
143           signature [np.array(,dtype=np.float64),np.array(,dtype=np.float64)]: returns signatuere [np.array of points, np.array of weights]
144        Returns (frequency):
145           signature np.array(,dtype=np.float64): returns signatuere np.array of coefficients
146        """
147        edges_x,edges_y,edges_z = edges3
148        step_x,step_y,step_z=steps3
149        feature_array = []
150        weight_array = []
151        step_x_half = step_x/2
152        step_y_half = step_y/2
153        step_z_half = step_z/2
154        for x in range(len(edges_x[:-1])):
155            for y in range(len(edges_y[:-1])) :
156                for z in range(len(edges_z[:-1])):
157                    rows=np.where((array[:,0]> edges_x[x]) &
158                                  (array[:,0]<= edges_x[x+1]) &
159                                  (array[:,1]> edges_y[y]) &
160                                  (array[:,1]<= edges_y[y+1]) &
161                                  (array[:,2]> edges_z[z]) &
162                                  (array[:,2]<= edges_z[z+1]))
163                    if self.frequency:
164                        feature_array.append(len(array[rows]))
165                    else:
166                        weight, point = self.calculateNeighberhood(array[rows],[edges_x[x]+step_x_half,edges_y[y]+step_y_half,edges_z[z]+step_z_half])
167                        feature_array.append(point)
168                        weight_array.append(weight)
169
170        if self.frequency:
171            samples = np.array(feature_array,dtype=np.float64)
172            return abs(np.fft.fft(samples))
173        else:
174            return [np.array(feature_array,dtype=np.float64), np.array(weight_array,dtype=np.float64)]
175
176
177    def getSignaturesForPair(self,array1,array2):
178        """Generates signatures for given pair of models represented by array of voxels.
179        We calculate space for given models by taking the extremas for each axis and dividing the space by the resolution.
180        This divided space generate us samples which contains points. Each sample will have new coordinates which are mean of all points from it and weight which equals to the number of points.
181       
182        Args:
183            array1 (np.array(np.array(,dtype=float))): array with voxels representing model1
184            array2 (np.array(np.array(,dtype=float))): array with voxels representing model2
185
186        Returns:
187            s1 ([np.array(,dtype=np.float64),np.array(,dtype=np.float64)]): [coordinates of samples, weights]
188            s2 ([np.array(,dtype=np.float64),np.array(,dtype=np.float64)]): [coordinates of samples, weights]
189        """
190
191        min_x = np.min([np.min(array1[:,0]),np.min(array2[:,0])])
192        max_x = np.max([np.max(array1[:,0]),np.max(array2[:,0])])
193        min_y = np.min([np.min(array1[:,1]),np.min(array2[:,1])])
194        max_y = np.max([np.max(array1[:,1]),np.max(array2[:,1])])
195        min_z = np.min([np.min(array1[:,2]),np.min(array2[:,2])])
196        max_z = np.max([np.max(array1[:,2]),np.max(array2[:,2])])
197
198        # We request self.resolution+1 samples since we need self.resolution intervals
199        edges_x,step_x = np.linspace(min_x,max_x,self.resolution+1,retstep=True)
200        edges_y,step_y = np.linspace(min_y,max_y,self.resolution+1,retstep=True)
201        edges_z,step_z = np.linspace(min_z,max_z,self.resolution+1,retstep=True)
202       
203        for edges in (edges_x, edges_y, edges_z):  # EPSILON subtracted to deal with boundary voxels (one-sided open intervals and comparisons in loops in function getSignatures())
204            edges[0] -= self.EPSILON
205
206        edges3 = (edges_x,edges_y,edges_z)
207        steps3 = (step_x,step_y,step_z)
208       
209        s1 = self.getSignatures(array1,edges3,steps3)
210        s2 = self.getSignatures(array2,edges3,steps3)   
211       
212        return s1,s2
213
214
215    def getVoxels(self,geno):
216        """Generates voxels for genotype using frams.ModelGeometry
217
218        Args:
219            geno (string): representation of Model in one of the formats supported by Framsticks, http://www.framsticks.com/a/al_genotype.html
220
221        Returns:
222            np.array([np.array(,dtype=float)]: list of voxels representing the Model.
223        """
224        model = self.frams.Model.newFromString(geno)
225        align(model, self.fixedZaxis)
226        model_geometry = self.frams.ModelGeometry.forModel(model)
227
228        model_geometry.geom_density = self.density
229        voxels = np.array([np.array([p.x._value(),p.y._value(),p.z._value()]) for p in model_geometry.voxels()])
230        return voxels
231
232
233    def calculateDissimforVoxels(self, voxels1, voxels2):
234        """Calculates EMD for pair of voxels representing models.
235        Args:
236            voxels1 np.array([np.array(,dtype=float)]: list of voxels representing model1.
237            voxels2 np.array([np.array(,dtype=float)]: list of voxels representing model2.
238
239        Returns:
240            float: dissim for pair of list of voxels
241        """
242        numvox1 = len(voxels1)
243        numvox2 = len(voxels2)   
244
245        s1, s2 = self.getSignaturesForPair(voxels1, voxels2)
246
247        if numvox1 != sum(s1[1]) or numvox2 != sum(s2[1]):
248            print("Bad signature!")
249            print("Base voxels fig1: ", numvox1, " fig2: ", numvox2)
250            print("After reduction voxels fig1: ", sum(s1[1]), " fig2: ", sum(s2[1]))
251            raise RuntimeError("Bad signature!")
252
253        reduce_fun = self.reduceEmptySignatures_Frequency if self.frequency else self.reduceEmptySignatures_Density
254        if self.reduce_empty:
255            s1, s2 = reduce_fun(s1,s2)
256
257            if not self.frequency:
258                if numvox1 != sum(s1[1]) or numvox2 != sum(s2[1]):
259                    print("Voxel reduction didn't work properly")
260                    print("Base voxels fig1: ", numvox1, " fig2: ", numvox2)
261                    print("After reduction voxels fig1: ", sum(s1[1]), " fig2: ", sum(s2[1]))
262                    raise RuntimeError("Voxel reduction error!")
263       
264        if self.metric == 'l1':
265            if self.frequency:
266                out = np.linalg.norm((s1-s2), ord=1)
267            else:
268                out = np.linalg.norm((s1[1]-s2[1]), ord=1)
269
270        elif self.metric == 'l2':
271            if self.frequency:
272                out = np.linalg.norm((s1-s2))
273            else:
274                out = np.linalg.norm((s1[1]-s2[1]))
275
276        elif self.metric == 'emd':
277            if self.frequency:
278                num_points = len(s1)
279                dist_matrix = self.calculateDistanceMatrix(range(num_points),range(num_points))
280            else:
281                dist_matrix = self.calculateDistanceMatrix(s1[0],s2[0])
282
283            self.libm.fedisableexcept(0x04)  # change default flag value - don't cause exceptions when dividing by 0 (pyemd does it)
284
285            if self.frequency:
286                out = emd(s1,s2,np.array(dist_matrix,dtype=np.float64))
287            else:
288                out = emd(s1[1],s2[1],dist_matrix)
289
290            self.libm.feclearexcept(0x04) # restoring default flag values...
291            self.libm.feenableexcept(0x04)
292
293        else:
294            raise ValueError("Wrong metric '%s'"%self.metric)
295
296        return out
297
298
299    def calculateDissimforGeno(self, geno1, geno2):
300        """Calculates EMD for a pair of genotypes.
301        Args:
302            geno1 (string): representation of model1 in one of the formats supported by Framsticks, http://www.framsticks.com/a/al_genotype.html
303            geno2 (string): representation of model2 in one of the formats supported by Framsticks, http://www.framsticks.com/a/al_genotype.html
304
305        Returns:
306            float: dissim for pair of strings representing models.
307        """     
308
309        voxels1 = self.getVoxels(geno1)
310        voxels2 = self.getVoxels(geno2)
311
312        out = self.calculateDissimforVoxels(voxels1, voxels2)
313
314        if self.verbose == True:
315            print("Intervals: ", self.resolution)
316            print("Geno1:\n",geno1)
317            print("Geno2:\n",geno2)
318            print("EMD:\n",out)
319
320        return out
321
322
323    def getDissimilarityMatrix(self,listOfGeno):
324        """
325        Args:
326            listOfGeno ([string]): list of strings representing genotypes in one of the formats supported by Framsticks, http://www.framsticks.com/a/al_genotype.html
327
328        Returns:
329            np.array(np.array(,dtype=float)): dissimilarity matrix of EMD for given list of genotypes
330        """
331        numOfGeno = len(listOfGeno)
332        dissimMatrix = np.zeros(shape=[numOfGeno,numOfGeno])
333        listOfVoxels = [self.getVoxels(g) for g in listOfGeno]
334        for i in range(numOfGeno):
335            for j in range(numOfGeno):
336                dissimMatrix[i,j] = self.calculateDissimforVoxels(listOfVoxels[i], listOfVoxels[j])
337        return dissimMatrix
Note: See TracBrowser for help on using the repository browser.