source: framspy/dissimilarity/density-distribution.py @ 1277

Last change on this file since 1277 was 1269, checked in by oriona, 17 months ago

FFT is now calculated after the signature reduction, to increase efficiency.

File size: 15.7 KB
Line 
1import numpy as np
2from pyemd import emd
3from ctypes import cdll
4from ctypes.util import find_library
5from alignmodel import align
6
7class DensityDistribution:
8    """Two dissimilarity measures based on the spatial distribution of two Models. The Model bounding box is divided into a grid of equally-sized cuboids, the number of which is the 'resolution' parameter cubed. Then the Model surface is covered with points; the density of the surface sampling is determined by the 'density' parameter. There are two versions of the measure. In the default version ('frequency'=False), a signature of each cuboid is the centroid and the number of samples. In the 'frequency'=True version, FFT is computed from the vector containing the number of samples in each cuboid. The final result of the dissimilarity measure is the distance between the signatures and it can be computed using EMD, L1, or L2 norms (the 'metric' parameter).
9    """
10   
11    libm = cdll.LoadLibrary(find_library('m')) # for disabling/enabling floating point exceptions (division by zero occurs in the EMD library)
12    EPSILON = 0.0001
13   
14    def __init__(self, frams_module=None, density = 10, resolution = 8, reduce_empty=True, frequency=False, metric = 'emd', fixedZaxis=False, verbose=False):
15        """ __init__
16        Args:
17            density (int, optional): density of samplings for frams.ModelGeometry. Defaults to 10.
18            resolution (int, optional): How many intervals are used in each dimension to partition surface samples of Models in the 3D space.
19                The higher the value, the more detailed the comparison and the longer the calculations. Defaults to 3.
20            reduce_empty (bool, optional): If we should use reduction to remove blank samples. Defaults to True.
21            frequency (bool, optional): If we should use frequency distribution. Defaults to False.
22            metric (string, optional): The distance metric that should be used ('emd', 'l1', or 'l2'). Defaults to 'emd'.
23            fixedZaxis (bool, optional): If the z axis should be fixed during alignment. Defaults to False.
24            verbose (bool, optional): Turning on logging, works only for calculateEMDforGeno. Defaults to False.           
25        """
26        if frams_module is None:
27            raise ValueError('Framsticks module not provided!')
28        self.frams = frams_module
29
30        self.density = density
31        self.resolution = resolution
32        self.verbose = verbose
33        self.reduce_empty = reduce_empty
34        self.frequency = frequency
35        self.metric = metric
36        self.fixedZaxis = fixedZaxis
37
38
39    def calculateNeighberhood(self,array,mean_coords):
40        """ Calculates number of elements for given sample and set ups the center of this sample
41        to the center of mass (calculated by mean of every coordinate)
42        Args:
43            array ([[float,float,float],...,[float,float,float]]): array of voxels that belong to given sample.
44            mean_coords ([float,float,float]): default coordinates that are the
45                middle of the sample (used when number of voxels in sample is equal to 0)
46
47        Returns:
48            weight [int]: number of voxels in a sample
49            coordinates [float,float,float]: center of mass for a sample
50        """
51        weight = len(array)
52        if weight > 0:
53            point = [np.mean(array[:,0]),np.mean(array[:,1]),np.mean(array[:,2])]
54            return weight, point
55        else:
56            return 0, mean_coords
57
58
59    def calculateDistPoints(self,point1, point2):
60        """ Returns euclidean distance between two points
61        Args (distribution):
62            point1 ([float,float,float]) - coordinates of first point
63            point2 ([float,float,float]) - coordinates of second point
64        Args (frequency):
65            point1 (float) - value of the first sample
66            point2 (float) - value of the second sample
67
68        Returns:
69            [float]: euclidean distance
70        """
71        if self.frequency:
72            return abs(point1-point2)
73        else:
74            return np.sqrt(np.sum(np.square(point1-point2)))
75
76
77    def calculateDistanceMatrix(self,array1, array2):
78        """
79        Args:
80            array1 ([type]): array of size n with points representing the first Model
81            array2 ([type]): array of size n with points representing the second Model
82
83        Returns:
84            np.array(np.array(,dtype=float)): distance matrix n*n
85        """
86        n = len(array1)
87        distMatrix = np.zeros((n,n))
88        for i in range(n):
89            for j in range(n):
90                distMatrix[i][j] = self.calculateDistPoints(array1[i], array2[j])
91        return np.array(distMatrix)
92
93
94    def reduceEmptySignatures_Frequency(self,s1,s2):
95        """Removes samples from signatures if corresponding samples for both models have weight 0.
96        Args:
97            s1 (np.array(,dtype=np.float64)): values of samples
98            s2 (np.array(,dtype=np.float64)): values of samples
99
100        Returns:
101            s1new (np.array(,dtype=np.float64)): coordinates of samples after reduction
102            s2new (np.array(,dtype=np.float64)): coordinates of samples after reduction
103        """
104        lens = len(s1)
105        indices = []
106        for i in range(lens):
107            if s1[i]==0 and s2[i]==0:
108                    indices.append(i)
109
110        return np.delete(s1, indices), np.delete(s2, indices)
111
112
113    def reduceEmptySignatures_Density(self,s1,s2):
114        """Removes samples from signatures if corresponding samples for both models have weight 0.
115        Args:
116            s1 ([np.array(,dtype=np.float64),np.array(,dtype=np.float64)]): [coordinates of samples, weights]
117            s2 ([np.array(,dtype=np.float64),np.array(,dtype=np.float64)]): [coordinates of samples, weights]
118
119        Returns:
120            s1new ([np.array(,dtype=np.float64),np.array(,dtype=np.float64)]): [coordinates of samples, weights] after reduction
121            s2new ([np.array(,dtype=np.float64),np.array(,dtype=np.float64)]): [coordinates of samples, weights] after reduction
122        """
123        lens = len(s1[0])
124        indices = []
125        for i in range(lens):
126            if s1[1][i]==0 and s2[1][i]==0:
127                indices.append(i)
128
129        s1 = [np.delete(s1[0], indices, axis=0), np.delete(s1[1], indices, axis=0)]
130        s2 = [np.delete(s2[0], indices, axis=0), np.delete(s2[1], indices, axis=0)]
131        return s1, s2
132
133
134    def getSignatures(self,array,edges3,steps3):
135        """Generates signature for array representing the Model. Signature is composed of list of points [x,y,z] (float) and list of weights (int).
136
137        Args:
138            array (np.array(np.array(,dtype=float))): array with voxels representing the Model
139            edges3 ([np.array(,dtype=float),np.array(,dtype=float),np.array(,dtype=float)]): lists with edges for each step for each axis in order x,y,z
140            steps3 ([float,float,float]): [size of interval for x axis, size of interval for y axis, size of interval for y axis]
141
142        Returns (distribution):
143           signature [np.array(,dtype=np.float64),np.array(,dtype=np.float64)]: returns signatuere [np.array of points, np.array of weights]
144        Returns (frequency):
145           signature np.array(,dtype=np.float64): returns signatuere np.array of coefficients
146        """
147        edges_x,edges_y,edges_z = edges3
148        step_x,step_y,step_z=steps3
149        feature_array = []
150        weight_array = []
151        step_x_half = step_x/2
152        step_y_half = step_y/2
153        step_z_half = step_z/2
154        for x in range(len(edges_x[:-1])):
155            for y in range(len(edges_y[:-1])) :
156                for z in range(len(edges_z[:-1])):
157                    rows=np.where((array[:,0]> edges_x[x]) &
158                                  (array[:,0]<= edges_x[x+1]) &
159                                  (array[:,1]> edges_y[y]) &
160                                  (array[:,1]<= edges_y[y+1]) &
161                                  (array[:,2]> edges_z[z]) &
162                                  (array[:,2]<= edges_z[z+1]))
163                    if self.frequency:
164                        feature_array.append(len(array[rows]))
165                    else:
166                        weight, point = self.calculateNeighberhood(array[rows],[edges_x[x]+step_x_half,edges_y[y]+step_y_half,edges_z[z]+step_z_half])
167                        feature_array.append(point)
168                        weight_array.append(weight)
169
170        if self.frequency:
171            samples = np.array(feature_array,dtype=np.float64)
172            return samples
173        else:
174            return [np.array(feature_array,dtype=np.float64), np.array(weight_array,dtype=np.float64)]
175
176
177    def getSignaturesForPair(self,array1,array2):
178        """Generates signatures for given pair of models represented by array of voxels.
179        We calculate space for given models by taking the extremas for each axis and dividing the space by the resolution.
180        This divided space generate us samples which contains points. Each sample will have new coordinates which are mean of all points from it and weight which equals to the number of points.
181       
182        Args:
183            array1 (np.array(np.array(,dtype=float))): array with voxels representing model1
184            array2 (np.array(np.array(,dtype=float))): array with voxels representing model2
185
186        Returns:
187            s1 ([np.array(,dtype=np.float64),np.array(,dtype=np.float64)]): [coordinates of samples, weights]
188            s2 ([np.array(,dtype=np.float64),np.array(,dtype=np.float64)]): [coordinates of samples, weights]
189        """
190
191        min_x = np.min([np.min(array1[:,0]),np.min(array2[:,0])])
192        max_x = np.max([np.max(array1[:,0]),np.max(array2[:,0])])
193        min_y = np.min([np.min(array1[:,1]),np.min(array2[:,1])])
194        max_y = np.max([np.max(array1[:,1]),np.max(array2[:,1])])
195        min_z = np.min([np.min(array1[:,2]),np.min(array2[:,2])])
196        max_z = np.max([np.max(array1[:,2]),np.max(array2[:,2])])
197
198        # We request self.resolution+1 samples since we need self.resolution intervals
199        edges_x,step_x = np.linspace(min_x,max_x,self.resolution+1,retstep=True)
200        edges_y,step_y = np.linspace(min_y,max_y,self.resolution+1,retstep=True)
201        edges_z,step_z = np.linspace(min_z,max_z,self.resolution+1,retstep=True)
202       
203        for edges in (edges_x, edges_y, edges_z):  # EPSILON subtracted to deal with boundary voxels (one-sided open intervals and comparisons in loops in function getSignatures())
204            edges[0] -= self.EPSILON
205
206        edges3 = (edges_x,edges_y,edges_z)
207        steps3 = (step_x,step_y,step_z)
208       
209        s1 = self.getSignatures(array1,edges3,steps3)
210        s2 = self.getSignatures(array2,edges3,steps3)   
211       
212        return s1,s2
213
214
215    def getVoxels(self,geno):
216        """Generates voxels for genotype using frams.ModelGeometry
217
218        Args:
219            geno (string): representation of Model in one of the formats supported by Framsticks, http://www.framsticks.com/a/al_genotype.html
220
221        Returns:
222            np.array([np.array(,dtype=float)]: list of voxels representing the Model.
223        """
224        model = self.frams.Model.newFromString(geno)
225        align(model, self.fixedZaxis)
226        model_geometry = self.frams.ModelGeometry.forModel(model)
227
228        model_geometry.geom_density = self.density
229        voxels = np.array([np.array([p.x._value(),p.y._value(),p.z._value()]) for p in model_geometry.voxels()])
230        return voxels
231
232
233    def normalize(self, signature):
234        """Normalizes the signature values by dividing each element by the sum of all elements.
235        Args:
236            signature np.array(,dtype=float): A one-dimensional array of signature values.
237
238        Returns:
239            np.array(,dtype=float): A one-dimensional array of normalized signature values.
240        """
241        total = np.sum(signature)
242        return np.divide(signature, total)
243
244
245    def calculateDissimforVoxels(self, voxels1, voxels2):
246        """Calculates EMD for pair of voxels representing models.
247        Args:
248            voxels1 np.array([np.array(,dtype=float)]: list of voxels representing model1.
249            voxels2 np.array([np.array(,dtype=float)]: list of voxels representing model2.
250
251        Returns:
252            float: dissim for pair of list of voxels
253        """
254        numvox1 = len(voxels1)
255        numvox2 = len(voxels2)   
256
257        s1, s2 = self.getSignaturesForPair(voxels1, voxels2)
258
259        reduce_fun = self.reduceEmptySignatures_Frequency if self.frequency else self.reduceEmptySignatures_Density
260        if self.reduce_empty:
261            s1, s2 = reduce_fun(s1,s2)
262
263            if not self.frequency:
264                if numvox1 != sum(s1[1]) or numvox2 != sum(s2[1]):
265                    print("Voxel reduction didn't work properly")
266                    print("Base voxels fig1: ", numvox1, " fig2: ", numvox2)
267                    print("After reduction voxels fig1: ", sum(s1[1]), " fig2: ", sum(s2[1]))
268                    raise RuntimeError("Voxel reduction error!")
269       
270        if self.frequency:
271           s1 = abs(np.fft.fft(s1))
272           s2 = abs(np.fft.fft(s2))
273       
274        if self.metric == 'l1':
275            if self.frequency:
276                out = np.linalg.norm((s1-s2), ord=1)
277            else:
278                out = np.linalg.norm((s1[1]-s2[1]), ord=1)
279
280        elif self.metric == 'l2':
281            if self.frequency:
282                out = np.linalg.norm((s1-s2))
283            else:
284                out = np.linalg.norm((s1[1]-s2[1]))
285
286        elif self.metric == 'emd':
287            if self.frequency:
288                num_points = np.linspace(0, 1, len(s1), True)
289                dist_matrix = self.calculateDistanceMatrix(num_points,num_points)
290            else:
291                dist_matrix = self.calculateDistanceMatrix(s1[0],s2[0])
292
293            self.libm.fedisableexcept(0x04)  # change default flag value - don't cause exceptions when dividing by 0 (pyemd does it)
294
295            if self.frequency:
296                out = emd(self.normalize(s1),self.normalize(s2),np.array(dist_matrix,dtype=np.float64))
297            else:
298                out = emd(self.normalize(s1[1]),self.normalize(s2[1]),dist_matrix)
299
300            self.libm.feclearexcept(0x04) # restoring default flag values...
301            self.libm.feenableexcept(0x04)
302
303        else:
304            raise ValueError("Wrong metric '%s'"%self.metric)
305
306        return out
307
308
309    def calculateDissimforGeno(self, geno1, geno2):
310        """Calculates EMD for a pair of genotypes.
311        Args:
312            geno1 (string): representation of model1 in one of the formats supported by Framsticks, http://www.framsticks.com/a/al_genotype.html
313            geno2 (string): representation of model2 in one of the formats supported by Framsticks, http://www.framsticks.com/a/al_genotype.html
314
315        Returns:
316            float: dissim for pair of strings representing models.
317        """     
318
319        voxels1 = self.getVoxels(geno1)
320        voxels2 = self.getVoxels(geno2)
321
322        out = self.calculateDissimforVoxels(voxels1, voxels2)
323
324        if self.verbose == True:
325            print("Intervals: ", self.resolution)
326            print("Geno1:\n",geno1)
327            print("Geno2:\n",geno2)
328            print("EMD:\n",out)
329
330        return out
331
332
333    def getDissimilarityMatrix(self,listOfGeno):
334        """
335        Args:
336            listOfGeno ([string]): list of strings representing genotypes in one of the formats supported by Framsticks, http://www.framsticks.com/a/al_genotype.html
337
338        Returns:
339            np.array(np.array(,dtype=float)): dissimilarity matrix of EMD for given list of genotypes
340        """
341        numOfGeno = len(listOfGeno)
342        dissimMatrix = np.zeros(shape=[numOfGeno,numOfGeno])
343        listOfVoxels = [self.getVoxels(g) for g in listOfGeno]
344        for i in range(numOfGeno):
345            for j in range(numOfGeno):
346                dissimMatrix[i,j] = self.calculateDissimforVoxels(listOfVoxels[i], listOfVoxels[j])
347        return dissimMatrix
Note: See TracBrowser for help on using the repository browser.