source: framspy/dissimilarity/density_distribution.py @ 1334

Last change on this file since 1334 was 1329, checked in by Maciej Komosinski, 4 weeks ago

Cosmetic

File size: 16.3 KB
Line 
1import numpy as np
2from pyemd import emd
3from ctypes import cdll
4from ctypes.util import find_library
5from .alignmodel import align
6
7class DensityDistribution:
8    """Two dissimilarity measures based on the spatial distribution of two Models. The Model bounding box is divided into a grid of equally-sized cuboids, the number of which is the 'resolution' parameter cubed. Then the Model surface is covered with points; the density of the surface sampling is determined by the 'density' parameter. There are two versions of the measure. In the default version ('frequency'=False), the signature of each cuboid is the centroid and the number of samples. In the 'frequency'=True version, FFT is computed from the vector containing the number of samples in each cuboid. The final result of the dissimilarity measure is the distance between the signatures and it can be computed using EMD, L1, or L2 norms (the 'metric' parameter).
9    """
10
11    libm = find_library('m')  # for disabling/enabling floating point exceptions (division by zero occurs in the pyemd library)
12    if libm is not None:  # libm.so (the mathematical library) is a part of Linux ecosystem - always present
13        libm = cdll.LoadLibrary(libm)
14    else:
15        print('\nWarning: The "m" library not found - floating point exceptions in pyemd may occur...\n') # but on Windows, pyemd does not seem to cause floating point exceptions
16    EPSILON = 0.0001
17   
18    def __init__(self, frams_module=None, density = 10, resolution = 8, reduce_empty=True, frequency=False, metric = 'emd', fixedZaxis=False, verbose=False):
19        """ __init__
20        Args:
21            density (int, optional): density of samplings for frams.ModelGeometry. Defaults to 10.
22            resolution (int, optional): How many intervals are used in each dimension to partition surface samples of Models in the 3D space.
23                The higher the value, the more detailed the comparison and the longer the calculations. Defaults to 3.
24            reduce_empty (bool, optional): If we should use reduction to remove blank samples. Defaults to True.
25            frequency (bool, optional): If we should use frequency distribution. Defaults to False.
26            metric (string, optional): The distance metric that should be used ('emd', 'l1', or 'l2'). Defaults to 'emd'.
27            fixedZaxis (bool, optional): If the z axis should be fixed during alignment. Defaults to False.
28            verbose (bool, optional): Turning on logging, works only for calculateEMDforGeno. Defaults to False.           
29        """
30        if frams_module is None:
31            raise ValueError('Framsticks module not provided!')
32        self.frams = frams_module
33
34        self.density = density
35        self.resolution = resolution
36        self.verbose = verbose
37        self.reduce_empty = reduce_empty
38        self.frequency = frequency
39        self.metric = metric
40        self.fixedZaxis = fixedZaxis
41
42
43    def calculateNeighborhood(self,array,mean_coords):
44        """ Calculates number of elements for given sample and set ups the center of this sample
45        to the center of mass (calculated by mean of every coordinate).
46        Args:
47            array ([[float,float,float],...,[float,float,float]]): array of voxels that belong to given sample.
48            mean_coords ([float,float,float]): default coordinates that are the
49                middle of the sample (used when number of voxels in sample is equal to 0)
50
51        Returns:
52            weight [int]: number of voxels in a sample
53            coordinates [float,float,float]: center of mass for a sample
54        """
55        weight = len(array)
56        if weight > 0:
57            point = np.mean(array, axis=0)  # equivalent to [np.mean(array[:,0]),np.mean(array[:,1]),np.mean(array[:,2])]
58            return weight, point
59        else:
60            return 0, mean_coords
61
62
63    def calculateDistPoints(self, point1, point2):
64        """ Returns Euclidean distance between two samples (1D for frequency) or points (3D for distribution).
65        Args (frequency):
66            point1 (float) - value of the first sample
67            point2 (float) - value of the second sample
68        Args (distribution):
69            point1 ([float,float,float]) - coordinates of the first point
70            point2 ([float,float,float]) - coordinates of the second point
71
72        Returns:
73            [float]: Euclidean distance
74        """
75        if self.frequency:
76            return abs(point1-point2)
77        else:
78            return np.linalg.norm(point1-point2, ord=2)
79
80
81    def calculateDistanceMatrix(self, array1, array2):
82        """
83        Args:
84            array1 ([type]): array of size n with points representing the first Model
85            array2 ([type]): array of size n with points representing the second Model
86
87        Returns:
88            np.array(np.array(,dtype=float)): distance matrix n*n
89        """
90        n = len(array1)
91        distMatrix = np.zeros((n,n))
92        for i in range(n):
93            for j in range(n):
94                distMatrix[i][j] = self.calculateDistPoints(array1[i], array2[j])
95        return np.array(distMatrix)
96
97
98    def reduceEmptySignatures_Frequency(self,s1,s2):
99        """Removes samples from signatures if corresponding samples for both models have weight 0.
100        Args:
101            s1 (np.array(,dtype=np.float64)): values of samples
102            s2 (np.array(,dtype=np.float64)): values of samples
103
104        Returns:
105            s1new (np.array(,dtype=np.float64)): coordinates of samples after reduction
106            s2new (np.array(,dtype=np.float64)): coordinates of samples after reduction
107        """
108        lens = len(s1)
109        indices = [i for i in range(lens) if s1[i]==0 and s2[i]==0]
110
111        return np.delete(s1, indices), np.delete(s2, indices)
112
113
114    def reduceEmptySignatures_Density(self,s1,s2):
115        """Removes samples from signatures if corresponding samples for both models have weight 0.
116        Args:
117            s1 ([np.array(,dtype=np.float64),np.array(,dtype=np.float64)]): [coordinates of samples, weights]
118            s2 ([np.array(,dtype=np.float64),np.array(,dtype=np.float64)]): [coordinates of samples, weights]
119
120        Returns:
121            s1new ([np.array(,dtype=np.float64),np.array(,dtype=np.float64)]): [coordinates of samples, weights] after reduction
122            s2new ([np.array(,dtype=np.float64),np.array(,dtype=np.float64)]): [coordinates of samples, weights] after reduction
123        """
124        lens = len(s1[0])
125        indices = [i for i in range(lens) if s1[1][i]==0 and s2[1][i]==0]
126
127        s1 = [np.delete(s1[0], indices, axis=0), np.delete(s1[1], indices, axis=0)]
128        s2 = [np.delete(s2[0], indices, axis=0), np.delete(s2[1], indices, axis=0)]
129        return s1, s2
130
131
132    def getSignatures(self,array,edges3,steps3):
133        """Generates signature for array representing the Model. Signature is composed of list of points [x,y,z] (float) and list of weights (int).
134
135        Args:
136            array (np.array(np.array(,dtype=float))): array with voxels representing the Model
137            edges3 ([np.array(,dtype=float),np.array(,dtype=float),np.array(,dtype=float)]): lists with edges for each step for each axis in order x,y,z
138            steps3 ([float,float,float]): [size of interval for x axis, size of interval for y axis, size of interval for y axis]
139
140        Returns (frequency):
141           signature np.array(,dtype=np.float64): returns signatuere np.array of coefficients
142        Returns (distribution):
143           signature [np.array(,dtype=np.float64),np.array(,dtype=np.float64)]: returns signatuere [np.array of points, np.array of weights]
144        """
145        edges_x,edges_y,edges_z = edges3
146        step_x,step_y,step_z=steps3
147        feature_array = []
148        weight_array = []
149        step_x_half = step_x/2
150        step_y_half = step_y/2
151        step_z_half = step_z/2
152        for x in range(len(edges_x[:-1])):
153            for y in range(len(edges_y[:-1])) :
154                for z in range(len(edges_z[:-1])):
155                    rows=np.where((array[:,0]> edges_x[x]) &
156                                  (array[:,0]<= edges_x[x+1]) &
157                                  (array[:,1]> edges_y[y]) &
158                                  (array[:,1]<= edges_y[y+1]) &
159                                  (array[:,2]> edges_z[z]) &
160                                  (array[:,2]<= edges_z[z+1]))
161                    if self.frequency:
162                        feature_array.append(len(array[rows]))
163                    else:
164                        weight, point = self.calculateNeighborhood(array[rows],[edges_x[x]+step_x_half,edges_y[y]+step_y_half,edges_z[z]+step_z_half])
165                        feature_array.append(point)
166                        weight_array.append(weight)
167
168        if self.frequency:
169            samples = np.array(feature_array,dtype=np.float64)
170            return samples
171        else:
172            return [np.array(feature_array,dtype=np.float64), np.array(weight_array,dtype=np.float64)]
173
174
175    def getSignaturesForPair(self,array1,array2):
176        """Generates signatures for a given pair of models represented by array of voxels.
177        We calculate space for given models by taking the extremas for each axis and dividing the space by the resolution.
178        This divided space generate us samples which contains points. Each sample will have new coordinates which are mean of all points from it and weight which equals to the number of points.
179       
180        Args:
181            array1 (np.array(np.array(,dtype=float))): array with voxels representing model1
182            array2 (np.array(np.array(,dtype=float))): array with voxels representing model2
183
184        Returns:
185            s1 ([np.array(,dtype=np.float64),np.array(,dtype=np.float64)]): [coordinates of samples, weights]
186            s2 ([np.array(,dtype=np.float64),np.array(,dtype=np.float64)]): [coordinates of samples, weights]
187        """
188
189        min_x = np.min([np.min(array1[:,0]),np.min(array2[:,0])])
190        max_x = np.max([np.max(array1[:,0]),np.max(array2[:,0])])
191        min_y = np.min([np.min(array1[:,1]),np.min(array2[:,1])])
192        max_y = np.max([np.max(array1[:,1]),np.max(array2[:,1])])
193        min_z = np.min([np.min(array1[:,2]),np.min(array2[:,2])])
194        max_z = np.max([np.max(array1[:,2]),np.max(array2[:,2])])
195
196        # We request self.resolution+1 samples since we need self.resolution intervals
197        edges_x,step_x = np.linspace(min_x,max_x,self.resolution+1,retstep=True)
198        edges_y,step_y = np.linspace(min_y,max_y,self.resolution+1,retstep=True)
199        edges_z,step_z = np.linspace(min_z,max_z,self.resolution+1,retstep=True)
200       
201        for edges in (edges_x, edges_y, edges_z):  # EPSILON subtracted to deal with boundary voxels (one-sided open intervals and comparisons in loops in function getSignatures())
202            edges[0] -= self.EPSILON
203
204        edges3 = (edges_x,edges_y,edges_z)
205        steps3 = (step_x,step_y,step_z)
206       
207        s1 = self.getSignatures(array1,edges3,steps3)
208        s2 = self.getSignatures(array2,edges3,steps3)   
209       
210        return s1,s2
211
212
213    def getVoxels(self,geno):
214        """Generates voxels for genotype using frams.ModelGeometry.
215
216        Args:
217            geno (string): representation of Model in one of the formats supported by Framsticks, http://www.framsticks.com/a/al_genotype.html
218
219        Returns:
220            np.array([np.array(,dtype=float)]: list of voxels representing the Model.
221        """
222        model = self.frams.Model.newFromString(geno)
223        align(model, self.fixedZaxis)
224        model_geometry = self.frams.ModelGeometry.forModel(model)
225
226        model_geometry.geom_density = self.density
227        voxels = np.array([np.array([p.x._value(),p.y._value(),p.z._value()]) for p in model_geometry.voxels()])
228        return voxels
229
230
231    def normalize(self, signature):
232        """Normalizes the signature values by dividing each element by the sum of all elements.
233        Args:
234            signature np.array(,dtype=float): A one-dimensional array of signature values.
235
236        Returns:
237            np.array(,dtype=float): A one-dimensional array of normalized signature values.
238        """
239        total = np.sum(signature)
240        return np.divide(signature, total)
241
242
243    def calculateDissimforVoxels(self, voxels1, voxels2):
244        """Calculates EMD for a pair of voxels representing models.
245        Args:
246            voxels1 np.array([np.array(,dtype=float)]: list of voxels representing model1.
247            voxels2 np.array([np.array(,dtype=float)]: list of voxels representing model2.
248
249        Returns:
250            float: dissim for a pair of list of voxels
251        """
252        numvox1 = len(voxels1)
253        numvox2 = len(voxels2)   
254
255        s1, s2 = self.getSignaturesForPair(voxels1, voxels2)
256
257        reduce_fun = self.reduceEmptySignatures_Frequency if self.frequency else self.reduceEmptySignatures_Density
258        if self.reduce_empty:
259            s1, s2 = reduce_fun(s1,s2)
260
261            if not self.frequency:
262                if numvox1 != sum(s1[1]) or numvox2 != sum(s2[1]):
263                    print("Voxel reduction didn't work properly")
264                    print("Base voxels fig1: ", numvox1, " fig2: ", numvox2)
265                    print("After reduction voxels fig1: ", sum(s1[1]), " fig2: ", sum(s2[1]))
266                    raise RuntimeError("Voxel reduction error!")
267       
268        if self.frequency:
269           s1 = abs(np.fft.fft(s1))
270           s2 = abs(np.fft.fft(s2))
271       
272        if self.metric == 'l1':
273            if self.frequency:
274                out = np.linalg.norm(s1-s2, ord=1)
275            else:
276                out = np.linalg.norm(s1[1]-s2[1], ord=1)
277
278        elif self.metric == 'l2':
279            if self.frequency:
280                out = np.linalg.norm(s1-s2)
281            else:
282                out = np.linalg.norm(s1[1]-s2[1])
283
284        elif self.metric == 'emd':
285            if self.frequency:
286                num_points = np.linspace(0, 1, len(s1), True)
287                dist_matrix = self.calculateDistanceMatrix(num_points,num_points)
288            else:
289                dist_matrix = self.calculateDistanceMatrix(s1[0],s2[0])
290
291            if self.libm is not None:
292                self.libm.fedisableexcept(0x04)  # change default flag value - don't cause "Floating point exception" when dividing by 0 (pyemd does that, for example when comparing two identical histograms, i.e., two identical signatures, for example from two identical phenotypes)
293
294            if self.frequency:
295                out = emd(self.normalize(s1),self.normalize(s2),np.array(dist_matrix,dtype=np.float64))
296            else:
297                out = emd(self.normalize(s1[1]),self.normalize(s2[1]),dist_matrix)
298
299            if self.libm is not None:
300                self.libm.feclearexcept(0x04)  # restoring default flag values...
301                self.libm.feenableexcept(0x04)
302
303        else:
304            raise ValueError("Wrong metric '%s'"%self.metric)
305
306        return out
307
308
309    def calculateDissimforGeno(self, geno1, geno2):
310        """Calculates EMD for a pair of genotypes.
311        Args:
312            geno1 (string): representation of model1 in one of the formats supported by Framsticks, http://www.framsticks.com/a/al_genotype.html
313            geno2 (string): representation of model2 in one of the formats supported by Framsticks, http://www.framsticks.com/a/al_genotype.html
314
315        Returns:
316            float: dissim for a pair of strings representing models.
317        """     
318
319        voxels1 = self.getVoxels(geno1)
320        voxels2 = self.getVoxels(geno2)
321
322        out = self.calculateDissimforVoxels(voxels1, voxels2)
323
324        if self.verbose == True:
325            print("Intervals: ", self.resolution)
326            print("Geno1:\n",geno1)
327            print("Geno2:\n",geno2)
328            print("EMD:\n",out)
329
330        return out
331
332
333    def getDissimilarityMatrix(self,listOfGeno):
334        """
335        Args:
336            listOfGeno ([string]): list of strings representing genotypes in one of the formats supported by Framsticks, http://www.framsticks.com/a/al_genotype.html
337
338        Returns:
339            np.array(np.array(,dtype=float)): dissimilarity matrix of EMD for given list of genotypes
340        """
341        numOfGeno = len(listOfGeno)
342        dissimMatrix = np.zeros(shape=[numOfGeno,numOfGeno])
343        listOfVoxels = [self.getVoxels(g) for g in listOfGeno]
344        for i in range(numOfGeno):
345            for j in range(numOfGeno): # could only calculate a triangle if the definition of similarity and its calculation guarantees symmetry
346                dissimMatrix[i,j] = self.calculateDissimforVoxels(listOfVoxels[i], listOfVoxels[j])
347        return dissimMatrix
Note: See TracBrowser for help on using the repository browser.