1 '''
2 'prune.py' provides all the functions required to prune a set of BPMs after
3 they have been generated.
4 '''
5 from functools import partial
6 from itertools import combinations, product
7
8 from bpm import conf, geneinter, parallel
9
11 '''
12 After all BPMs are generated, two different pruning mechanisms are applied.
13
14 The first is pruning all BPMs that have a module less than the minimum size
15 or greater than the maximum size. If either is 0, then the pruning for that
16 constraint is skipped.
17
18 The second pruning mechanism is more complex. Essentially, the interaction
19 weight of each BPM is calculated (see 'interweight') and the list of BPMs
20 are then sorted by that interaction weight in descending order. Starting
21 from the beginning, BPMs are then added to final set of BPMs if and only if
22 its Jaccard index with every BPM already in the final set is less than
23 the threshold.
24 '''
25 if conf.min_size > 0 or conf.max_size > 0:
26 bpms = filter(lambda (A, B): satisfy_min_max(A, B), bpms)
27
28
29 if not conf.pruning:
30 return bpms
31
32 withI = parallel.pmap(interweight, bpms)
33 withI = sorted(withI, key=lambda (iw, (A, B)): iw, reverse=True)
34
35 pruned = []
36 for iw, (A, B) in withI:
37 jind = partial(jaccard_index, A.union(B))
38 if all(map(lambda ji: ji < conf.jaccard,
39 [jind(S1.union(S2)) for S1, S2 in pruned])):
40 pruned.append((A, B))
41
42 return pruned
43
45 '''
46 Calculates the interaction weight of a BPM. It is defined as the difference
47 of sums of interaction scores within each module and the sum of interaction
48 scores between each module divided by the number of genes in the entire BPM.
49
50 The value returned is a BPM "decorated" with the interaction weight for
51 sorting purposes. This roundabout means of decoration is used so that
52 parallelism can be used for calculating the interaction weights. (As
53 opposed to using a higher order function with 'sorted'.)
54 '''
55
56 gitup = lambda (g1, g2): geneinter.gi(g1, g2)
57
58 def sum_within(S):
59 return sum(map(gitup, combinations(S, 2)))
60
61 within = sum_within(A) + sum_within(B)
62 between = sum(map(gitup, product(A, B)))
63
64 iweight = (within - between) / float(len(A) + len(B))
65
66 return (iweight, (A, B))
67
69 '''
70 The Jaccard index of a BPM: the number of genes in the intersection of bpm1
71 and bpm2 divided by the number of genes in the union of bpm1 and bpm2.
72
73 A BPM is simply the union of its corresponding modules.
74 '''
75 return len(bpm1.intersection(bpm2)) / float(len(bpm1.union(bpm2)))
76
78 '''
79 Whether a BPM's modules BOTH satisfy the minimum size constraint.
80 '''
81 return len(A) >= conf.min_size and len(B) >= conf.min_size
82
84 '''
85 Whether a BPM's modules BOTH satisfy the maximum size constraint.
86 '''
87 return len(A) <= conf.max_size and len(B) <= conf.max_size
88
90 '''
91 Whether a BPM's modules satisfy both the min and max size constraints.
92 (Including if the constraints are disabled.)
93 '''
94 return ((conf.min_size == 0 or constraint_min(A, B))
95 and
96 (conf.max_size == 0 or constraint_max(A, B)))
97