bpm.partition

1 ''' 2 'partition.py' is a module that parallelizes the process of partitioning random 3 sets of genes. This is where most of the complexity of Genecentric is, 4 particularly in the 'localmaxcut' function. 5 ''' 6 import random 7 8 from bpm import conf, geneinter, parallel 9 10 # See notes in 'bpms' for why this is global. 11 happyparts = None 12

13 -def bpms():

14 ''' 15 Generates a list of happy bipartitions in parallel and then 16 generates a list of BPMs in parallel. 17 ''' 18 19 # This global variable is bad in principle but seems to be required 20 # for parallelism to be effective with 'group_genes'. I was currying 21 # 'happyparts' with group_genes, but for whatever reason, this stopped 22 # multiprocessing from keeping all of the cores hot. 23 global happyparts 24 25 happyparts = parallel.pmap(localmaxcut, xrange(0, conf.M)) 26 return parallel.pmap(group_genes, enumerate(geneinter.genes))

27

28 -def group_genes((i, g1)):

29 ''' 30 group_genes is applied to every gene, and a BPM is generated from *every* 31 gene. In particular, given M happy bipartitions, generate a BPM where 32 the first module contains all genes that appeared in the same set in the M 33 bipartitions C% of the time and the second module contains all genes 34 that appeared in the opposite set in the M bipartitions C% of the time. 35 ''' 36 mod1, mod2 = [], [] 37 38 for g2 in geneinter.genes: 39 # Count the number of times g2 is in the same set as g2 40 freqsame = sum([1 for A, B in happyparts 41 if (g1 in A and g2 in A) or (g1 in B and g2 in B)]) 42 43 ratio = float(freqsame) / conf.M 44 if ratio >= conf.C: 45 mod1.append(g2) 46 elif (1 - ratio) >= conf.C: 47 mod2.append(g2) 48 49 parallel.inc_counter() 50 parallel.print_progress() 51 52 return set(mod1), set(mod2)

53

54 -def localmaxcut(m):

55 ''' 56 Generates a random bipartition and makes the bipartition 'happy' by 57 applying 'Weighted-Flip' (from Leiserson et al., 2011) until there are no 58 unhappy genes left. 59 ''' 60 A, B = random_bipartition() 61 62 same_set = lambda g1, g2: (g1 in A and g2 in A) or (g1 in B and g2 in B) 63 def weights(g1): 64 ''' 65 Calculates the total neighboring weight of 'g1'. The total 66 neighboring weight is a tuple of the sum of interactions in the same 67 set as g1 and the sum of interactions in the opposite set as g1. 68 69 The tuple in this case is represented by a dictionary with keys 70 'same' and 'other'. I'm using a dictionary because the values need 71 to be mutable; they change as we move vertices between the partitions. 72 ''' 73 ws = { 'same': 0, 'other': 0 } 74 for g2 in geneinter.genes: 75 w = geneinter.gi(g1, g2) 76 if same_set(g1, g2): 77 ws['same'] += w 78 else: 79 ws['other'] += w 80 return ws

81 82 nweights = { g: weights(g) for g in geneinter.genes } 83 unhappy = get_unhappy(nweights) 84 85 while unhappy: 86 v = random.choice(unhappy) 87 88 if v in A: 89 A.remove(v) 90 B.add(v) 91 else: 92 A.add(v) 93 B.remove(v) 94 95 # This loop eliminates the need to recalculate 'weights' for every 96 # gene again, which is O(n^2) in the number of genes. This loop is 97 # O(n) but comes at the cost of clarity. 98 # 99 # The idea is to modify the weights of every other interacting gene and 100 # to switch the 'same' and 'other' scores of the gene that was made 101 # happy. 102 for g, nw in nweights.iteritems(): 103 if g == v: 104 nw['same'], nw['other'] = nw['other'], nw['same'] 105 continue 106 107 # The interaction score between this gene and the gene that 108 # was made happy. 109 w = geneinter.gi(v, g) 110 111 # If the two genes are now in the same set, then 'g' gets a boost 112 # to its happiness. Otherwise, 'g' becomes more unhappy. 113 if same_set(v, g): 114 nw['same'] += w 115 nw['other'] -= w 116 else: 117 nw['same'] -= w 118 nw['other'] += w 119 120 # Refresh the unhappy list 121 unhappy = get_unhappy(nweights) 122 123 parallel.inc_counter() 124 parallel.print_progress() 125 126 return A, B 127

128 -def get_unhappy(nweights):

129 ''' 130 Returns all of the genes that are unhappy given a dictionary of 131 gene ids mapped to its total neighboring weights. 132 ''' 133 return [ g for (g, _) in filter(lambda (g, w): w['same'] < w['other'], 134 nweights.items()) ]

135

136 -def random_bipartition():

137 ''' 138 Creates two random sets of genes from the genetic interaction data. 139 ''' 140 A, B = set(), set() 141 for g in geneinter.genes: 142 if random.random() < 0.5: 143 A.add(g) 144 else: 145 B.add(g) 146 147 return A, B

148

Source Code for Module bpm.partition