1 '''
2 'geneint.py' is the module that processes the genetic interaction input file.
3 '''
4 from collections import defaultdict
5 import csv
6
7 from bpm import conf, gzipOpen, parallel
8
9 gis = defaultdict(float)
10 genes = set()
11 genespace = set()
12 numgenes = 0
13
14 -def load_genes(geneinter_file='', ignore_file='', squaring=True):
15 '''
16 Loads all of the gene pairs and their corresponding interaction scores
17 into memory. It also keeps a set of all genes for iterative purposes.
18
19 There is some criteria for excluding genes from this process:
20 1) If an ignore gene list file is provided, any gene in that file
21 is excluded from the set of genes used.
22 2) If an interaction score is zero, it is *KEPT* in the set of
23 genes used to generate BPMs with an interaction score of 0.
24
25 This gene information is then available at the 'geneint' module level,
26 since they are both used pervasively throughout BPM generation.
27
28 Finally, if we add the gene pair (g1, g2) with score S to the dictionary,
29 then we'll also add (g2, g1) with score S to the dictionary. This increases
30 memory usage but saves cpu cycles when looking up interaction scores.
31 Basically, we force the dictionary to be a reflexive matrix.
32 '''
33 ignore = set()
34 if conf.ignore:
35 if conf is not None:
36 for line in gzipOpen(conf.ignore):
37 ignore.add(line.strip())
38 else:
39 for line in gzipOpen(ignore_file):
40 ignore.add(line.strip())
41
42 if conf is not None:
43 reader = csv.reader(gzipOpen(conf.geneinter), delimiter='\t')
44 else:
45 reader = csv.reader(gzipOpen(geneinter_file), delimiter='\t')
46 for row in reader:
47 g1, g2, intscore = row[0], row[1], row[2]
48
49 genespace.add(g1)
50 genespace.add(g2)
51
52
53 if g1 in ignore or g2 in ignore:
54 continue
55
56
57 try:
58 ginter = float(intscore)
59 except ValueError:
60 ginter = 0.0
61
62 if (conf is not None and conf.squaring) or squaring:
63 if ginter < 0:
64 ginter = - (ginter ** 2)
65 else:
66 ginter = ginter ** 2
67 gis[(g1, g2)] = ginter
68 gis[(g2, g1)] = ginter
69
70 genes.add(g1)
71 genes.add(g2)
72
73 parallel.inc_counter(parallel.costs['load_genes'])
74
76 '''
77 A simple method to fetch the total number of genes. It uses a pretty shotty
78 memoization technique.
79
80 Actually, I don't think it's even necessary. I think taking the length
81 of a set is O(1) time complexity. Hmm...
82 '''
83 global numgenes
84
85 if not numgenes:
86 numgenes = len(genes)
87
88 assert numgenes > 0, 'geneint.load_genes must be called first'
89
90 return numgenes
91
93 '''
94 This indexing used to be a bit more complex, but the dict should contain
95 both (g1, g2) and (g2, g1). It uses more memory but speeds up execution.
96
97 Note that gis is a defaultdict of floats. So that if a particular gene pair
98 is absent, 0.0 will be returned.
99 '''
100 return gis[(g1, g2)]
101