1 '''
2 enrichment.py provides functions for gathering, reading, writing and sorting
3 GO enrichment information from Funcassociate.
4 '''
5 import re
6
7 from bpm import conf, faread, parallel
8
9 -def enrich(modulecnt, (bpmi, modi, genes)):
10 '''
11 Initiates a request to Funcassociate and returns a dictionary of goterms.
12
13 :param modulecnt: The total number of modules in the BPM file.
14 :param bpmi, modi, genes: A tuple representing a module. 'bpmi' is the
15 BPM index number, 'modi' is the module index
16 number, and 'genes' is a list of gene names
17 in the module.
18 :return: A four-tuple of the input module and its associated go terms.
19 '''
20 goterms = faread.functionate(genes, min(10000, max(1000, modulecnt)))
21
22 parallel.inc_counter()
23 parallel.print_progress()
24
25 return bpmi, modi, genes, goterms
26
28 '''
29 Sorts the keys of a goterms dictionary according to the current
30 configuration.
31 '''
32 if conf is None:
33 reverse = False
34 sort_by = 'p'
35 else:
36 reverse = conf.order_go == 'desc'
37 sort_by = conf.sort_go_by
38
39 if sort_by == 'p':
40 return sorted(goterms, key=lambda acc: goterms[acc]['p'],
41 reverse=reverse)
42 elif sort_by == 'accession':
43 return sorted(goterms, reverse=reverse)
44 elif sort_by == 'name':
45 return sorted(goterms, key=lambda acc: goterms[acc]['name'].lower(),
46 reverse=reverse)
47 elif sort_by == 'num_genes_with':
48 return sorted(goterms, key=lambda acc: goterms[acc]['num_with'],
49 reverse=reverse)
50
51 assert False, 'Invalid sort by column.'
52
54 '''
55 Parses raw BPM text (everything between the '>' and '>') from a gobpm file,
56 and turns it into a goterms dictionary keyed by GO accession.
57
58 The fields in each dictionary entry represent the GO information returned
59 by Funcassociate. Namely, a p-value, the number of genes in the module
60 with the same enrichment, the number of genes in the query, the name of the
61 GO term, and finally, the names of the genes in the query with the
62 corresponding GO enrichment.
63 '''
64 lines = map(str.strip, bpmtext.splitlines())
65
66 bpmids = lines[0]
67 m = re.search('BPM(\d+)/Module(\d+)', bpmids)
68 bpmi, modi = int(m.group(1)), int(m.group(2))
69
70 genes = set(map(str.strip, lines[1].split('\t')))
71
72 goterms = {}
73 gotermlns = lines[2:]
74 for gotermln in gotermlns:
75 columns = gotermln.split('\t')
76 accession = columns[0].strip()
77 p = columns[1].strip()
78 num_with, num_query = map(int, columns[2].strip().split('/'))
79 name = columns[3]
80
81 if len(columns) == 5:
82 egenes = set(columns[4].strip().split())
83 else:
84 egenes = set()
85
86 assert len(egenes) == 0 or len(egenes) == num_with, columns
87
88 goterms[accession] = {
89 'p': float(p),
90 'num_with': num_with,
91 'num_query': num_query,
92 'name': name,
93 'genes': egenes,
94 }
95
96 return bpmi, modi, genes, goterms
97
99 '''
100 Writes a BPM entry with GO enrichment information in gobpm file format.
101 '''
102 obpm = ['> BPM%d/Module%d' % (bpmi, modi)]
103 obpm.append('\t'.join(genes))
104
105 for accession in sortgo(goterms):
106 goterm = goterms[accession]
107
108 if conf is not None and conf.hide_enriched_genes:
109 egenes = ''
110 else:
111 egenes = '\t%s' % ' '.join(goterm['genes'])
112
113 frac = '%d/%d' % (goterm['num_with'], goterm['num_query'])
114 obpm.append('%s\t%f\t%s\t%s%s'
115 % (accession, goterm['p'], frac, goterm['name'], egenes))
116
117 return '\n'.join(obpm)
118