Package bpm :: Module enrichment
[hide private]
[frames] | no frames]

Source Code for Module bpm.enrichment

  1  ''' 
  2  enrichment.py provides functions for gathering, reading, writing and sorting 
  3  GO enrichment information from Funcassociate. 
  4  ''' 
  5  import re 
  6   
  7  from bpm import conf, faread, parallel 
  8   
9 -def enrich(modulecnt, (bpmi, modi, genes)):
10 ''' 11 Initiates a request to Funcassociate and returns a dictionary of goterms. 12 13 :param modulecnt: The total number of modules in the BPM file. 14 :param bpmi, modi, genes: A tuple representing a module. 'bpmi' is the 15 BPM index number, 'modi' is the module index 16 number, and 'genes' is a list of gene names 17 in the module. 18 :return: A four-tuple of the input module and its associated go terms. 19 ''' 20 goterms = faread.functionate(genes, min(10000, max(1000, modulecnt))) 21 22 parallel.inc_counter() 23 parallel.print_progress() 24 25 return bpmi, modi, genes, goterms
26
27 -def sortgo(goterms):
28 ''' 29 Sorts the keys of a goterms dictionary according to the current 30 configuration. 31 ''' 32 if conf is None: 33 reverse = False 34 sort_by = 'p' 35 else: 36 reverse = conf.order_go == 'desc' 37 sort_by = conf.sort_go_by 38 39 if sort_by == 'p': 40 return sorted(goterms, key=lambda acc: goterms[acc]['p'], 41 reverse=reverse) 42 elif sort_by == 'accession': 43 return sorted(goterms, reverse=reverse) 44 elif sort_by == 'name': 45 return sorted(goterms, key=lambda acc: goterms[acc]['name'].lower(), 46 reverse=reverse) 47 elif sort_by == 'num_genes_with': 48 return sorted(goterms, key=lambda acc: goterms[acc]['num_with'], 49 reverse=reverse) 50 51 assert False, 'Invalid sort by column.'
52
53 -def read_bpm(bpmtext):
54 ''' 55 Parses raw BPM text (everything between the '>' and '>') from a gobpm file, 56 and turns it into a goterms dictionary keyed by GO accession. 57 58 The fields in each dictionary entry represent the GO information returned 59 by Funcassociate. Namely, a p-value, the number of genes in the module 60 with the same enrichment, the number of genes in the query, the name of the 61 GO term, and finally, the names of the genes in the query with the 62 corresponding GO enrichment. 63 ''' 64 lines = map(str.strip, bpmtext.splitlines()) 65 66 bpmids = lines[0] 67 m = re.search('BPM(\d+)/Module(\d+)', bpmids) 68 bpmi, modi = int(m.group(1)), int(m.group(2)) 69 70 genes = set(map(str.strip, lines[1].split('\t'))) 71 72 goterms = {} 73 gotermlns = lines[2:] 74 for gotermln in gotermlns: 75 columns = gotermln.split('\t') 76 accession = columns[0].strip() 77 p = columns[1].strip() 78 num_with, num_query = map(int, columns[2].strip().split('/')) 79 name = columns[3] 80 81 if len(columns) == 5: 82 egenes = set(columns[4].strip().split()) 83 else: 84 egenes = set() 85 86 assert len(egenes) == 0 or len(egenes) == num_with, columns 87 88 goterms[accession] = { 89 'p': float(p), 90 'num_with': num_with, 91 'num_query': num_query, 92 'name': name, 93 'genes': egenes, 94 } 95 96 return bpmi, modi, genes, goterms
97
98 -def write_bpm(bpmi, modi, genes, goterms):
99 ''' 100 Writes a BPM entry with GO enrichment information in gobpm file format. 101 ''' 102 obpm = ['> BPM%d/Module%d' % (bpmi, modi)] 103 obpm.append('\t'.join(genes)) 104 105 for accession in sortgo(goterms): 106 goterm = goterms[accession] 107 108 if conf is not None and conf.hide_enriched_genes: 109 egenes = '' 110 else: 111 egenes = '\t%s' % ' '.join(goterm['genes']) 112 113 frac = '%d/%d' % (goterm['num_with'], goterm['num_query']) 114 obpm.append('%s\t%f\t%s\t%s%s' 115 % (accession, goterm['p'], frac, goterm['name'], egenes)) 116 117 return '\n'.join(obpm)
118