
Source Code for Module pypln.stores.mongocorpus

# -*- coding: utf-8 -*-
"""Implement a MongoDB-backed NLTK corpus reader.

Adapted from "Python Text Processing with NLTK 2.0 Cookbook".
To use, instantiate ``MongoDBCorpusReader`` with parameters for host, port,
database, collection and field.
"""

import pymongo
from nltk.data import LazyLoader
from nltk.tokenize import TreebankWordTokenizer
from nltk.util import AbstractLazySequence, LazyMap, LazyConcatenation


class MongoDBLazySequence(AbstractLazySequence):
    """Read-only lazy sequence over one field of a MongoDB collection."""

    def __init__(self, host='127.0.0.1', port=27017, db='test',
                 collection='documents', field='text'):
        # pymongo.Connection targets pymongo 2.x (removed in pymongo 3.0).
        self.conn = pymongo.Connection(host, port)
        self.collection = self.conn[db][collection]
        self.field = field

    def __len__(self):
        return self.collection.count()

    def iterate_from(self, start):
        # Fetch only the field of interest, skipping the first `start`
        # documents so iteration can resume from an arbitrary offset.
        f = lambda d: d.get(self.field, '')
        return iter(LazyMap(f, self.collection.find(fields=[self.field],
                                                    skip=start)))


class MongoDBCorpusReader(object):
    """Corpus reader for text stored in a MongoDB collection."""
    #TODO: introduce language specification to select appropriate tokenizers

    def __init__(self, word_tokenizer=None, sent_tokenizer=None, **kwargs):
        if word_tokenizer is None:
            word_tokenizer = TreebankWordTokenizer()
        if sent_tokenizer is None:
            sent_tokenizer = LazyLoader('tokenizers/punkt/english.pickle')
        self._seq = MongoDBLazySequence(**kwargs)
        self._word_tokenize = word_tokenizer.tokenize
        self._sent_tokenize = sent_tokenizer.tokenize

    def text(self):
        """Return a lazy iterator over the texts in the corpus."""
        return self._seq

    #TODO: create decorators to cache token and sentence lists on the database
    def words(self):
        """Return a lazy, flattened sequence of word tokens."""
        return LazyConcatenation(LazyMap(self._word_tokenize, self.text()))

    #TODO: change to 'sentences'?
    def sents(self):
        """Return a lazy, flattened sequence of sentences."""
        return LazyConcatenation(LazyMap(self._sent_tokenize, self.text()))
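
For reference, a minimal usage sketch (an illustration, not part of the
module): it assumes a MongoDB server on localhost whose ``test.documents``
collection stores plain text under a ``text`` field, which matches the
reader's defaults.

    reader = MongoDBCorpusReader(host='127.0.0.1', port=27017, db='test',
                                 collection='documents', field='text')

    print(len(reader.text()))        # number of documents in the collection
    for sentence in reader.sents():  # sentences, tokenized lazily on demand
        print(sentence)
    words = reader.words()           # flattened lazy stream of word tokens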
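Note that the listing targets pymongo 2.x: ``pymongo.Connection``, the
``fields=`` keyword to ``find()``, and ``Collection.count()`` have all since
been removed. A rough sketch of the equivalent calls under current pymongo
(3.x/4.x), shown only to flag the API drift:

    import pymongo

    conn = pymongo.MongoClient('127.0.0.1', 27017)  # replaces Connection
    collection = conn['test']['documents']
    n_docs = collection.count_documents({})         # replaces count()
    cursor = collection.find({}, projection=['text'],  # replaces fields=[...]
                             skip=0)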