""" Implement a MongoDB-backed NLTK Corpus reader

Adapted from "Text processing with NLTK cookbook"
To use, instantiate ``MongoDBCorpusReader`` with parameters for host, port,
database, collection and field.
"""
8
import pymongo
from nltk.data import LazyLoader
from nltk.tokenize import TreebankWordTokenizer
from nltk.util import AbstractLazySequence, LazyMap, LazyConcatenation
13
14
class MongoDBLazySequence(AbstractLazySequence):
    """Lazy, read-only sequence view over one field of a MongoDB collection.

    Implements the ``AbstractLazySequence`` protocol (``__len__`` plus
    ``iterate_from``) so documents are fetched from the server on demand
    rather than loaded into memory up front.
    """

    def __init__(self, host='127.0.0.1', port=27017, db='test',
                 collection='documents', field='text'):
        """Connect to MongoDB and remember which document field holds the text.

        :param host: MongoDB server host name or address.
        :param port: MongoDB server port.
        :param db: database name.
        :param collection: collection holding the corpus documents.
        :param field: document field whose value is exposed as corpus text.
        """
        # NOTE(review): ``pymongo.Connection`` was removed in pymongo 3.x in
        # favour of ``pymongo.MongoClient`` — confirm the pymongo version this
        # project pins before modernizing this call.
        self.conn = pymongo.Connection(host, port)
        self.collection = self.conn[db][collection]
        self.field = field

    def __len__(self):
        """Return the number of documents in the collection."""
        return self.collection.count()

    def iterate_from(self, start):
        """Iterate over the text field of each document from index ``start``.

        Documents missing the field yield an empty string instead of raising.
        """
        extract = lambda d: d.get(self.field, '')
        # Ask the server for only the needed field; ``skip`` lets the lazy
        # sequence resume mid-stream as AbstractLazySequence requires.
        return iter(LazyMap(extract, self.collection.find(fields=[self.field],
                                                          skip=start)))
29
class MongoDBCorpusReader(object):
    """ Corpus Reader to deal with text stored on a MongoDB collection """

    def __init__(self, word_tokenizer=None, sent_tokenizer=None, **kwargs):
        """Create a reader over a MongoDB-backed corpus.

        :param word_tokenizer: object with a ``tokenize`` method used to split
            a text into words; defaults to ``TreebankWordTokenizer``.
        :param sent_tokenizer: object with a ``tokenize`` method used to split
            a text into sentences; defaults to the lazily loaded English
            Punkt model.
        :param kwargs: forwarded to ``MongoDBLazySequence`` (host, port, db,
            collection, field).
        """
        if word_tokenizer is None:
            word_tokenizer = TreebankWordTokenizer()
        if sent_tokenizer is None:
            sent_tokenizer = LazyLoader('tokenizers/punkt/english.pickle')
        self._seq = MongoDBLazySequence(**kwargs)
        self._word_tokenize = word_tokenizer.tokenize
        self._sent_tokenize = sent_tokenizer.tokenize

    def text(self):
        """ Return lazy iterator over the texts in the corpus. """
        return self._seq

    def words(self):
        """Return a lazy, flattened sequence of word tokens over all texts."""
        return LazyConcatenation(LazyMap(self._word_tokenize, self.text()))

    def sents(self):
        """Return a lazy, flattened sequence of sentences over all texts."""
        return LazyConcatenation(LazyMap(self._sent_tokenize, self.text()))
53