""" Implement a MongoDB-backed NLTK Corpus reader

Adapted from "Text processing with NLTK cookbook"
To use, instantiate ``MongoDBCorpusReader`` with parameters for host, port,
database, collection and field.
"""
8
import pymongo
from nltk.data import LazyLoader
from nltk.tokenize import TreebankWordTokenizer
from nltk.util import AbstractLazySequence, LazyMap, LazyConcatenation
13
14
class MongoDBLazySequence(AbstractLazySequence):
    """Lazy, read-only sequence view over one field of a MongoDB collection.

    Implements the ``AbstractLazySequence`` protocol (``__len__`` plus
    ``iterate_from``) so documents are fetched from the server on demand
    rather than loaded into memory up front.
    """

    def __init__(self, host='127.0.0.1', port=27017, db='test',
                 collection='documents', field='text'):
        """Connect to MongoDB and remember which document field holds the text.

        :param host: MongoDB server host name or address.
        :param port: MongoDB server port.
        :param db: database name.
        :param collection: collection holding the corpus documents.
        :param field: document field whose value is exposed as corpus text.
        """
        # NOTE(review): ``pymongo.Connection`` was removed in pymongo 3.x in
        # favour of ``pymongo.MongoClient`` — confirm the pymongo version this
        # project pins before modernizing this call.
        self.conn = pymongo.Connection(host, port)
        self.collection = self.conn[db][collection]
        self.field = field

    def __len__(self):
        """Return the number of documents in the collection."""
        return self.collection.count()

    def iterate_from(self, start):
        """Iterate over the text field of each document from index ``start``.

        Documents missing the field yield an empty string instead of raising.
        """
        extract = lambda d: d.get(self.field, '')
        # Ask the server for only the needed field; ``skip`` lets the lazy
        # sequence resume mid-stream as AbstractLazySequence requires.
        return iter(LazyMap(extract, self.collection.find(fields=[self.field],
                                                          skip=start)))
29
class MongoDBCorpusReader(object):
    """ Corpus Reader to deal with text stored on a MongoDB collection """

    def __init__(self, word_tokenizer=None, sent_tokenizer=None, **kwargs):
        """Create a reader over a MongoDB-backed corpus.

        :param word_tokenizer: object with a ``tokenize`` method used to split
            a text into words; defaults to ``TreebankWordTokenizer``.
        :param sent_tokenizer: object with a ``tokenize`` method used to split
            a text into sentences; defaults to the lazily loaded English
            Punkt model.
        :param kwargs: forwarded to ``MongoDBLazySequence`` (host, port, db,
            collection, field).
        """
        if word_tokenizer is None:
            word_tokenizer = TreebankWordTokenizer()
        if sent_tokenizer is None:
            sent_tokenizer = LazyLoader('tokenizers/punkt/english.pickle')
        self._seq = MongoDBLazySequence(**kwargs)
        self._word_tokenize = word_tokenizer.tokenize
        self._sent_tokenize = sent_tokenizer.tokenize

    def text(self):
        """ Return lazy iterator over the texts in the corpus. """
        return self._seq

    def words(self):
        """Return a lazy, flattened sequence of word tokens over all texts."""
        return LazyConcatenation(LazyMap(self._word_tokenize, self.text()))

    def sents(self):
        """Return a lazy, flattened sequence of sentences over all texts."""
        return LazyConcatenation(LazyMap(self._sent_tokenize, self.text()))
53