1
2
# Module-level descriptor dict. 'requires' names the 'contents' key that the
# dispatch code at the bottom of this file reads from its input dict, and
# 'provides' names the 'text' and 'metadata' keys of the dict it returns.
# NOTE(review): how 'from' and 'to' are consumed is not visible in this file;
# presumably the surrounding framework reads them -- confirm.
3 __meta__ = {'from': 'gridfs-file',
4 'requires': ['contents'],
5 'to': 'document',
6 'provides': ['text', 'metadata'],}
7
8 import shlex
9 from subprocess import Popen, PIPE
10 from mimetypes import guess_type
11 from re import compile as re_compile
12
13
# Splitting on this pattern yields, for every tag, the whole tag (group 1) and
# its name with an optional leading '/' (group 2). Only ASCII letters are
# captured as the name.
regexp_tags = re_compile(r'(<[ \t]*(/?[a-zA-Z]*)[^>]*>)')


def parse_html(html, remove_tags=None, remove_inside=None, replace_with=' '):
    """Strip markup from *html* and return the remaining text.

    Parameters:
        html: the markup to process (a string).
        remove_tags: when truthy, every tag whose name is not listed in
            ``remove_inside`` is replaced by ``replace_with``. When falsy,
            such tags are left in the output untouched.
        remove_inside: iterable of tag names (e.g. ``['script', 'style']``)
            whose elements are dropped together with everything between the
            opening tag and its closing tag; the whole element is replaced
            by a single newline. ``None`` means "no such tags".
        replace_with: text substituted for each removed tag.

    Returns:
        The processed string.

    Notes:
        * An opening tag listed in ``remove_inside`` that is never closed
          swallows everything up to the end of the input instead of raising
          ``ValueError``.
        * Tags found inside a removed element are not processed again, so no
          ``replace_with`` residue is left where the element used to be.
    """
    # Normalise so the membership tests below never run against None.
    remove_inside = remove_inside or ()

    # The pattern has two capture groups, so split() returns a flat list of
    # text, whole tag, tag name, text, whole tag, tag name, ..., text.
    pieces = regexp_tags.split(html)
    content_between = pieces[::3]
    complete_tags = pieces[1::3]
    tag_names = pieces[2::3]

    removed_up_to = -1  # index of the last tag already wiped by a removal
    for index, tag_name in enumerate(tag_names):
        if index <= removed_up_to:
            # This tag sits inside an element that was removed wholesale.
            continue

        search_tag = tag_name[1:] if tag_name.startswith('/') else tag_name
        if remove_tags and search_tag not in remove_inside:
            complete_tags[index] = replace_with

        if tag_name in remove_inside:
            try:
                remove_to = tag_names.index('/' + tag_name, index)
                text_end = remove_to + 1
            except ValueError:
                # No closing tag: drop the rest of the document, including
                # the trailing text after the last tag.
                remove_to = len(tag_names) - 1
                text_end = len(content_between)
            complete_tags[index:remove_to + 1] = [''] * (remove_to - index + 1)
            # content_between[k] is the text that precedes tag k, so the
            # element's own text starts at index + 1.
            content_between[index + 1] = '\n'
            for position in range(index + 2, text_end):
                content_between[position] = ''
            removed_up_to = remove_to

    # There is one more text chunk than there are tags; pad so zip() keeps
    # the trailing text.
    complete_tags.append('')
    return ''.join(text + tag
                   for text, tag in zip(content_between, complete_tags))
36
47
# NOTE(review): the enclosing `def` line is not visible in this view. The call
# `extract_pdf(file_data['contents'])` further down suggests these statements
# are the body of `extract_pdf(data)` -- confirm against the full file.
#
# Feeds `data` on stdin to two external commands, `pdftotext -q - -` and
# `pdfinfo -meta -`, and returns a (text, metadata) pair, or (None, None)
# when extraction is considered to have failed.
49 pdf2text = Popen(shlex.split('pdftotext -q - -'), stdin=PIPE, stdout=PIPE,
50 stderr=PIPE)
51 pdfinfo = Popen(shlex.split('pdfinfo -meta -'), stdin=PIPE, stdout=PIPE,
52 stderr=PIPE)
# communicate() sends the input, reads stdout/stderr until EOF and waits for
# the process to exit; the same `data` is sent to both processes.
53 text, text_err = pdf2text.communicate(input=data)
54 meta_out, meta_err = pdfinfo.communicate(input=data)
# get_pdf_metadata is not defined in the visible part of this file.
55 try:
56 metadata = get_pdf_metadata(meta_out)
# NOTE(review): a bare `except:` also catches KeyboardInterrupt/SystemExit,
# and any parsing failure silently becomes empty metadata.
57 except:
58 metadata = {}
59
# Both text and metadata must be non-empty: with the {} fallback above, a
# metadata failure discards the extracted text as well.
60 if not (text and metadata):
61 return None, None
# Any output on pdftotext's stderr is treated as failure; output on pdfinfo's
# stderr only replaces the metadata with None.
62 elif not text_err:
63 return text.strip(), None if meta_err else metadata
64 else:
65 return None, None
66
# NOTE(review): the enclosing `def` line is not visible in this view. These
# statements read a `file_data` mapping ('name' and 'contents' keys) and end
# in a `return`, so they appear to be the body of the module's entry-point
# function -- confirm its name and signature against the full file.
#
# Picks an extractor from the MIME type guessed from the file name and
# returns a dict with 'text' and 'metadata' keys.
# guess_type() works from the name only; the contents are not inspected.
68 file_mime_type = guess_type(file_data['name'])[0]
# Only the PDF branch below produces metadata; other branches leave it None.
69 metadata = None
70 if file_mime_type == 'text/plain':
71 text = file_data['contents']
72 elif file_mime_type == 'text/html':
# Tags are replaced by parse_html's default replace_with (a space); script and
# style elements are removed together with their contents.
73 text = parse_html(file_data['contents'], True, ['script', 'style'])
74 elif file_mime_type == 'application/pdf':
75 text, metadata = extract_pdf(file_data['contents'])
# NOTE(review): there is no else branch. For any other MIME type (including
# None for an unknown extension) `text` is never assigned, so the return below
# would raise a NameError/UnboundLocalError -- confirm intended behaviour.
76 return {'text': text, 'metadata': metadata}
77
78
79
80
81
82
83