| Home | Trees | Indices | Help |
|
|---|
|
|
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 #
4 # Copyright 2006 Zuza Software Foundation
5 #
6 # This file is part of translate.
7 #
8 # This program is free software; you can redistribute it and/or modify
9 # it under the terms of the GNU General Public License as published by
10 # the Free Software Foundation; either version 2 of the License, or
11 # (at your option) any later version.
12 #
13 # This program is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU General Public License for more details.
17 #
18 # You should have received a copy of the GNU General Public License
19 # along with this program; if not, see <http://www.gnu.org/licenses/>.
20
21 """Module to deal with different types and uses of segmentation"""
22
23 #XXX: This module is now deprecated. Use the language-specific segmenters in
24 # the lang package (character_iter, word_iter, sentence_iter, etc.) instead.
25
26 punctuation = u".,;:!?-@#$%^*_()[]{}/\\'\"<>‘’‚‛“”„‟′″‴‵‶‷‹›«»±³¹²°¿©®×£¥"
27
28
30 """Returns an iterator over the characters in text."""
31 #We don't return more than one consecutive whitespace character
32 prev = 'A'
33 for c in text:
34 if c.isspace() and prev.isspace():
35 continue
36 prev = c
37 if not (c in punctuation):
38 yield c.lower()
39
40
44
45
47 """Returns an iterator over the words in text."""
48 #TODO: Consider replacing punctuation with space before split()
49 for w in text.split():
50 yield w.strip(punctuation).lower()
51
52
56
57
59 """Returns an iterator over the sentences in text."""
60 #TODO: This is very naïve. We really should consider all punctuation,
61 #and return the punctuation with the sentence.
62 #TODO: Search for capital letter start with next sentence to avoid
63 #confusion with abbreviations. And remember Afrikaans "'n" :-)
64 for s in text.split(". "):
65 yield s.strip()
66
67
71
| Home | Trees | Indices | Help |
|
|---|
| Generated by Epydoc 3.0.1 on Tue Apr 12 18:12:01 2011 | http://epydoc.sourceforge.net |