From 6f9d3cbe88b64684313b9507409fae5b4977f80c Mon Sep 17 00:00:00 2001
From: Boris Bobrov <breton@cynicmansion.ru>
Date: Tue, 11 Mar 2014 15:50:01 +0500
Subject: [PATCH 1/2] added new slugify with unidecode
---
mediagoblin/tests/test_util.py | 6 ++++++
mediagoblin/tools/url.py | 18 ++----------------
setup.py | 1 +
3 files changed, 9 insertions(+), 16 deletions(-)
diff --git a/mediagoblin/tests/test_util.py b/mediagoblin/tests/test_util.py
index bc14f52..9d9b1c1 100644
a
|
b
|
def test_slugify():
|
77 | 77 | assert url.slugify(u'a w@lk in the park?') == u'a-w-lk-in-the-park' |
78 | 78 | assert url.slugify(u'a walk in the par\u0107') == u'a-walk-in-the-parc' |
79 | 79 | assert url.slugify(u'\u00E0\u0042\u00E7\u010F\u00EB\u0066') == u'abcdef' |
| 80 | # Russian |
| 81 | assert url.slugify(u'\u043f\u0440\u043e\u0433\u0443\u043b\u043a\u0430 ' |
| 82 | u'\u0432 \u043f\u0430\u0440\u043a\u0435') == u'progulka-v-parke' |
| 83 | # Korean |
| 84 | assert (url.slugify(u'\uacf5\uc6d0\uc5d0\uc11c \uc0b0\ucc45') == |
| 85 | u'gongweoneseo-sancaeg') |
80 | 86 | |
81 | 87 | def test_locale_to_lower_upper(): |
82 | 88 | """ |
diff --git a/mediagoblin/tools/url.py b/mediagoblin/tools/url.py
index d9179f9..657c037 100644
a
|
b
|
|
15 | 15 | # along with this program. If not, see <http://www.gnu.org/licenses/>. |
16 | 16 | |
17 | 17 | import re |
18 | | # This import *is* used; see word.encode('tranlit/long') below. |
19 | | from unicodedata import normalize |
20 | | |
21 | | try: |
22 | | import translitcodec |
23 | | USING_TRANSLITCODEC = True |
24 | | except ImportError: |
25 | | USING_TRANSLITCODEC = False |
26 | | |
| 18 | from unidecode import unidecode |
27 | 19 | |
28 | 20 | _punct_re = re.compile(r'[\t !"#:$%&\'()*\-/<=>?@\[\\\]^_`{|},.]+') |
29 | 21 | |
… |
… |
def slugify(text, delim=u'-'):
|
34 | 26 | """ |
35 | 27 | result = [] |
36 | 28 | for word in _punct_re.split(text.lower()): |
37 | | if USING_TRANSLITCODEC: |
38 | | word = word.encode('translit/long') |
39 | | else: |
40 | | word = normalize('NFKD', word).encode('ascii', 'ignore') |
41 | | |
42 | | if word: |
43 | | result.append(word) |
| 29 | result.extend(unidecode(word).split()) |
44 | 30 | return unicode(delim.join(result)) |
diff --git a/setup.py b/setup.py
index 7abd896..a3cc055 100644
a
|
b
|
try:
|
65 | 65 | 'pytz', |
66 | 66 | 'six', |
67 | 67 | 'oauthlib==0.5.0', |
| 68 | 'unidecode', |
68 | 69 | |
69 | 70 | ## Annoying. Please remove once we can! We only indirectly |
70 | 71 | ## use pbr, and currently it breaks things, presumably till |