Меню
Главная
Случайная статья
Настройки
|
#!/usr/bin/env python
# -*- mode: python; coding: utf-8; -*-
import re
from htmlentitydefs import name2codepoint
# No-break space (U+00A0). Concatenated into several replacement strings of
# the `repl` table below so that the joined pieces cannot be split by a line
# break.
u = u'\xa0' # nbsp
def entities(m):
    """Replacement callback: turn one matched character reference into text.

    ``m`` is a match object whose group 1 holds the hex digits of a
    ``&#x...;`` reference, group 2 the decimal digits of a ``&#...;``
    reference and group 3 the name of a ``&name;`` reference.

    Returns the corresponding character, or the complete matched text
    unchanged when the name is not listed in ``name2codepoint``.
    """
    # Groups are read one at a time, in priority order: hex, decimal, named.
    hex_digits = m.group(1)
    if hex_digits:
        return unichr(int(hex_digits, 16))

    dec_digits = m.group(2)
    if dec_digits:
        return unichr(int(dec_digits, 10))

    entity_name = m.group(3)
    if entity_name in name2codepoint:
        return unichr(name2codepoint[entity_name])

    # Unknown entity name: leave the source text as it was.
    return m.group()
# Ordered table of typography rules consumed by wikify().
#
# Each entry is (pattern, replacement) or (pattern, replacement, flags).
# wikify() compiles two-element entries with re.I|re.U and three-element
# entries with exactly the flags they carry. The replacement is either a
# template string or a callable (see `entities`). Rules run top to bottom,
# so later rules see the output of earlier ones.
#
# NOTE(review): several literals below look damaged by a character-set
# conversion (empty patterns, empty replacements, Latin look-alike letters).
# They are flagged individually; verify them against a pristine copy.
repl = (
# Decode hex, decimal and named character references via entities().
(ur'&(?:#x([0-9a-f]{1,4})|#([0-9]{1,4})|([a-zA-Z0-9]+));', entities),
# Hyphens and en dashes to pretty dashes
# U+0096 / U+0097 are C1 control characters; presumably leftovers of the
# Windows code-page bytes for en dash / em dash -- TODO confirm.
(u'\x96', u'-'),
(u'\x97', u'—'),
(ur'–', '-'), # en dash -> hyphen
# Dash entities that are still present after the first rule -> em dash.
(ur'&(#151|[nm]dash);', ur'—'),
# One to three hyphens after a space/whitespace and before a space -> em dash.
(ur'( |\s)-{1,3} ', ur'\1— '),
# Double hyphen between two digits -> em dash.
(ur'(\d)--(\d)', ur'\1—\2'),
# ASCII approximations -> Unicode characters
# Collapse a run of spaces into one.
# NOTE(review): both alternatives look identical here; one of them may
# originally have been a no-break space or an entity -- confirm.
(ur'( | )+', ur' '),
(ur'\(tm\)', ur'™'),
# Three dots preceded by a non-dot character and followed by a non-dot
# character or the end of the string -> ellipsis.
(ur'([^\.])\.\.\.([^\.]|$)', ur'\1…\2'),
# "+-" not followed by another "+" or "-" -> plus-minus sign.
(ur'\+-(?!\+|-)', ur'±'),
# NOTE(review): the replacement is empty, so "~=" is simply deleted;
# the intended character appears to have been lost -- confirm.
(ur'~=', ur''),
# NOTE(review): as written these drop "^2" / "^3" before a non-digit and
# keep only the following character; the superscript characters appear
# to have been lost from the replacements -- confirm.
(ur'\^2(\D)', ur'\1'),
(ur'\^3(\D)', ur'\1'),
# Straight apostrophe between two word characters -> typographic apostrophe.
(ur'([\w])\'([\w])', ur'\1’\2'),
# NOTE(review): "¹" in the next two rules (and in the "[¹§]" rule further
# down) may be a mis-encoded numero sign -- confirm.
(ur'¹¹', ur'¹'),
(ur'N°', ur'¹'),
# Straight double quotes -> guillemets
# The same rule is listed twice on purpose or by accident; wikify() will
# apply it twice. Presumably the second pass picks up quotations the first
# pass could not match because they overlapped -- TODO confirm.
(ur'(^|[\s\x02!|#\'"\\/\(;+-])"([^"]*)([^\s"\(|])"([^\w]|$)', ur'\1«\2\3»\4'),
(ur'(^|[\s\x02!|#\'"\\/\(;+-])"([^"]*)([^\s"\(|])"([^\w]|$)', ur'\1«\2\3»\4'),
# A guillemet quotation opened inside another one: the inner pair becomes
# low/high double quotes.
(ur'«([^»]*)«([^»]*)»', ur'«\1„\2“'),
# Insert/delete spaces
# Space-dash-space between two non-space characters -> no-break space,
# em dash, ordinary space.
(ur'(\S) (-{1,3}|—) (\S)', ur'\1'+u+ur'— \3'),
# Two "letter + dot" groups followed by a letter (initials and a name) are
# joined with no-break spaces. Compiled with re.U only, i.e. case-sensitive.
# NOTE(review): the range À-ß may be a mis-encoded Cyrillic capital range
# -- confirm.
(ur'([À-ß]\.) ?([À-ß]\.) ?([À-ß])', ur'\1'+u+ur'\2'+u+ur'\3', re.U),
# Number (optionally with a decimal part) not preceded by %, / or a word
# character, then an optional space and % or per-mille: put a no-break
# space before the sign, unless "-<word char>" follows. E.g. "5 %".
(ur'([^%\/\w]\d+?(?:[.,]\d+?)?) ?([%‰])(?!-[\w])', ur'\1'+u+ur'\2'),
# Digit, space, % or per-mille followed by "-<word char>": drop the space.
# E.g. "5 %-x" -> "5%-x".
(ur'(\d) ([%‰])(?=-[\w])', ur'\1\2'),
# "¹" or "§", optional whitespace, digit -> exactly one no-break space
# between the sign and the digit.
(ur'([¹§])(\s*)(\d)', ur'\1'+u+ur'\3'),
(ur'\( +', ur'('), (ur' +\)', ur')'), # no spaces just inside ()
# Runs of three or more underscores are removed.
(ur'___+', ur''),
# Simple tags made only of letters (no attributes) are removed.
(ur'</?[a-z]+ ?/?>', ur''),
# NOTE(review): both patterns below are EMPTY. An empty regex matches at
# every position, so as written these rules insert « and then » between all
# characters of the text. The original pattern characters were almost
# certainly lost; restore them before using this table.
(ur'', ur'«'), (ur'', ur'»'),
)
def wikify(s):
    """Apply every rule of the ``repl`` table to ``s`` and return the result.

    Rules are applied in table order, each one to the output of the previous
    one. A rule is ``(pattern, replacement)`` or
    ``(pattern, replacement, flags)``; when no flags are given the pattern is
    compiled case-insensitively and Unicode-aware.
    """
    default_flags = re.I | re.U
    for rule in repl:
        if len(rule) == 3:
            # The rule brings its own flags, which replace the defaults.
            pattern, replacement, flags = rule
        else:
            pattern, replacement = rule
            flags = default_flags
        s = re.compile(pattern, flags).sub(replacement, s)
    return s
|
|