Меню

Главная
Случайная статья
Настройки
Участник:LankLinkBot/wikificator.py
Материал из https://ru.wikipedia.org

#!/usr/bin/env python
# -*- mode: python; coding: utf-8; -*-
import re
from htmlentitydefs import name2codepoint

# Non-breaking space (U+00A0); spliced into the replacement strings in repl.
u = u'\xa0' # nbsp

def entities(m):
    """Return the character for the HTML entity captured by match *m*.

    The match is expected to carry three groups: hexadecimal digits
    (group 1), decimal digits (group 2) and an entity name (group 3).
    The first non-empty numeric group is converted with ``unichr``; a name
    is looked up in ``name2codepoint``.  When nothing applies, the matched
    text is returned unchanged.
    """
    # Numeric forms first: (group index, radix of its digits).
    for group_index, radix in ((1, 16), (2, 10)):
        digits = m.group(group_index)
        if digits:
            return unichr(int(digits, radix))

    entity_name = m.group(3)
    if entity_name in name2codepoint:
        return unichr(name2codepoint[entity_name])

    # Unknown entity name: leave the source text as it was.
    return m.group()

repl = (
    (ur'&(?:#x([0-9a-f]{1,4})|#([0-9]{1,4})|([a-zA-Z0-9]+));', entities),
    # Hyphens and en dashes to pretty dashes
    (u'\x96', u'-'),
    (u'\x97', u'—'),
    (ur'–', '-'), # – ->  hyphen
    (ur'&(#151|[nm]dash);', ur'—'), # -> —
    (ur'( |\s)-{1,3} ', ur'\1— '), # hyphen -> —
    (ur'(\d)--(\d)', ur'\1\2'), # -> —

    # Entities etc.  Unicode chars
    (ur'( | )+', ur' '),
    (ur'\(tm\)', ur'™'),
    (ur'([^\.])\.\.\.([^\.]|$)', ur'\1\2'),
    (ur'\+-(?!\+|-)', ur'±'),
    (ur'~=', ur''),
    (ur'\^2(\D)', ur'\1'),
    (ur'\^3(\D)', ur'\1'),
    (ur'([\w])\'([\w])', ur'\1\2'), #'
    (ur'¹¹', ur'¹'),
    (ur'N°', ur'¹'),

    # ""  «»
    (ur'(^|[\s\x02!|#\'"\\/\(;+-])"([^"]*)([^\s"\(|])"([^\w]|$)', ur'\1«\2\3»\4'),
    (ur'(^|[\s\x02!|#\'"\\/\(;+-])"([^"]*)([^\s"\(|])"([^\w]|$)', ur'\1«\2\3»\4'),
    (ur'«([^»]*)«([^»]*)»', ur\1\2“'),

    # Insert/delete spaces
    (ur'(\S) (-{1,3}|—) (\S)', ur'\1'+u+ur'— \3'),
    (ur'([À-ß]\.) ?([À-ß]\.) ?([À-ß])', ur'\1'+u+ur'\2'+u+ur'\3', re.U),
    (ur'([^%\/\w]\d+?(?:[.,]\d+?)?) ?([%‰])(?!-[\w])', ur'\1'+u+ur'\2'), # 5 %
    (ur'(\d) ([%‰])(?=-[\w])', ur'\1\2'), #5%-é
    (ur'([¹§])(\s*)(\d)', ur'\1'+u+ur'\3'),
    (ur'\( +', ur'('), (ur' +\)', ur')'), # inside ()
    (ur'___+', ur''),
    (ur'</?[a-z]+ ?/?>', ur''),
    (ur'', ur'«'), (ur'', ur'»'),
    )

def wikify(s):
    """Run every rule in ``repl`` over *s*, in order, and return the result.

    A rule is ``(pattern, replacement)`` or ``(pattern, replacement, flags)``.
    Rules without explicit flags are compiled case-insensitively and
    Unicode-aware.
    """
    default_flags = re.I | re.U
    for rule in repl:
        if len(rule) != 3:
            pattern, replacement = rule
            flags = default_flags
        else:
            pattern, replacement, flags = rule
        s = re.compile(pattern, flags).sub(replacement, s)
    return s
Downgrade Counter