Меню
Главная
Случайная статья
Настройки
|
# -*- coding: utf-8 -*-
import sys, re, time, unicodedata
import wikipedia, catlib, pagegenerators
import codecs
class ParseErr(Exception):
    """Raised when a template string cannot be split into ``|name=value`` parts."""
class BoxTemplate:
    """Parsed form of a template call written as ``title|name=value|...``.

    The constructor receives the template text (as stored by the caller,
    i.e. the part between the outer braces) and splits it into a title and
    a dictionary of parameters.  Parameter values can be read and changed,
    and ``tostring()`` renders the template text with the current values.
    """

    # One ``|name=value`` segment.  Inside a value, a ``[...]`` or ``{...}``
    # group is consumed as a unit, so a ``|`` inside such a group does not
    # terminate the value.
    _PARAM_RE = re.compile(
        r'(\|([^=\|]+)=(([^\[{\|]*(\[[^\]]*\]|{[^}]*})?)*))')

    _title = None
    _pars = {}       # placeholder only; parse() binds a fresh dict per instance
    _text = ''
    _head = ''       # raw text before the first '|' (title with its whitespace)
    _segments = ()   # (raw_name, original_value) for each parsed parameter
    _parsed = {}     # snapshot of the values exactly as parse() found them

    def __init__(self, templ_str):
        """Parse *templ_str*; raises ParseErr if it is not well-formed."""
        self.parse(templ_str)

    def parse(self, templ_str):
        """Split *templ_str* into title and parameters.

        Raises:
            ParseErr: if the text has no title/parameter part, or if the
                parameter part is not exactly a sequence of ``|name=value``
                segments.
        """
        tmpl = re.match(r'(?P<title>[^\|]+)(?P<pars>.+)', templ_str, re.DOTALL)
        if tmpl is None:
            # Previously this fell through to an AttributeError on None.
            raise ParseErr
        pars = tmpl.group('pars')
        found = self._PARAM_RE.findall(pars)
        # The segments must cover the parameter text completely; anything
        # left over means the template uses syntax this parser does not
        # understand.
        if ''.join(m[0] for m in found) != pars:
            raise ParseErr

        self._title = tmpl.group('title').strip()
        self._head = tmpl.group('title')
        self._segments = [(m[1], m[2]) for m in found]
        self._pars = {}
        for raw_name, value in self._segments:
            self._pars[raw_name.strip()] = value
        self._parsed = dict(self._pars)
        self._text = templ_str

    def tostring(self):
        """Return the template text with the current parameter values.

        The text is rebuilt from the parsed segments, so each value is
        written to exactly the parameter it belongs to (names that are a
        suffix of other names, or that occur inside values, cannot be
        confused).  Untouched parameters are emitted verbatim.  Parameters
        added with ``setParam`` that were not in the parsed text are
        appended as ``|name=value`` in sorted name order.
        """
        pieces = [self._head]
        written = set()
        for raw_name, original in self._segments:
            key = raw_name.strip()
            written.add(key)
            current = self._pars.get(key, original)
            if current == self._parsed[key]:
                # Value was not changed since parsing: keep this segment's
                # own original text (matters when a name occurs twice).
                current = original
            pieces.append('|' + raw_name + '=' + current)
        for key in sorted(self._pars):
            if key not in written:
                pieces.append('|%s=%s' % (key, self._pars[key]))
        self._text = ''.join(pieces)
        return self._text

    def debug(self, file):
        """Write the title and every ``name =value`` pair to *file*."""
        # file.write() produces the same output as the Python 2
        # ``print >> file`` form and also works where that form does not.
        file.write('%s\n' % self._title)
        for k, v in self._pars.items():
            file.write('%s =%s\n' % (k, v))

    def hasParam(self, k):
        """Return True if a parameter named *k* is present."""
        return k in self._pars

    def getParam(self, k):
        """Return the value of parameter *k*; raises KeyError if absent."""
        return self._pars[k]

    def setParam(self, k, val):
        """Set parameter *k* to *val* (the raw text following ``=``)."""
        self._pars[k] = val

    def Params(self):
        """Return the set of parameter names."""
        return set(self._pars.keys())
class LazyPage:
    """Wrapper around a wiki page that loads categories and templates on demand.

    Categories and templates are fetched from the wrapped page only when a
    method first needs them.  Edits are made to the cached copies and are
    turned into new page text by ``tostring()``; nothing is saved here.
    """

    _cats = None     # result of page.categories(); None until first needed
    _templs = None   # template name -> text between the outer braces
    _page = None
    # Evaluated once, when the class body is executed.
    _site = wikipedia.getSite()
    _cat_namespace = _site.category_namespaces()[0]

    def __init__(self, page):
        """Remember *page*; nothing is fetched yet."""
        self._page = page

    def _fill_cats(self):
        """Load the page's categories on first use."""
        if self._cats is None:
            self._cats = self._page.categories()

    def _fill_templs(self):
        """Scan the page text for templates on first use.

        Templates without parameters map to their own name; templates with
        parameters map to their full inner text.  A name that is seen by
        the parameterised scan while already recorded is ambiguous and is
        dropped from the cache.
        """
        if self._templs is not None:
            return
        # Fetch first so that a failing get() leaves the cache unfilled.
        text = self._page.get()
        self._templs = {}
        junk = set()
        for templ in re.findall(r'{{(?P<t>[^\|}]+)}}', text):
            self._templs[templ] = templ
        for x in re.findall(r'{{(([^}\|\s]+)[\s]*\|([^}{]({{[^}{]+}})?)+)}}', text):
            if x[1] in self._templs:
                junk.add(x[1])
            else:
                self._templs[x[1]] = x[0]
        for name in junk:
            del self._templs[name]

    def _stored_name(self, t_head):
        """Return the cache key for *t_head*, ignoring first-letter case.

        The lower-case-initial spelling is tried first, then the
        upper-case-initial one.  Returns None if neither is cached.
        """
        for variant in (t_head[0].lower() + t_head[1:],
                        t_head[0].upper() + t_head[1:]):
            if variant in self._templs:
                return variant
        return None

    def delCat(self, cat_name):
        """Remove the first category whose title matches *cat_name*.

        *cat_name* is used as a regular expression, prefixed with the
        category namespace and ``:``.  Returns True if a category was
        removed from the cached list, False otherwise.
        """
        self._fill_cats()
        p = re.compile(self._cat_namespace + ':' + cat_name)
        for catpl in self._cats:
            if p.match(catpl.title()):
                # Safe to mutate here: the loop is left immediately.
                self._cats.remove(catpl)
                wikipedia.output(u"%s removed from %s." % (self._page.title(),catpl.title()))
                return True
        return False

    def getTempl(self, t_head):
        """Return the cached text of template *t_head*, or None if absent."""
        self._fill_templs()
        key = self._stored_name(t_head)
        if key is None:
            return None
        return self._templs[key]

    def addTempl(self, t_head, t_all = None):
        """Register template *t_head* with text *t_all* (default: *t_head*).

        Returns False without changes if the template is already cached
        under either first-letter case, True after adding it.
        """
        self._fill_templs()
        if t_all is None:
            t_all = t_head
        if self._stored_name(t_head) is not None:
            return False
        self._templs[t_head] = t_all
        wikipedia.output(u"template {{%s}} added to %s." % (t_head, self._page.title()))
        return True

    def changeTempl(self, t_head, t_all):
        """Replace the cached text of template *t_head* with *t_all*.

        Returns False if the text is already equal to *t_all*, True after a
        change.  Raises KeyError if the template is not cached.
        """
        self._fill_templs()
        key = self._stored_name(t_head)
        if key is None:
            raise KeyError(t_head[0].upper() + t_head[1:])
        if self._templs[key] == t_all:
            return False
        self._templs[key] = t_all
        wikipedia.output(u"template {{%s}} changed." % key)
        return True

    def tostring(self):
        """Return the page text with cached category/template edits applied.

        Templates that already occur in the text are replaced in place;
        templates that do not occur are prepended on their own line.
        """
        text = self._page.get()
        # ``is not None`` rather than truthiness: an empty list means every
        # category was removed, and that removal must still be written out.
        if self._cats is not None:
            text = wikipedia.replaceCategoryLinks(text, self._cats)
        if self._templs:
            for name, val in self._templs.items():
                # Template names are literal text, not regex syntax.
                escaped = re.escape(name)
                if re.search(r'{{%s\s*(\||})' % escaped, text):
                    replacement = u'{{%s}}' % val
                    # A callable replacement is inserted verbatim, so
                    # backslashes in the template text are not interpreted.
                    text = re.sub(r'{{%s\s*\|([^}{]({{[^}{]+}})?)+}}' % escaped,
                                  lambda _m, repl=replacement: repl, text)
                else:
                    text = '{{%s}}\n' % val + text
        return text
# --- Names used when inspecting film articles -------------------------------
film_box_templ = u'Фильм'                 # name of the film infobox template
year_str = u'Год'                         # infobox parameter holding the year
imdb_str = u'IMDbLink'                    # infobox parameter holding the IMDb link
film_by_alphabet = u'Фильмы по алфавиту'  # category the bot iterates over
film_by_year = u'Фильмы [0-9]{4} года'    # regex for per-year film categories

# Parameter names accepted in the film infobox; anything else is reported
# to errbox.txt.
_VALID_PARAM_NAMES = (
    u'РусНаз',
    u'ОригНаз',
    u'Изображение',
    u'Жанр',
    u'Режиссёр',
    u'Продюсер',
    u'Сценарист',
    u'Актёры',
    u'Оператор',
    u'Композитор',
    u'Компания',
    u'Время',
    u'Страна',
    u'Год',
    u'Бюджет',
    u'imdb_id',
    u'IMDbLink',
)
valid_params = set(_VALID_PARAM_NAMES)

# --- Wiki handles ------------------------------------------------------------
site = wikipedia.getSite()
cat_namespace = site.category_namespaces()[0]

# --- Output files (UTF-8, truncated on every run) ----------------------------
log_file = codecs.open('log.txt', 'w', 'utf-8')
errbox_file = codecs.open('errbox.txt', 'w', 'utf-8')
def log(string):
    """Append *string* as one line to the run log and flush it at once."""
    line = string + '\n'
    log_file.write(line)
    log_file.flush()
def _join_summary(descr, action):
    """Return edit summary *descr* extended by *action*.

    The first action is prefixed with the bot marker, later ones are
    separated by a comma.
    """
    prefix = u', ' if descr != '' else u'робот '
    return descr + prefix + action


def fix_kinopage(page):
    """Normalise the film infobox of *page* and save the page if it changed.

    Steps, in order:
      * skip pages that cannot be edited and redirect pages (both logged);
      * if the page has a film infobox: reduce a linked year to the plain
        four digits and drop the alphabet/year categories, add the
        ``IMDb-stub`` template when the IMDb link is only a search URL,
        and report unknown infobox parameters to ``errbox_file``;
      * otherwise add the ``no filmbox`` template;
      * save the page when at least one change was made.

    ParseErr raised by the infobox parser is not handled here and
    propagates to the caller.  An edit conflict while saving is logged.
    """
    if not page.canBeEdited() or not page.botMayEdit():
        log(u'Warning: Page [[%s]] is bloked. Skipping. ' % page.title())
        return
    if page.isRedirectPage():
        log(u'Warning: Page [[%s]] is a redirect-page. Skipping' % page.title())
        return

    lp = LazyPage(page)
    descr = ''
    box_text = lp.getTempl(film_box_templ)
    if box_text:
        box = BoxTemplate(box_text)
        if box.hasParam(year_str):
            year = box.getParam(year_str)
            # Accepted forms: 2000, [[2000]], [[target|2000]].
            paterns = [
                r'\s*(?P<par>(?P<val>[0-9]{4}))\s*$',
                r'\s*(?P<par>\[\[(?P<val>[0-9]{4})\]\])\s*$',
                r'\s*(?P<par>\[\[[^\|]+\|(?P<val>[0-9]{4})\]\])\s*$'
            ]
            for pat in paterns:
                new_year = re.match(pat, year)
                if new_year:
                    box.setParam(year_str, year.replace(new_year.group('par'), new_year.group('val')))
                    # Both removals are attempted, hence no short-circuit.
                    b = lp.delCat(film_by_alphabet)
                    c = lp.delCat(film_by_year)
                    if b or c:
                        descr = _join_summary(descr, u'удалил лишние категории')
                    break
            if lp.changeTempl(film_box_templ, box.tostring()):
                descr = _join_summary(descr, u'изменил шаблон фильма')
        if box.hasParam(imdb_str):
            imdb = box.getParam(imdb_str)
            if re.search(r'http:\/\/www.imdb.com\/find\?s=', imdb):
                # Only report (and later save) when the template was really
                # added; addTempl returns False if it is already present.
                # NOTE(review): the summary spells the template
                # "IMDB-stub" while "IMDb-stub" is what gets added - confirm
                # which spelling is intended.
                if lp.addTempl('IMDb-stub'):
                    descr = _join_summary(descr, u'добавил шаблон {{IMDB-stub}}')
        # Any name outside the accepted set is reported.  The former ``>``
        # test was a strict-superset check and fired only when the box also
        # contained every accepted parameter.
        unknown = box.Params() - valid_params
        if unknown:
            errbox_file.write(u'# [[%s]]:' % page.title())
            for m in sorted(unknown):
                errbox_file.write(u' %s;' % m)
            errbox_file.write('\n')
    else:
        # Same rule as above: no summary and no save if nothing was added.
        if lp.addTempl(u'no filmbox', None):
            descr = u'робот добавил шаблон {{no filmbox}}'

    if descr != '':
        wikipedia.setAction(descr)
        try:
            page.put(lp.tostring())
        except wikipedia.EditConflict:
            log(u'Error:Edit conflict at saving page [[%s]]. Skipping.' % page.title())
# Walk every page of the alphabetical film category, starting at title "!",
# and process each page independently.
start = "!"
generator = pagegenerators.CategorizedPageGenerator(
    catlib.Category(site, cat_namespace + ':' + film_by_alphabet), start = start)
generator = pagegenerators.PreloadingGenerator(generator)
try:
    for page in generator:
        try:
            fix_kinopage(page)
        except ParseErr:
            log(u'Error: Filmbox in page [[%s]] is bad-formed. Skipping.' % page.title())
        except Exception:
            # Exception rather than a bare except: KeyboardInterrupt and
            # SystemExit must still be able to stop the run.
            log(u'Error: Processing of page [[%s]] has failed. Skipping.' % page.title())
finally:
    # Close the output files even when the run is interrupted or the
    # generator itself fails.
    log_file.close()
    errbox_file.close()
|
|