Ru.Wikipedia.Org - Участник:XBot/fixfilmbox.py

Меню
Главная
Случайная статья
Настройки
Участник:XBot/fixfilmbox.py
Материал из https://ru.wikipedia.org
# -*- coding: utf-8  -*-

import sys, re, time, unicodedata
import wikipedia, catlib, pagegenerators
import codecs

class ParseErr(Exception):
    """Signals that a template string could not be split into parameters."""

class BoxTemplate:
    """Editable view of one template call (the text between ``{{`` and ``}}``).

    The text is split into a title and ``|name=value`` parameters.  Values
    are kept verbatim (including surrounding whitespace); names are stored
    stripped.  ``tostring()`` reassembles the text with the current values
    while keeping the original spelling of the title and of every
    ``|name=`` prefix.
    """

    # One ``|name=value`` chunk.  A value may contain ``[...]`` and ``{...}``
    # groups, inside which ``|`` does not terminate the value.
    _PARAM_RE = re.compile(
        r'(\|([^=\|]+)=(([^\[{\|]*(\[[^\]]*\]|{[^}]*})?)*))')

    _title = None
    _pars = None
    _text = ''

    def __init__(self, templ_str):
        self.parse(templ_str)

    def parse(self, templ_str):
        """Split *templ_str* into title and parameters.

        Raises ParseErr when the string has no title/parameter part at all
        or when the recognised ``|name=value`` chunks do not add up to the
        complete parameter part.
        """
        tmpl = re.match(r'(?P<title>[^\|]+)(?P<pars>.+)', templ_str, re.DOTALL)
        if tmpl is None:
            # Empty or one-character input: report it as a parse problem
            # instead of failing with AttributeError on None.
            raise ParseErr

        self._head = tmpl.group('title')
        self._title = self._head.strip()
        pars = tmpl.group('pars')

        self._pars = {}
        # (verbatim '|name=' prefix, stripped name) in source order, so that
        # tostring() can rebuild the text without searching for names again.
        self._chunks = []
        consumed = []
        for m in self._PARAM_RE.findall(pars):
            consumed.append(m[0])
            key = m[1].strip()
            self._pars[key] = m[2]
            self._chunks.append(('|' + m[1] + '=', key))

        if ''.join(consumed) != pars:
            raise ParseErr
        self._text = templ_str

    def tostring(self):
        """Return the template text with the current parameter values.

        Each parsed parameter is written back at its original position.  A
        name that occurs more than once receives the same (current) value at
        every occurrence.  Parameters added with setParam() that were not in
        the parsed text are appended as ``|name=value`` in sorted order.
        """
        parts = [self._head]
        written = set()
        for prefix, key in self._chunks:
            parts.append(prefix + self._pars[key])
            written.add(key)

        for key in sorted(self._pars):
            if key not in written:
                parts.append('|' + key + '=' + self._pars[key])

        self._text = ''.join(parts)
        return self._text

    def debug(self, file):
        """Print the title and every ``name =value`` pair to *file*."""
        # Python 2 print-chevron statements, kept as in the original script.
        print >> file, self._title
        for k, v in self._pars.items():
            print >> file, '%s =%s' % (k, v)

    def hasParam(self, k):
        """Return True if a parameter named *k* is present."""
        return k in self._pars

    def getParam(self, k):
        """Return the raw value of parameter *k*; raises KeyError if absent."""
        return self._pars[k]

    def setParam(self, k, val):
        """Set parameter *k* to the raw value *val*."""
        self._pars[k] = val

    def Params(self):
        """Return the set of parameter names."""
        return set(self._pars.keys())
		
class LazyPage:
    """Wrapper around a page that loads categories and templates on demand.

    Changes made through delCat/addTempl/changeTempl are only recorded in
    memory; tostring() applies them to the page text and returns the result.
    """

    _cats   = None   # category list, loaded by _fill_cats()
    _templs = None   # template name -> template text, loaded by _fill_templs()
    _page   = None
    _site   = wikipedia.getSite()
    _cat_namespace = _site.category_namespaces()[0]

    def __init__(self, page):
        self._page = page

    def _fill_cats(self):
        """Load the page's categories once."""
        if self._cats is None:
            self._cats = self._page.categories()

    def _fill_templs(self):
        """Scan the page text once and index the templates found in it.

        Parameterless templates map their name to itself; templates with
        parameters map their name to the full text between the braces.  A
        parameterised name that is already indexed (for example because the
        template occurs twice) is dropped from the index altogether.
        """
        if self._templs is not None:
            return

        # Build into a local dict so a failing page.get() does not leave a
        # half-initialised (empty) cache behind.
        text = self._page.get()
        found = {}
        junk = set()

        for templ in re.findall(r'{{(?P<t>[^\|}]+)}}', text):
            found[templ] = templ

        for x in re.findall(r'{{(([^}\|\s]+)[\s]*\|([^}{]({{[^}{]+}})?)+)}}', text):
            if x[1] in found:
                junk.add(x[1])
            else:
                found[x[1]] = x[0]

        for name in junk:
            del found[name]

        self._templs = found

    def _stored_name(self, t_head):
        """Return the indexed spelling of *t_head*, or None.

        Tries *t_head* with a lower-case first letter, then with an
        upper-case first letter.
        """
        # Slicing (not indexing) keeps an empty name from raising IndexError.
        for variant in (t_head[:1].lower() + t_head[1:],
                        t_head[:1].upper() + t_head[1:]):
            if variant in self._templs:
                return variant
        return None

    def delCat(self, cat_name):
        """Remove the first category whose title matches *cat_name*.

        *cat_name* is used as a regular expression, prefixed with the
        category namespace and matched from the start of the title with
        re.match.  Returns True if a category was removed, else False.
        """
        self._fill_cats()

        p = re.compile(self._cat_namespace + ':' + cat_name)
        for catpl in self._cats:
            if p.match(catpl.title()):
                # Safe although we are iterating: we return right away.
                self._cats.remove(catpl)
                wikipedia.output(u"%s removed from %s." % (self._page.title(), catpl.title()))
                return True

        return False

    def getTempl(self, t_head):
        """Return the indexed text of template *t_head*, or None if absent."""
        self._fill_templs()

        name = self._stored_name(t_head)
        if name is None:
            return None
        return self._templs[name]

    def addTempl(self, t_head, t_all = None):
        """Record a new template unless one named *t_head* is already indexed.

        *t_all* is the text to put between the braces; it defaults to
        *t_head*.  Returns True if the template was recorded, else False.
        """
        self._fill_templs()

        if t_all is None:
            t_all = t_head

        if self._stored_name(t_head) is not None:
            return False

        self._templs[t_head] = t_all
        wikipedia.output(u"template {{%s}} added to %s." % (t_head, self._page.title()))
        return True

    def changeTempl(self, t_head, t_all):
        """Replace the recorded text of template *t_head* with *t_all*.

        Returns True if the text differs from the recorded one, else False.
        Raises KeyError if the template is not indexed.
        """
        self._fill_templs()

        name = self._stored_name(t_head)
        if name is None:
            raise KeyError(t_head[:1].upper() + t_head[1:])

        if self._templs[name] == t_all:
            return False

        self._templs[name] = t_all
        wikipedia.output(u"template {{%s}} changed." % name)
        return True

    def tostring(self):
        """Return the page text with the recorded changes applied."""
        text = self._page.get()

        # "is not None", not truthiness: an empty list means every category
        # was removed, and that removal still has to reach the text.
        if self._cats is not None:
            text = wikipedia.replaceCategoryLinks(text, self._cats)

        if self._templs:
            for name, val in self._templs.items():
                # Template names are literal text, not regex fragments.
                escaped = re.escape(name)
                if re.search(r'{{%s\s*(\||})' % escaped, text):
                    replacement = u'{{%s}}' % val
                    # A function replacement inserts the text verbatim;
                    # a string would have its backslashes interpreted.
                    text = re.sub(r'{{%s\s*\|([^}{]({{[^}{]+}})?)+}}' % escaped,
                                  lambda m, repl=replacement: repl, text)
                else:
                    # Template not on the page yet: put it at the top.
                    text = '{{%s}}\n' % val + text

        return text


# Site object and the first of its category namespace names; used below to
# build the full title of the category that is walked.
site = wikipedia.getSite()
cat_namespace = site.category_namespaces()[0]

# Output files, opened for writing in UTF-8 when the script starts.
# log_file is written by log(); errbox_file is written by fix_kinopage()
# with the film-box parameters it does not recognise.
log_file    = codecs.open('log.txt', 'w', 'utf-8')
errbox_file = codecs.open('errbox.txt', 'w', 'utf-8')

# Wiki-side names used by fix_kinopage().
film_box_templ   = u'Фильм'               # name of the film-box template
year_str         = u'Год'                 # film-box parameter holding the year
imdb_str         = u'IMDbLink'            # film-box parameter holding the IMDb link
film_by_alphabet = u'Фильмы по алфавиту'  # category walked by the script
# Passed to LazyPage.delCat(), which compiles it as a regular expression.
film_by_year     = u'Фильмы [0-9]{4} года'
# Parameter names accepted in the film box; any other name found in a box is
# reported to errbox_file.
valid_params     = set([
u'РусНаз', 
u'ОригНаз',
u'Изображение',
u'Жанр',
u'Режиссёр',
u'Продюсер',
u'Сценарист',
u'Актёры',
u'Оператор',
u'Композитор',
u'Компания',
u'Время',
u'Страна',
u'Год',
u'Бюджет',
u'imdb_id',
u'IMDbLink'
])

def log(string):
    """Append *string* as one line to the log file and flush it at once."""
    line = string + '\n'
    log_file.write(line)
    log_file.flush()

def _append_action(descr, action):
    """Return edit summary *descr* extended by *action*.

    The first action is prefixed with u'робот '; later ones are joined
    with u', '.
    """
    if descr != '':
        return descr + u', ' + action
    return u'робот ' + action


def fix_kinopage(page):
    """Normalise the film box of *page* and save the page if it changed.

    Steps:
    * skip pages that cannot be edited and redirect pages (logged);
    * if the film-box template is found: reduce a year given as ``1999``,
      ``[[1999]]`` or ``[[...|1999]]`` to the bare number and drop the
      alphabet/year categories; add {{IMDb-stub}} when the IMDb parameter is
      only a search link; report unknown parameter names to errbox_file;
    * otherwise add {{no filmbox}}.

    The page is saved only when one of these steps actually changed
    something.  ParseErr raised by BoxTemplate propagates to the caller; an
    edit conflict on saving is logged and swallowed.
    """
    if not page.canBeEdited() or not page.botMayEdit():
        log(u'Warning: Page [[%s]] is bloked. Skipping. ' % page.title())
        return

    if page.isRedirectPage():
        log(u'Warning: Page [[%s]] is a redirect-page. Skipping' % page.title())
        return

    lp    = LazyPage(page)
    descr = ''

    box_text = lp.getTempl(film_box_templ)

    if box_text:
        box = BoxTemplate(box_text)

        if box.hasParam(year_str):
            year = box.getParam(year_str)

            # 'par' is the text to replace, 'val' the bare four-digit year.
            paterns = [
                r'\s*(?P<par>(?P<val>[0-9]{4}))\s*$',
                r'\s*(?P<par>\[\[(?P<val>[0-9]{4})\]\])\s*$',
                r'\s*(?P<par>\[\[[^\|]+\|(?P<val>[0-9]{4})\]\])\s*$'
            ]

            for pat in paterns:
                new_year = re.match(pat, year)
                if new_year:
                    box.setParam(year_str, year.replace(new_year.group('par'), new_year.group('val')))
                    # Both removals are attempted; neither short-circuits
                    # the other.
                    b = lp.delCat(film_by_alphabet)
                    c = lp.delCat(film_by_year)
                    if b or c:
                        descr = _append_action(descr, u'удалил лишние категории')
                    break

            if lp.changeTempl(film_box_templ, box.tostring()):
                descr = _append_action(descr, u'изменил шаблон фильма')

        if box.hasParam(imdb_str):
            imdb = box.getParam(imdb_str)
            if re.search(r'http://www\.imdb\.com/find\?s=', imdb):
                # Only claim (and save) the addition if the stub template
                # was not on the page already.
                if lp.addTempl('IMDb-stub'):
                    descr = _append_action(descr, u'добавил шаблон {{IMDB-stub}}')

        # Any name outside valid_params is reported.  A plain '>' would be
        # a proper-superset test and miss boxes lacking some valid names.
        unknown = box.Params() - valid_params
        if unknown:
            errbox_file.write(u'# [[%s]]:' % page.title())
            for m in sorted(unknown):
                errbox_file.write(u' %s;' % m)
            errbox_file.write('\n')
    else:
        # Do not save again when the marker template is already present.
        if lp.addTempl(u'no filmbox', None):
            descr = u'робот добавил шаблон {{no filmbox}}'

    if descr == '':
        return

    wikipedia.setAction(descr)

    try:
        page.put(lp.tostring())
    except wikipedia.EditConflict:
        log(u'Error:Edit conflict at saving page [[%s]]. Skipping.' % page.title())


# Title from which the walk through the category starts.
start = "!"

generator = pagegenerators.CategorizedPageGenerator(catlib.Category(site, cat_namespace + ':' + film_by_alphabet), start = start)
generator = pagegenerators.PreloadingGenerator(generator)

# Process every page; a failure on one page is logged and the run continues.
# The output files are closed even if the run is aborted.
try:
    for page in generator:
        try:
            fix_kinopage(page)
        except ParseErr:
            log(u'Error: Filmbox in page [[%s]] is bad-formed. Skipping.' % page.title())
        except (KeyboardInterrupt, SystemExit):
            # Let the operator stop the bot; re-raised explicitly so this
            # also holds where these exceptions derive from Exception.
            raise
        except Exception:
            log(u'Error: Processing of page [[%s]] has failed. Skipping.' % page.title())
finally:
    log_file.close()
    errbox_file.close()