Меню
Главная
Случайная статья
Настройки
|
{
"cells": [
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Страниц: 19277\n"
]
}
],
"source": [
"# названия страниц из файла\n",
"with open('template inclusion names.txt', 'r', encoding='utf8') as file:\n",
" page_names = file.readlines()\n",
"page_names = [str.strip() for str in page_names]\n",
"print('Страниц:', len(page_names))"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Случайная выборка: 500\n",
"Начало: ['(132524) APL', '(1941) Вильд', '(279) Туле']\n"
]
}
],
"source": [
"# случайная выборка\n",
"import random\n",
"page_names_sample = sorted(random.sample(page_names, 500))\n",
"print('Случайная выборка:', len(page_names_sample))\n",
"print('Начало:', page_names_sample[:3])"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Страниц сохранено: 500\n"
]
}
],
"source": [
"# код включений из интернета\n",
"import urllib\n",
"import IPython\n",
"pages = ['' for i in page_names_sample]\n",
"for i, page_name in enumerate(page_names_sample):\n",
" url = 'https://ru.wikipedia.org/w/index.php?action=raw&title=' + urllib.parse.quote(page_name)\n",
" wiki = urllib.request.urlopen(url).read().decode('utf-8')\n",
" pages[i] = wiki\n",
" IPython.display.clear_output()\n",
" print('Страниц сохранено: ', i + 1)"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"#pages[1]"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Выделено для всех 500 страниц\n"
]
}
],
"source": [
"# сформировать строку таблицы из кода страницы\n",
"import re #''\n",
"# press[ _]release|press\n",
"# journal|magazine|paper\n",
"regex = re.compile(r'\\{\\{ *[Cc]ite[ _](?:journal|magazine|paper)\\s*(\\|([^\\{\\}]|\\{\\{[^\\{\\}]*\\}\\})*)\\}\\}', re.MULTILINE) #\\}\\}\n",
"results = ['' for page in pages]\n",
"for i, page in enumerate(pages):\n",
" match = regex.search(page)\n",
" part = match.group(1) # может выдать ошибку\n",
" results[i] = (\n",
"\"\"\"|-\n",
"| №%d\n",
"| %s\n",
"| {{#invoke:Песочница/Wikisaurus/getAnchor|main|{{cite journal|%s}}}}\n",
"| {{#invoke:Песочница/Wikisaurus/getAnchor|main|{{cite journal/песочница|%s}}}}\n",
"\"\"\") % (i + 1, page_names_sample[i], part, part)\n",
"print('Выделено для всех', len(results), 'страниц')"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"if results[i] == '':\n",
" print('Ошибка на странице', i)\n",
" print(pages[i])"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Сохранено в template inclusion sample.txt\n"
]
}
],
"source": [
"# сохранить в файл\n",
"import os\n",
"filename = 'template inclusion sample.txt'\n",
"if not os.path.exists(filename):\n",
" with open(filename, 'w', encoding='utf8') as file:\n",
" file.write(\n",
"\"\"\"{| class=\"standard\"\n",
"!№\n",
"!Статья\n",
"!style=\"width:45%\"| Старое оформление\n",
"!style=\"width:45%\"| Новое оформление\n",
"\"\"\" +\n",
"''.join(results) + \n",
"\"\"\"|}\"\"\")\n",
" print('Сохранено в', filename)\n",
"else:\n",
" print('Не сохранено: файл', filename, 'уже существует')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
|
|