Меню

Главная
Случайная статья
Настройки
Участник:Wikisaurus/Данные/template inclusion examples.ipynb
Материал из https://ru.wikipedia.org

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Страниц: 19277\n"
     ]
    }
   ],
   "source": [
    "# Load the page titles from the local list file, one title per line.\n",
    "with open('template inclusion names.txt', 'r', encoding='utf8') as file:\n",
    "    # Strip surrounding whitespace (including the trailing newline) lazily,\n",
    "    # line by line, without shadowing the builtin `str`.\n",
    "    stripped_lines = (line.strip() for line in file)\n",
    "    # Blank lines would otherwise end up in the list as empty page titles.\n",
    "    page_names = [name for name in stripped_lines if name]\n",
    "print('Страниц:', len(page_names))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Случайная выборка: 500\n",
      "Начало: ['(132524) APL', '(1941) Вильд', '(279) Туле']\n"
     ]
    }
   ],
   "source": [
    "# Draw a random sample of page titles and keep it in sorted order.\n",
    "import random\n",
    "SAMPLE_SIZE = 500  # maximum number of pages to draw\n",
    "SAMPLE_SEED = 0    # fixed seed so that re-running the cell selects the same pages; change it for a new sample\n",
    "# random.sample raises ValueError when asked for more items than the list\n",
    "# holds, so cap the request at the number of available titles.\n",
    "sample_size = min(SAMPLE_SIZE, len(page_names))\n",
    "# A private Random instance keeps the seeding from touching the global generator.\n",
    "page_names_sample = sorted(random.Random(SAMPLE_SEED).sample(page_names, sample_size))\n",
    "print('Случайная выборка:', len(page_names_sample))\n",
    "print('Начало:', page_names_sample[:3])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Страниц сохранено:  500\n"
     ]
    }
   ],
   "source": [
    "# Download the raw wikitext of every sampled page from ru.wikipedia.org.\n",
    "# The submodules are imported explicitly: a bare `import urllib` or\n",
    "# `import IPython` does not by itself guarantee that `urllib.parse`,\n",
    "# `urllib.request` or `IPython.display` are loaded.\n",
    "import urllib.parse\n",
    "import urllib.request\n",
    "import IPython.display\n",
    "REQUEST_TIMEOUT = 30  # seconds; keeps one stalled request from hanging the whole loop\n",
    "# Pre-fill with empty strings so `pages` always lines up index-by-index with page_names_sample.\n",
    "pages = [''] * len(page_names_sample)\n",
    "for i, page_name in enumerate(page_names_sample):\n",
    "    # Percent-encode the title so that spaces and non-ASCII characters are valid in the URL.\n",
    "    url = 'https://ru.wikipedia.org/w/index.php?action=raw&title=' + urllib.parse.quote(page_name)\n",
    "    # The context manager closes the HTTP response even if reading or decoding fails.\n",
    "    with urllib.request.urlopen(url, timeout=REQUEST_TIMEOUT) as response:\n",
    "        pages[i] = response.read().decode('utf-8')\n",
    "    # wait=True delays clearing until the next output arrives, so the counter does not flicker.\n",
    "    IPython.display.clear_output(wait=True)\n",
    "    print('Страниц сохранено: ', i + 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Debugging aid: uncomment the next line to display one element of `pages`.\n",
    "#pages[1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Выделено для всех 500 страниц\n"
     ]
    }
   ],
   "source": [
    "# Build one wiki-table row per downloaded page from the first citation template found in it.\n",
    "import re #''\n",
    "# Template-name alternations that can be put into the pattern below:\n",
    "#   press[ _]release|press\n",
    "#   journal|magazine|paper\n",
    "# Group 1 captures the template's parameter list including its leading '|';\n",
    "# one level of nested {{...}} is allowed inside the parameters.\n",
    "# NOTE(review): because group 1 starts with '|', the rows below contain\n",
    "# '{{cite journal||...' with a doubled pipe - confirm that this is intended.\n",
    "regex = re.compile(r'\\{\\{ *[Cc]ite[ _](?:journal|magazine|paper)\\s*(\\|([^\\{\\}]|\\{\\{[^\\{\\}]*\\}\\})*)\\}\\}', re.MULTILINE) #\\}\\}\n",
    "results = ['' for page in pages]\n",
    "unmatched = []  # indices of pages in which the pattern was not found\n",
    "for i, page in enumerate(pages):\n",
    "    match = regex.search(page)\n",
    "    if match is None:\n",
    "        # regex.search returns None when nothing matches; leave the row empty\n",
    "        # and remember the page instead of failing on match.group.\n",
    "        unmatched.append(i)\n",
    "        continue\n",
    "    part = match.group(1)\n",
    "    # Row layout: number, article title, then the same parameters rendered\n",
    "    # through the current template and through its sandbox version.\n",
    "    results[i] = (\n",
    "\"\"\"|-\n",
    "| №%d\n",
    "| %s\n",
    "| {{#invoke:Песочница/Wikisaurus/getAnchor|main|{{cite journal|%s}}}}\n",
    "| {{#invoke:Песочница/Wikisaurus/getAnchor|main|{{cite journal/песочница|%s}}}}\n",
    "\"\"\") % (i + 1, page_names_sample[i], part, part)\n",
    "if unmatched:\n",
    "    print('Шаблон не найден на страницах:', [page_names_sample[idx] for idx in unmatched])\n",
    "    print('Выделено для', len(results) - len(unmatched), 'из', len(results), 'страниц')\n",
    "else:\n",
    "    print('Выделено для всех', len(results), 'страниц')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show the first page that has no table row (its entry in `results` is still the\n",
    "# empty string). The list is searched directly, so the cell does not depend on\n",
    "# whatever value a loop index happens to hold in the kernel.\n",
    "failed_index = next((idx for idx, row in enumerate(results) if row == ''), None)\n",
    "if failed_index is not None:\n",
    "    print('Ошибка на странице', failed_index)\n",
    "    print(pages[failed_index])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Сохранено в template inclusion sample.txt\n"
     ]
    }
   ],
   "source": [
    "# Write the assembled wiki table to a text file, never overwriting an existing one.\n",
    "import os\n",
    "filename = 'template inclusion sample.txt'\n",
    "# Opening markup and column headings of the table.\n",
    "table_header = \"\"\"{| class=\"standard\"\n",
    "!№\n",
    "!Статья\n",
    "!style=\"width:45%\"| Старое оформление\n",
    "!style=\"width:45%\"| Новое оформление\n",
    "\"\"\"\n",
    "# Closing markup of the table.\n",
    "table_footer = \"\"\"|}\"\"\"\n",
    "if os.path.exists(filename):\n",
    "    # An earlier export is present: leave it untouched and only report it.\n",
    "    print('Не сохранено: файл', filename, 'уже существует')\n",
    "else:\n",
    "    with open(filename, 'w', encoding='utf8') as file:\n",
    "        # Header, all generated rows in order, then the footer.\n",
    "        file.write(table_header + ''.join(results) + table_footer)\n",
    "    print('Сохранено в', filename)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
Downgrade Counter