personal_site/markdown_generator/PubsFromBib.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Publications markdown generator for academicpages\n",
    "\n",
    "Takes a set of bibtex of publications and converts them for use with [academicpages.github.io](academicpages.github.io). This is an interactive Jupyter notebook ([see more info here](http://jupyter-notebook-beginner-guide.readthedocs.io/en/latest/what_is_jupyter.html)). \n",
    "\n",
    "The core python code is also in `pubsFromBibs.py`. \n",
    "Run either from the `markdown_generator` folder after replacing updating the publist dictionary with:\n",
    "* bib file names\n",
    "* specific venue keys based on your bib file preferences\n",
    "* any specific pre-text for specific files\n",
    "* Collection Name (future feature)\n",
    "\n",
    "TODO: Make this work with other databases of citations, \n",
    "TODO: Merge this with the existing TSV parsing solution"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pybtex.database.input import bibtex\n",
    "import pybtex.database.input.bibtex \n",
    "from time import strptime\n",
    "import string\n",
    "import html\n",
    "import os\n",
    "import re"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#todo: incorporate different collection types rather than a catch all publications, requires other changes to template\n",
    "publist = {\n",
    "    \"proceeding\": {\n",
    "        \"file\" : \"proceedings.bib\",\n",
    "        \"venuekey\": \"booktitle\",\n",
    "        \"venue-pretext\": \"In the proceedings of \",\n",
    "        \"collection\" : {\"name\":\"publications\",\n",
    "                        \"permalink\":\"/publication/\"}\n",
    "        \n",
    "    },\n",
    "    \"journal\":{\n",
    "        \"file\": \"pubs.bib\",\n",
    "        \"venuekey\" : \"journal\",\n",
    "        \"venue-pretext\" : \"\",\n",
    "        \"collection\" : {\"name\":\"publications\",\n",
    "                        \"permalink\":\"/publication/\"}\n",
    "    } \n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "html_escape_table = {\n",
    "    \"&\": \"&amp;\",\n",
    "    '\"': \"&quot;\",\n",
    "    \"'\": \"&apos;\"\n",
    "    }\n",
    "\n",
    "def html_escape(text):\n",
    "    \"\"\"Produce entities within text.\"\"\"\n",
    "    return \"\".join(html_escape_table.get(c,c) for c in text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "for pubsource in publist:\n",
    "    parser = bibtex.Parser()\n",
    "    bibdata = parser.parse_file(publist[pubsource][\"file\"])\n",
    "\n",
    "    #loop through the individual references in a given bibtex file\n",
    "    for bib_id in bibdata.entries:\n",
    "        #reset default date\n",
    "        pub_year = \"1900\"\n",
    "        pub_month = \"01\"\n",
    "        pub_day = \"01\"\n",
    "        \n",
    "        b = bibdata.entries[bib_id].fields\n",
    "        \n",
    "        try:\n",
    "            pub_year = f'{b[\"year\"]}'\n",
    "\n",
    "            #todo: this hack for month and day needs some cleanup\n",
    "            if \"month\" in b.keys(): \n",
    "                if(len(b[\"month\"])<3):\n",
    "                    pub_month = \"0\"+b[\"month\"]\n",
    "                    pub_month = pub_month[-2:]\n",
    "                elif(b[\"month\"] not in range(12)):\n",
    "                    tmnth = strptime(b[\"month\"][:3],'%b').tm_mon   \n",
    "                    pub_month = \"{:02d}\".format(tmnth) \n",
    "                else:\n",
    "                    pub_month = str(b[\"month\"])\n",
    "            if \"day\" in b.keys(): \n",
    "                pub_day = str(b[\"day\"])\n",
    "\n",
    "                \n",
    "            pub_date = pub_year+\"-\"+pub_month+\"-\"+pub_day\n",
    "            \n",
    "            #strip out {} as needed (some bibtex entries that maintain formatting)\n",
    "            clean_title = b[\"title\"].replace(\"{\", \"\").replace(\"}\",\"\").replace(\"\\\\\",\"\").replace(\" \",\"-\")    \n",
    "\n",
    "            url_slug = re.sub(\"\\\\[.*\\\\]|[^a-zA-Z0-9_-]\", \"\", clean_title)\n",
    "            url_slug = url_slug.replace(\"--\",\"-\")\n",
    "\n",
    "            md_filename = (str(pub_date) + \"-\" + url_slug + \".md\").replace(\"--\",\"-\")\n",
    "            html_filename = (str(pub_date) + \"-\" + url_slug).replace(\"--\",\"-\")\n",
    "\n",
    "            #Build Citation from text\n",
    "            citation = \"\"\n",
    "\n",
    "            #citation authors - todo - add highlighting for primary author?\n",
    "            for author in bibdata.entries[bib_id].persons[\"author\"]:\n",
    "                citation = citation+\" \"+author.first_names[0]+\" \"+author.last_names[0]+\", \"\n",
    "\n",
    "            #citation title\n",
    "            citation = citation + \"\\\"\" + html_escape(b[\"title\"].replace(\"{\", \"\").replace(\"}\",\"\").replace(\"\\\\\",\"\")) + \".\\\"\"\n",
    "\n",
    "            #add venue logic depending on citation type\n",
    "            venue = publist[pubsource][\"venue-pretext\"]+b[publist[pubsource][\"venuekey\"]].replace(\"{\", \"\").replace(\"}\",\"\").replace(\"\\\\\",\"\")\n",
    "\n",
    "            citation = citation + \" \" + html_escape(venue)\n",
    "            citation = citation + \", \" + pub_year + \".\"\n",
    "\n",
    "            \n",
    "            ## YAML variables\n",
    "            md = \"---\\ntitle: \\\"\"   + html_escape(b[\"title\"].replace(\"{\", \"\").replace(\"}\",\"\").replace(\"\\\\\",\"\")) + '\"\\n'\n",
    "            \n",
    "            md += \"\"\"collection: \"\"\" +  publist[pubsource][\"collection\"][\"name\"]\n",
    "\n",
    "            md += \"\"\"\\npermalink: \"\"\" + publist[pubsource][\"collection\"][\"permalink\"]  + html_filename\n",
    "            \n",
    "            note = False\n",
    "            if \"note\" in b.keys():\n",
    "                if len(str(b[\"note\"])) > 5:\n",
    "                    md += \"\\nexcerpt: '\" + html_escape(b[\"note\"]) + \"'\"\n",
    "                    note = True\n",
    "\n",
    "            md += \"\\ndate: \" + str(pub_date) \n",
    "\n",
    "            md += \"\\nvenue: '\" + html_escape(venue) + \"'\"\n",
    "            \n",
    "            url = False\n",
    "            if \"url\" in b.keys():\n",
    "                if len(str(b[\"url\"])) > 5:\n",
    "                    md += \"\\npaperurl: '\" + b[\"url\"] + \"'\"\n",
    "                    url = True\n",
    "\n",
    "            md += \"\\ncitation: '\" + html_escape(citation) + \"'\"\n",
    "\n",
    "            md += \"\\n---\"\n",
    "\n",
    "            \n",
    "            ## Markdown description for individual page\n",
    "            if note:\n",
    "                md += \"\\n\" + html_escape(b[\"note\"]) + \"\\n\"\n",
    "\n",
    "            if url:\n",
    "                md += \"\\n[Access paper here](\" + b[\"url\"] + \"){:target=\\\"_blank\\\"}\\n\" \n",
    "            else:\n",
    "                md += \"\\nUse [Google Scholar](https://scholar.google.com/scholar?q=\"+html.escape(clean_title.replace(\"-\",\"+\"))+\"){:target=\\\"_blank\\\"} for full citation\"\n",
    "\n",
    "            md_filename = os.path.basename(md_filename)\n",
    "\n",
    "            with open(\"../_publications/\" + md_filename, 'w') as f:\n",
    "                f.write(md)\n",
    "            print(f'SUCESSFULLY PARSED {bib_id}: \\\"', b[\"title\"][:60],\"...\"*(len(b['title'])>60),\"\\\"\")\n",
    "        # field may not exist for a reference\n",
    "        except KeyError as e:\n",
    "            print(f'WARNING Missing Expected Field {e} from entry {bib_id}: \\\"', b[\"title\"][:30],\"...\"*(len(b['title'])>30),\"\\\"\")\n",
    "            continue\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}