{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [] }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" } }, "cells": [ { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "tAM5Qjum-7_n", "outputId": "913075ab-e5f3-44e1-b159-8d305b425bb5" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/468.8 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[91m━━━━━\u001b[0m\u001b[90m╺\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m61.4/468.8 kB\u001b[0m \u001b[31m1.6 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[90m╺\u001b[0m\u001b[90m━━━━━\u001b[0m \u001b[32m399.4/468.8 kB\u001b[0m \u001b[31m4.7 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[90m╺\u001b[0m \u001b[32m460.8/468.8 kB\u001b[0m \u001b[31m5.0 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m468.8/468.8 kB\u001b[0m \u001b[31m3.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m8.6/8.6 MB\u001b[0m \u001b[31m11.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25h Preparing metadata (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m207.3/207.3 kB\u001b[0m \u001b[31m9.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.9/6.9 MB\u001b[0m \u001b[31m16.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m83.0/83.0 kB\u001b[0m \u001b[31m1.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m62.7/62.7 kB\u001b[0m \u001b[31m3.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25h Building wheel for paginate-whoosh (setup.py) ... \u001b[?25l\u001b[?25hdone\n", " Building wheel for paginate (setup.py) ... \u001b[?25l\u001b[?25hdone\n", "--2024-07-19 17:18:01-- https://tufts.box.com/shared/static/325sgkodnq30ez61ugazvctif6r24hsu.csv\n", "Resolving tufts.box.com (tufts.box.com)... 74.112.186.144\n", "Connecting to tufts.box.com (tufts.box.com)|74.112.186.144|:443... connected.\n", "HTTP request sent, awaiting response... 301 Moved Permanently\n", "Location: /public/static/325sgkodnq30ez61ugazvctif6r24hsu.csv [following]\n", "--2024-07-19 17:18:01-- https://tufts.box.com/public/static/325sgkodnq30ez61ugazvctif6r24hsu.csv\n", "Reusing existing connection to tufts.box.com:443.\n", "HTTP request sent, awaiting response... 301 Moved Permanently\n", "Location: https://tufts.app.box.com/public/static/325sgkodnq30ez61ugazvctif6r24hsu.csv [following]\n", "--2024-07-19 17:18:01-- https://tufts.app.box.com/public/static/325sgkodnq30ez61ugazvctif6r24hsu.csv\n", "Resolving tufts.app.box.com (tufts.app.box.com)... 74.112.186.144\n", "Connecting to tufts.app.box.com (tufts.app.box.com)|74.112.186.144|:443... connected.\n", "HTTP request sent, awaiting response... 
302 Found\n", "Location: https://public.boxcloud.com/d/1/b1!4Ww4M1AOe5Ieb81DHBK_23rZiXICM9aky6rM99F5iXE5XJOVT2rEpGVrIYCtJ53Wi__34DP5atv7X0A-zBUCIpoxQWn89mNm3mH_2yJdDIvn92GNpDuvrzKhc1XqHyjW98I_MmofTMvgSwFRumxBMCNul2za_BjbZUIAqmWUs2LRLWLM_I33bbWKWxvLhHavdCtjjVBC4mfaBOMW6ecN8JQLhsQa_K4vdH9qfRTHhZtIyZM7ls7NroXobqhR68J3dyYMbhdxmJTp0R2nTWfxQ3cF79foPtIQTFsf3vMx8owE7-8JB23EIBSrm0uF9v5BWMICLMShTyaK8-qom6zb-wUteiNTry_lidw5LsovU1Dytw6QC98iL3FBl4l9e5qBfoLZKPt7ic7BsOCOLFPud5BNjTadWyoAJz-H6T-HtrRtSl4Sr9Q1K_7rbLEgYI88GOZDqseJreav2ux5Y9FfpCtKBi8uj3LePH4ulYu2pXWiFtxAnM3qQy540jHRPEZTfB1t4sKRQOw0MngoaNAsrY0OlKPFvLMLaV2ZuhLA2FQPH3DEHDvuVz6WR7StQt7ZDB1DVifcvNmbC11Ly3pJ98mFhl-Kcwy9MateF7iY9FGJHNZ5BGkpxAT0i5YRXjCqWxwoBxChRNs9f1ywEzREM0lbUTiyXTlEUegbW7A-wpbNNvOQja6EBvhw9CVMlei0Vn8EKfyP-xRboHnZZGrCHUs1Iybsi9mcjM3bIthsqJIgZGoKOVdgSql9-e3TqIqJo5PPmuT9FIeAxUJNS-tTe39P0WGPkM1W22CO6giOQZJQzmMrVK9r5DJIk656jbx7m1jKCuhOtQf3U27GJDe3ABCa_KZ7Oy5IohGHvu6Leux7mHqvrlPVJh_LNr9ahXRhuogZnzB2i3PgVi1ziMshZYkngtqFZgbvk7eBzPmls6iasCFaQFHe_wn6X5kwolsFINjF0SwbR9K3KWNzMiH_ee8AOZnp3RxAefpFYURTT77TrEmSMF_rjGHVFeQOzVvGNyYO3e3i3XzPAWhsyDq5bXXWWHgTdYmjLfXDHR-qCwpBE7P35zq_Rrr5U9ChX-SjejIJQzBQKGYjjCLH5uDrMM1gnQGndMTbuqgGI6kbDqxXSkFI9Cd-h7OzYDWczTi09611axN7AQyGi8DLeZK8fgtVf0k-xBDi8rjtomILB1q4os5VmIjoz6pbwvhaPCZ1bJuyYRvJVwKvabgEsq-a0Iwy8KMvxJrRmmSfIcJ_EUKNug1hqf_M3bjVB4L2mkv45ON7cMrjEq8hB_Otwalv7qDiSIwryS2bOhaxAD4P-7MhOFj0a3umG-PuZr0frJEXn0HOp8PmjRtZ5gOu9m_ER-2f-Sj3gAgaV113R3MBhO_L_g../download [following]\n", "--2024-07-19 17:18:02-- 
https://public.boxcloud.com/d/1/b1!4Ww4M1AOe5Ieb81DHBK_23rZiXICM9aky6rM99F5iXE5XJOVT2rEpGVrIYCtJ53Wi__34DP5atv7X0A-zBUCIpoxQWn89mNm3mH_2yJdDIvn92GNpDuvrzKhc1XqHyjW98I_MmofTMvgSwFRumxBMCNul2za_BjbZUIAqmWUs2LRLWLM_I33bbWKWxvLhHavdCtjjVBC4mfaBOMW6ecN8JQLhsQa_K4vdH9qfRTHhZtIyZM7ls7NroXobqhR68J3dyYMbhdxmJTp0R2nTWfxQ3cF79foPtIQTFsf3vMx8owE7-8JB23EIBSrm0uF9v5BWMICLMShTyaK8-qom6zb-wUteiNTry_lidw5LsovU1Dytw6QC98iL3FBl4l9e5qBfoLZKPt7ic7BsOCOLFPud5BNjTadWyoAJz-H6T-HtrRtSl4Sr9Q1K_7rbLEgYI88GOZDqseJreav2ux5Y9FfpCtKBi8uj3LePH4ulYu2pXWiFtxAnM3qQy540jHRPEZTfB1t4sKRQOw0MngoaNAsrY0OlKPFvLMLaV2ZuhLA2FQPH3DEHDvuVz6WR7StQt7ZDB1DVifcvNmbC11Ly3pJ98mFhl-Kcwy9MateF7iY9FGJHNZ5BGkpxAT0i5YRXjCqWxwoBxChRNs9f1ywEzREM0lbUTiyXTlEUegbW7A-wpbNNvOQja6EBvhw9CVMlei0Vn8EKfyP-xRboHnZZGrCHUs1Iybsi9mcjM3bIthsqJIgZGoKOVdgSql9-e3TqIqJo5PPmuT9FIeAxUJNS-tTe39P0WGPkM1W22CO6giOQZJQzmMrVK9r5DJIk656jbx7m1jKCuhOtQf3U27GJDe3ABCa_KZ7Oy5IohGHvu6Leux7mHqvrlPVJh_LNr9ahXRhuogZnzB2i3PgVi1ziMshZYkngtqFZgbvk7eBzPmls6iasCFaQFHe_wn6X5kwolsFINjF0SwbR9K3KWNzMiH_ee8AOZnp3RxAefpFYURTT77TrEmSMF_rjGHVFeQOzVvGNyYO3e3i3XzPAWhsyDq5bXXWWHgTdYmjLfXDHR-qCwpBE7P35zq_Rrr5U9ChX-SjejIJQzBQKGYjjCLH5uDrMM1gnQGndMTbuqgGI6kbDqxXSkFI9Cd-h7OzYDWczTi09611axN7AQyGi8DLeZK8fgtVf0k-xBDi8rjtomILB1q4os5VmIjoz6pbwvhaPCZ1bJuyYRvJVwKvabgEsq-a0Iwy8KMvxJrRmmSfIcJ_EUKNug1hqf_M3bjVB4L2mkv45ON7cMrjEq8hB_Otwalv7qDiSIwryS2bOhaxAD4P-7MhOFj0a3umG-PuZr0frJEXn0HOp8PmjRtZ5gOu9m_ER-2f-Sj3gAgaV113R3MBhO_L_g../download\n", "Resolving public.boxcloud.com (public.boxcloud.com)... 74.112.186.130\n", "Connecting to public.boxcloud.com (public.boxcloud.com)|74.112.186.130|:443... connected.\n", "HTTP request sent, awaiting response... 
200 OK\n", "Length: 4643634 (4.4M) [text/csv]\n", "Saving to: ‘daf.csv’\n", "\n", "daf.csv 100%[===================>] 4.43M --.-KB/s in 0.05s \n", "\n", "2024-07-19 17:18:02 (81.4 MB/s) - ‘daf.csv’ saved [4643634/4643634]\n", "\n" ] } ], "source": [ "!pip install whoosh paginate-whoosh streamlit -Uq\n", "!wget https://tufts.box.com/shared/static/325sgkodnq30ez61ugazvctif6r24hsu.csv -O daf.csv" ] }, { "cell_type": "markdown", "source": [ "# Creating a Search Engine for your own data using `Whoosh`\n", "\n", "No matter the discipline, scholars tend to accumulate a vast array of textual sources. Regardless of whether these are primary or secondary sources, researchers often need help wading through these sources and finding places where it's best to start digging into the text.\n", "\n", "In this notebook, we'll explore how to create and customize your own search engine so that you can easily and quickly search through your data. We will be using the Python library `Whoosh`, which implements indexing, complex logical queries and searching in pure Python, meaning that it doesn't require a compiler or Java. `Whoosh` is not a search engine itself, but rather a library that allows users to develop their own search engine. " ], "metadata": { "id": "1dGo_JyX_CXL" } }, { "cell_type": "markdown", "source": [ "## Setting up the data\n", "\n", "In this section I'll download some data and put it into a form that is easy for `Whoosh` to index. In this notebook, we'll be searching through Edward Gibbon's *Decline and Fall of the Roman Empire*, a notoriously long and difficult book about the history of Europe from ~200 to ~1400 CE." 
], "metadata": { "id": "rAG6znNYA_da" } }, { "cell_type": "code", "source": [ "import pandas as pd\n", "daf = pd.read_csv('daf.csv')[['title','text']]\n", "daf" ], "metadata": { "id": "Rshq8oz3_B6f", "colab": { "base_uri": "https://localhost:8080/", "height": 424 }, "outputId": "e53bed05-6ee7-47db-b324-233406194cc5" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " title \\\n", "0 The Extent Of The Empire In The Age Of The Ant... \n", "1 The Extent Of The Empire In The Age Of The Ant... \n", "2 The Extent Of The Empire In The Age Of The Ant... \n", "3 The Internal Prosperity In The Age Of The Anto... \n", "4 The Internal Prosperity In The Age Of The Anto... \n", ".. ... \n", "291 Final Settlement Of The Ecclesiastical State.—... \n", "292 Final Settlement Of The Ecclesiastical State.—... \n", "293 Final Settlement Of The Ecclesiastical State.—... \n", "294 Prospect Of The Ruins Of Rome In The Fifteenth... \n", "295 Prospect Of The Ruins Of Rome In The Fifteenth... \n", "\n", " text \n", "0 Introduction. The Extent And Military Fo... \n", "1 It was an ancient tradition, that when the Cap... \n", "2 The camp of a Roman legion presented the appea... \n", "3 Of The Union And Internal Prosperity Of The Ro... \n", "4 Till the privileges of Romans had been progres... \n", ".. ... \n", "291 Never perhaps has the energy and effect of a s... \n", "292 Without drawing his sword, count Pepin restore... \n", "293 The royal prerogative of coining money, which ... \n", "294 Prospect Of The Ruins Of Rome In The Fifteenth... \n", "295 These general observations may be separately a... \n", "\n", "[296 rows x 2 columns]" ], "text/html": [ "\n", "
\n", " | title | \n", "text | \n", "
---|---|---|
0 | \n", "The Extent Of The Empire In The Age Of The Ant... | \n", "Introduction. The Extent And Military Fo... | \n", "
1 | \n", "The Extent Of The Empire In The Age Of The Ant... | \n", "It was an ancient tradition, that when the Cap... | \n", "
2 | \n", "The Extent Of The Empire In The Age Of The Ant... | \n", "The camp of a Roman legion presented the appea... | \n", "
3 | \n", "The Internal Prosperity In The Age Of The Anto... | \n", "Of The Union And Internal Prosperity Of The Ro... | \n", "
4 | \n", "The Internal Prosperity In The Age Of The Anto... | \n", "Till the privileges of Romans had been progres... | \n", "
... | \n", "... | \n", "... | \n", "
291 | \n", "Final Settlement Of The Ecclesiastical State.—... | \n", "Never perhaps has the energy and effect of a s... | \n", "
292 | \n", "Final Settlement Of The Ecclesiastical State.—... | \n", "Without drawing his sword, count Pepin restore... | \n", "
293 | \n", "Final Settlement Of The Ecclesiastical State.—... | \n", "The royal prerogative of coining money, which ... | \n", "
294 | \n", "Prospect Of The Ruins Of Rome In The Fifteenth... | \n", "Prospect Of The Ruins Of Rome In The Fifteenth... | \n", "
295 | \n", "Prospect Of The Ruins Of Rome In The Fifteenth... | \n", "These general observations may be separately a... | \n", "
296 rows × 2 columns
\n", "{hit}
\n", "