Files
InformationRetrieval/Notebooks/Code-1.1/Code-1.1 (Document Similarity).ipynb
T

231 lines
6.0 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"id": "946b7d7c-1e3a-4421-83ac-48c77a022c18",
"metadata": {},
"source": [
"# Practical-1.1\n",
"\n",
"Problem Statement: Write a program to compute the similarity between two text documents.\n",
"\n",
"Code from InformationRetrieval (SPPU - Final Year - Computer Engineering - Content) repository on KSKA Git: https://git.kska.io/sppu-be-comp-content/InformationRetrieval/\n",
"\n",
"---"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "efe12052-a191-4760-9a75-a08d82b3d334",
"metadata": {},
"outputs": [],
"source": [
"# Import libraries\n",
"from collections import Counter\n",
"\n",
"import nltk\n",
"import numpy as np\n",
"from nltk.corpus import stopwords\n",
"from nltk.tokenize import word_tokenize"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "c8efc1cd-5732-4853-8c92-a03b92ccb9af",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt to /home/nonroot/nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n",
"[nltk_data] Downloading package stopwords to\n",
"[nltk_data] /home/nonroot/nltk_data...\n",
"[nltk_data] Package stopwords is already up-to-date!\n",
"[nltk_data] Downloading package punkt_tab to\n",
"[nltk_data] /home/nonroot/nltk_data...\n",
"[nltk_data] Package punkt_tab is already up-to-date!\n"
]
},
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Download the NLTK resources this notebook relies on\n",
"for resource in (\"punkt\", \"stopwords\"):\n",
"    nltk.download(resource)\n",
"\n",
"# Kept as the cell's final expression so its result is still displayed\n",
"nltk.download('punkt_tab')"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "fe3bdfe7-91bd-4fcc-96d8-57fcf173605c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Contents of text1.txt:\n",
"This is a sample document. It contains text for testing the similarity.\n",
"\n",
"\n",
"Contents of text2.txt:\n",
"This document is a sample. It includes text to test the similarity.\n",
"\n",
"\n"
]
}
],
"source": [
"# Print contents of the two documents\n",
"def print_file_content(file):\n",
"    \"\"\"Print the entire text of the file at path ``file``.\"\"\"\n",
"    with open(file, 'r') as handle:\n",
"        print(handle.read())\n",
"\n",
"# Show each input document under a heading naming its file\n",
"for doc_path in (\"text1.txt\", \"text2.txt\"):\n",
"    print(f\"Contents of {doc_path}:\")\n",
"    print_file_content(doc_path)"
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "d9e3ad9f-3b5b-4e2d-a62f-6dce24484392",
"metadata": {},
"outputs": [],
"source": [
"def process(file):\n",
"    \"\"\"Build a term-frequency table for the text file at ``file``.\n",
"\n",
"    The text is tokenized and lower-cased; English stop words and\n",
"    punctuation-only tokens are dropped; the remaining words are\n",
"    Porter-stemmed and counted.\n",
"\n",
"    Args:\n",
"        file: Path of the text file to read.\n",
"\n",
"    Returns:\n",
"        A ``Counter`` mapping each stemmed term to its occurrence count.\n",
"    \"\"\"\n",
"    # Read the file; the context manager closes the handle even on error\n",
"    with open(file) as f:\n",
"        raw = f.read()\n",
"\n",
"    # Tokenize the raw text and normalize case\n",
"    words = [token.lower() for token in word_tokenize(raw)]\n",
"\n",
"    # Remove stop words BEFORE stemming: the stop-word list holds unstemmed\n",
"    # words, so stemmed forms such as 'thi' (from 'this') would slip through.\n",
"    # Tokens without any letter or digit (e.g. '.') are punctuation, not terms.\n",
"    stop_words = set(stopwords.words('english'))\n",
"    content_words = [\n",
"        w for w in words\n",
"        if w not in stop_words and any(ch.isalnum() for ch in w)\n",
"    ]\n",
"\n",
"    # Stem the remaining words and count them\n",
"    porter = nltk.PorterStemmer()\n",
"    return Counter(porter.stem(w) for w in content_words)"
]
},
{
"cell_type": "code",
"execution_count": 36,
"id": "56f17214-bc46-4eaf-aeed-ce387212c9b1",
"metadata": {},
"outputs": [],
"source": [
"def cos_sim(a, b):\n",
"    \"\"\"Return the cosine similarity between vectors ``a`` and ``b``.\n",
"\n",
"    Args:\n",
"        a: Numeric vector (intended for 1-D arrays).\n",
"        b: Numeric vector with the same length as ``a``.\n",
"\n",
"    Returns:\n",
"        ``dot(a, b) / (norm(a) * norm(b))``, or 0.0 when either vector has\n",
"        zero magnitude (the cosine is undefined there and the division\n",
"        would otherwise produce NaN).\n",
"    \"\"\"\n",
"    norm_product = np.linalg.norm(a) * np.linalg.norm(b)\n",
"\n",
"    # Guard the division: a zero (or empty) vector has no direction\n",
"    if norm_product == 0:\n",
"        return 0.0\n",
"\n",
"    return np.dot(a, b) / norm_product"
]
},
{
"cell_type": "code",
"execution_count": 37,
"id": "6c1c3993-9909-4cb7-aaa5-a69714667afd",
"metadata": {},
"outputs": [],
"source": [
"def getSimilarity(dict1, dict2):\n",
"    \"\"\"Return the cosine similarity of two term-frequency mappings.\n",
"\n",
"    Args:\n",
"        dict1: Mapping of term -> occurrence count for the first document.\n",
"        dict2: Mapping of term -> occurrence count for the second document.\n",
"\n",
"    Returns:\n",
"        The cosine similarity of the two count vectors, or 0.0 when either\n",
"        mapping is empty.\n",
"    \"\"\"\n",
"    # An empty document yields a zero vector, whose cosine is undefined\n",
"    if not dict1 or not dict2:\n",
"        return 0.0\n",
"\n",
"    # Shared vocabulary with exactly one dimension per term. Merely\n",
"    # concatenating both key lists would give every term present in both\n",
"    # documents two dimensions, double-weighting it and inflating the score.\n",
"    # dict.fromkeys de-duplicates while keeping first-seen order.\n",
"    vocabulary = list(dict.fromkeys([*dict1, *dict2]))\n",
"\n",
"    # Create vectors for the dictionaries\n",
"    v1 = np.array([dict1.get(term, 0) for term in vocabulary], dtype=int)\n",
"    v2 = np.array([dict2.get(term, 0) for term in vocabulary], dtype=int)\n",
"\n",
"    return cos_sim(v1, v2)"
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "13f1e8f5-c8a1-4415-8901-641aa0e2cb5b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Similarity between two text documents: 0.9523809523809523\n"
]
}
],
"source": [
"if __name__ == '__main__':\n",
"    # Term-frequency tables for the two documents being compared\n",
"    dict1, dict2 = process(\"text1.txt\"), process(\"text2.txt\")\n",
"\n",
"    similarity = getSimilarity(dict1, dict2)\n",
"    print(\"Similarity between two text documents:\", similarity)"
]
},
{
"cell_type": "markdown",
"id": "a32301be-d57c-4892-b0b3-094a05f61f9a",
"metadata": {},
"source": [
"---"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.20"
}
},
"nbformat": 4,
"nbformat_minor": 5
}