231 lines
6.0 KiB
Plaintext
231 lines
6.0 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "946b7d7c-1e3a-4421-83ac-48c77a022c18",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Practical-1.1\n",
|
|
"\n",
|
|
"Problem Statement: Write a program to Compute Similarity between two text documents.\n",
|
|
"\n",
|
|
"Code from InformationRetrieval (SPPU - Final Year - Computer Engineering - Content) repository on KSKA Git: https://git.kska.io/sppu-be-comp-content/InformationRetrieval/\n",
|
|
"\n",
|
|
"---"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 29,
|
|
"id": "efe12052-a191-4760-9a75-a08d82b3d334",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Import libraries\n",
|
|
"from nltk.corpus import stopwords\n",
|
|
"from nltk.tokenize import word_tokenize\n",
|
|
"import numpy as np\n",
|
|
"import nltk"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 21,
|
|
"id": "c8efc1cd-5732-4853-8c92-a03b92ccb9af",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"[nltk_data] Downloading package punkt to /home/nonroot/nltk_data...\n",
|
|
"[nltk_data] Package punkt is already up-to-date!\n",
|
|
"[nltk_data] Downloading package stopwords to\n",
|
|
"[nltk_data] /home/nonroot/nltk_data...\n",
|
|
"[nltk_data] Package stopwords is already up-to-date!\n",
|
|
"[nltk_data] Downloading package punkt_tab to\n",
|
|
"[nltk_data] /home/nonroot/nltk_data...\n",
|
|
"[nltk_data] Package punkt_tab is already up-to-date!\n"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"True"
|
|
]
|
|
},
|
|
"execution_count": 21,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"# Download necessary NLTK data\n",
|
|
"nltk.download(\"punkt\")\n",
|
|
"nltk.download(\"stopwords\")\n",
|
|
"nltk.download('punkt_tab')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 34,
|
|
"id": "fe3bdfe7-91bd-4fcc-96d8-57fcf173605c",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Contents of text1.txt:\n",
|
|
"This is a sample document. It contains text for testing the similarity.\n",
|
|
"\n",
|
|
"\n",
|
|
"Contents of text2.txt:\n",
|
|
"This document is a sample. It includes text to test the similarity.\n",
|
|
"\n",
|
|
"\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Print contents of the two documents\n",
|
|
"def print_file_content(file):\n",
|
|
" with open(file, 'r') as f:\n",
|
|
" content = f.read()\n",
|
|
" print(content)\n",
|
|
"\n",
|
|
"print(\"Contents of text1.txt:\")\n",
|
|
"print_file_content(\"text1.txt\")\n",
|
|
"print(\"Contents of text2.txt:\")\n",
|
|
"print_file_content(\"text2.txt\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 35,
|
|
"id": "d9e3ad9f-3b5b-4e2d-a62f-6dce24484392",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def process(file):\n",
|
|
" # Read the file\n",
|
|
" raw = open(file).read()\n",
|
|
" \n",
|
|
" # Tokenize the raw text\n",
|
|
" tokens = word_tokenize(raw)\n",
|
|
" words = [w.lower() for w in tokens]\n",
|
|
" \n",
|
|
" # Stem the tokens\n",
|
|
" porter = nltk.PorterStemmer()\n",
|
|
" stemmed_tokens = [porter.stem(t) for t in words]\n",
|
|
"\n",
|
|
" # Removing stop words\n",
|
|
" stop_words = set(stopwords.words('english'))\n",
|
|
" filtered_tokens = [w for w in stemmed_tokens if w not in stop_words]\n",
|
|
" \n",
|
|
" # Count words\n",
|
|
" count = nltk.defaultdict(int)\n",
|
|
" for word in filtered_tokens:\n",
|
|
" count[word] += 1\n",
|
|
" \n",
|
|
" return count"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 36,
|
|
"id": "56f17214-bc46-4eaf-aeed-ce387212c9b1",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def cos_sim(a, b):\n",
|
|
" dot_product = np.dot(a, b)\n",
|
|
" norm_a = np.linalg.norm(a)\n",
|
|
" norm_b = np.linalg.norm(b)\n",
|
|
" \n",
|
|
" return dot_product / (norm_a * norm_b)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 37,
|
|
"id": "6c1c3993-9909-4cb7-aaa5-a69714667afd",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def getSimilarity(dict1, dict2):\n",
|
|
" all_words_list = []\n",
|
|
" \n",
|
|
" # Collect all unique words from both dictionaries\n",
|
|
" for key in dict1:\n",
|
|
" all_words_list.append(key)\n",
|
|
" \n",
|
|
" for key in dict2:\n",
|
|
" all_words_list.append(key)\n",
|
|
" \n",
|
|
" all_words_list_size = len(all_words_list)\n",
|
|
" v1 = np.zeros(all_words_list_size, dtype=int) # Changed np.int to int\n",
|
|
" v2 = np.zeros(all_words_list_size, dtype=int) # Changed np.int to int\n",
|
|
" \n",
|
|
" # Create vectors for the dictionaries\n",
|
|
" for i, key in enumerate(all_words_list):\n",
|
|
" v1[i] = dict1.get(key, 0)\n",
|
|
" v2[i] = dict2.get(key, 0)\n",
|
|
" \n",
|
|
" return cos_sim(v1, v2)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 38,
|
|
"id": "13f1e8f5-c8a1-4415-8901-641aa0e2cb5b",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Similarity between two text documents: 0.9523809523809523\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"if __name__ == '__main__':\n",
|
|
" dict1 = process(\"text1.txt\")\n",
|
|
" dict2 = process(\"text2.txt\")\n",
|
|
" \n",
|
|
" print(\"Similarity between two text documents:\", getSimilarity(dict1, dict2))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "a32301be-d57c-4892-b0b3-094a05f61f9a",
|
|
"metadata": {},
|
|
"source": [
|
|
"---"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3 (ipykernel)",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.8.20"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|