Compare commits

...

17 Commits

Author SHA1 Message Date
notkshitij cca0f46476 Upload end-sem pyq for IR, november-december 2025. Provided by Ayush Kalaskar. 2026-03-22 02:12:40 +05:30
notkshitij 1190444a91 Added may june 2025 IR pyq. Provided by Afan Shaikh. 2025-12-07 22:57:13 +05:30
notkshitij 4538044099 Added end-sem pyq answers for unit 6. Collaborative work by Ayush Kalaskar and Himanshu Patil. 2025-12-02 23:40:00 +05:30
notkshitij e635221d58 Added end-sem pyq answers for unit 5. Collaborative work by Ayush Kalaskar and Himanshu Patil. 2025-12-02 23:15:51 +05:30
notkshitij 29826dad3d Added end-sem pyq answers link. 2025-12-02 15:10:31 +05:30
notkshitij 5283defd66 Added end-sem pyq answers for unit 4. Collaborative work by Ayush Kalaskar and Himanshu Patil. 2025-12-02 15:09:54 +05:30
notkshitij 1f2742dcec Added end-sem pyq answers. Collaborative work by Ayush Kalaskar and Himanshu Patil. 2025-12-02 14:27:13 +05:30
notkshitij c1f113c632 Added softcopy for hadoop practical. 2025-10-12 23:47:31 +05:30
notkshitij 658a087c64 Added code for hadoop. 2025-10-12 23:38:15 +05:30
notkshitij 9038747b35 Added links to all the notebooks 2025-10-12 22:57:35 +05:30
notkshitij 68587b84e0 Added jupyter notebooks for 1.1, 1.2, 1.3. 2025-10-12 22:55:54 +05:30
notkshitij a4f5326402 Added links to all the codes in readme 2025-10-12 22:54:43 +05:30
notkshitij 4c84c01a65 Added codes 1.1, 1.2, 1.3 and 1.5 2025-10-12 22:51:57 +05:30
notkshitij 0ac0a2859b Added handout and write-up for practical 1.4. Write up by Salvi. 2025-10-12 22:51:30 +05:30
notkshitij 3a5cbc4b22 Added softcopies for practical 1.1, 1.2, 1.3 and 1.5 2025-10-12 22:36:12 +05:30
notkshitij 7ea5113352 Added write-up for practical 1.5. 2025-09-29 23:18:28 +05:30
notkshitij 69799f1c1b Added formatted handouts for all practical. 2025-09-29 23:18:05 +05:30
32 changed files with 1340 additions and 0 deletions
+66
View File
@@ -0,0 +1,66 @@
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import numpy as np
import nltk
# Download the NLTK data this script relies on: the tokenizer models and the
# stop-word lists. "punkt_tab" is fetched as well because newer NLTK releases
# load the word_tokenize() tables from that package instead of "punkt".
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("punkt_tab")
def process(file):
    """Build a term-frequency table for the text file at ``file``.

    The text is tokenized with ``word_tokenize``, lower-cased, stripped of
    English stop words and then reduced to Porter stems.

    Args:
        file: Path of the text file to read.

    Returns:
        A ``defaultdict(int)`` mapping each stemmed token to the number of
        times it occurs in the file.
    """
    # Imported here so the function does not depend on nltk happening to
    # re-export defaultdict in its top-level namespace.
    from collections import defaultdict

    # Read the file; the context manager closes the handle even on error
    with open(file) as handle:
        raw = handle.read()

    # Tokenize the raw text and normalise case
    words = [token.lower() for token in word_tokenize(raw)]

    # Remove stop words BEFORE stemming: the stop list holds unstemmed words,
    # so stems such as "thi" (from "this") would otherwise slip through.
    stop_words = set(stopwords.words('english'))
    content_words = [w for w in words if w not in stop_words]

    # Stem the remaining tokens
    porter = nltk.PorterStemmer()
    stemmed_tokens = [porter.stem(w) for w in content_words]

    # Count words
    count = defaultdict(int)
    for word in stemmed_tokens:
        count[word] += 1
    return count
def cos_sim(a, b):
    """Return the cosine similarity of the vectors ``a`` and ``b``.

    Args:
        a: First vector (array-like of numbers).
        b: Second vector, of the same length as ``a``.

    Returns:
        ``dot(a, b) / (norm(a) * norm(b))``, or ``0.0`` when either vector
        has zero norm, because the ratio is undefined in that case.
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        # An all-zero vector (e.g. a document with no countable terms) would
        # otherwise trigger a 0/0 division and propagate NaN to the caller.
        return 0.0
    return np.dot(a, b) / denominator
def getSimilarity(dict1, dict2):
    """Return the cosine similarity of two term-frequency dictionaries.

    Args:
        dict1: Mapping of term -> count for the first document.
        dict2: Mapping of term -> count for the second document.

    Returns:
        The result of ``cos_sim`` applied to the two count vectors built
        over the combined vocabulary of both dictionaries.
    """
    # Collect all unique words from both dictionaries. dict.fromkeys keeps
    # first-seen order and drops repeats, so a word present in both documents
    # gets exactly one vector component instead of being counted twice.
    vocabulary = list(dict.fromkeys([*dict1, *dict2]))

    # Create vectors for the dictionaries. The builtin int is used as dtype
    # because the np.int alias is no longer available in current NumPy.
    v1 = np.array([dict1.get(word, 0) for word in vocabulary], dtype=int)
    v2 = np.array([dict2.get(word, 0) for word in vocabulary], dtype=int)
    return cos_sim(v1, v2)
if __name__ == '__main__':
    # Build the term-frequency table of each document, then compare them
    first_counts = process("text1.txt")
    second_counts = process("text2.txt")
    similarity = getSimilarity(first_counts, second_counts)
    print("Similarity between two text documents:", similarity)
+57
View File
@@ -0,0 +1,57 @@
import numpy as np
# Parameters of the PageRank iteration
threshold = 1e-13  # stop once the summed absolute change drops below this
beta = 0.85        # weight of the link-following term; (1 - beta) goes to the uniform term

# Adjacency matrix of the example "spider trap" network
A = [
    [0, 0, 1, 0],
    [1, 0, 0, 0],
    [1, 1, 0, 0],
    [1, 1, 0, 1],
]
arr = np.array(A, dtype=float)

# Column totals, one entry per column of the adjacency matrix
s = [np.sum(arr[:, col]) for col in range(len(A))]
print("Summation of columns: ", s)

# Scale every non-empty column so that it sums to 1, giving the
# column stochastic probability matrix M
M = arr.copy()
for j, column_total in enumerate(s):
    if column_total == 0:
        continue  # leave all-zero columns untouched to avoid dividing by zero
    M[:, j] = M[:, j] / column_total
print("Column stochastic probability matrix, M:")
print(M)

# Start from equal rank for every node, stored as an N x 1 column vector
r = np.ones([len(M), 1]) / len(M)
print("Initial rank vector:")
print(r)

# Constant term added on every iteration: (1 - beta) times the initial vector
uniformR = (1.0 - beta) * r
r_prev = r.copy()

# Repeat the update until the change is below the threshold (at most 1000 rounds)
for i in range(1, 1001):
    print("Iteration: ", i)
    r = beta * (M @ r_prev) + uniformR
    print("The rank vector: ")
    print(r)

    diff = np.abs(r - r_prev).sum()
    if diff < threshold:
        break
    r_prev = r.copy()

# Show the final ranks as a flat vector
print("The final rank vector: ")
print(r[:, 0])
+30
View File
@@ -0,0 +1,30 @@
# Import libraries
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Ensure the necessary NLTK resources are downloaded: the tokenizer models and
# the stop-word lists. 'punkt_tab' is fetched as well because newer NLTK
# releases load the word_tokenize() tables from that package instead of 'punkt'.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')
def remove_stop_words(text):
    """Return ``text`` with English stop words removed.

    The text is split with ``word_tokenize``. Tokens whose lower-cased form
    appears in NLTK's English stop-word list are dropped, and the remaining
    tokens are joined back together with single spaces.
    """
    tokens = word_tokenize(text)
    english_stop_words = set(stopwords.words('english'))

    kept = []
    for token in tokens:
        if token.lower() in english_stop_words:
            continue  # stop word: leave it out of the result
        kept.append(token)
    return ' '.join(kept)
# Demonstration: show a sentence before and after stop-word removal
if __name__ == "__main__":
    input_text = "This is an example of a text document that needs stop word removal."
    preprocessed_text = remove_stop_words(input_text)
    print("Original Text:", input_text, sep="\n")
    print("\nPreprocessed Text:", preprocessed_text, sep="\n")
+31
View File
@@ -0,0 +1,31 @@
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Entry point that configures and submits the "Character Count" MapReduce job.
 */
public class CharacterCountDriver {

    /**
     * Validates the command line, wires up the job and exits with the job's status.
     *
     * @param args exactly two entries: the input path followed by the output path
     * @throws Exception if configuring or running the job fails
     */
    public static void main(String[] args) throws Exception {
        // Both the input and the output location are mandatory.
        if (args.length != 2) {
            System.err.println("Usage: CharacterCountDriver <input path> <output path>");
            System.exit(-1);
        }

        final Job job = Job.getInstance(new Configuration(), "Character Count");
        job.setJarByClass(CharacterCountDriver.class);

        // Processing classes; the reducer class is also registered as the combiner.
        job.setMapperClass(CharacterCountMapper.class);
        job.setCombinerClass(CharacterCountReducer.class);
        job.setReducerClass(CharacterCountReducer.class);

        // Types of the (key, value) pairs the job emits.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Exit code 0 when the job completed successfully, 1 otherwise.
        final boolean succeeded = job.waitForCompletion(true);
        System.exit(succeeded ? 0 : 1);
    }
}
+21
View File
@@ -0,0 +1,21 @@
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
/**
 * Mapper that emits a (character, 1) pair for every alphabetic character of
 * each input line, after lower-casing the line.
 */
public class CharacterCountMapper extends Mapper<Object, Text, Text, IntWritable> {
    // Constant value written for every character occurrence.
    private final static IntWritable one = new IntWritable(1);
    // Reused output key; overwritten before each write.
    private Text character = new Text();

    /**
     * Writes one (character, 1) pair per alphabetic character in {@code value}.
     *
     * @param key     input key; not used by this mapper
     * @param value   the text to scan
     * @param context receives the emitted pairs
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if writing to the context is interrupted
     */
    @Override
    protected void map(Object key, Text value, Context context) throws IOException, InterruptedException {
        // Locale.ROOT keeps the case mapping independent of the JVM's default
        // locale; with a Turkish default locale, for example, "I" would
        // otherwise be lower-cased to a dotless i and counted separately.
        String line = value.toString().toLowerCase(java.util.Locale.ROOT);
        for (char c : line.toCharArray()) {
            if (Character.isAlphabetic(c)) {
                character.set(String.valueOf(c));
                context.write(character, one);
            }
        }
    }
}
+19
View File
@@ -0,0 +1,19 @@
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
/**
 * Reducer that adds up all counts received for a key and emits (key, total).
 */
public class CharacterCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    // Reused output value; overwritten before each write.
    private IntWritable result = new IntWritable();

    /**
     * Sums the values of {@code key} and writes the total to the context.
     *
     * @param key     the key whose counts are being combined
     * @param values  the counts received for {@code key}
     * @param context receives the (key, total) pair
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if writing to the context is interrupted
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int total = 0;
        java.util.Iterator<IntWritable> remaining = values.iterator();
        while (remaining.hasNext()) {
            total += remaining.next().get();
        }
        result.set(total);
        context.write(key, result);
    }
}
+74
View File
@@ -0,0 +1,74 @@
### List of Commands
1. **Create a Directory for Your Project**:
```bash
mkdir ~/hadoop_char_count
cd ~/hadoop_char_count
```
2. **Compile the Java Files**:
```bash
javac -classpath $(hadoop classpath) -d . CharacterCountMapper.java CharacterCountReducer.java CharacterCountDriver.java
```
3. **Create the JAR File**:
```bash
jar cvf CharacterCount.jar *.class
```
4. **Create Input Directory in HDFS** (if needed):
```bash
hdfs dfs -mkdir -p /user/hduser/input
```
5. **Upload Input File to HDFS**:
```bash
hdfs dfs -put /path/to/your/local/input.txt /user/hduser/input/
```
6. **Run the MapReduce Job**:
```bash
hadoop jar CharacterCount.jar CharacterCountDriver /user/hduser/input /user/hduser/output
```
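7. **Remove Existing Output Directory** (if needed — Hadoop refuses to run a job whose output directory already exists, so do this before re-running step 6):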
```bash
hdfs dfs -rm -r /user/hduser/output
```
8. **List Contents of the Output Directory**:
```bash
hdfs dfs -ls /user/hduser/output
```
9. **View the Output File**:
```bash
hdfs dfs -cat /user/hduser/output/part-r-00000
```
10. **View Output with `more` or `less`**:
```bash
hdfs dfs -cat /user/hduser/output/part-r-00000 | more
```
or
```bash
hdfs dfs -cat /user/hduser/output/part-r-00000 | less
```
11. **Copy Output to Local File System (Optional)**:
```bash
hdfs dfs -get /user/hduser/output/part-r-00000 /path/to/local/directory/
```
---
+35
View File
@@ -0,0 +1,35 @@
# pip install requests beautifulsoup4
import requests
from bs4 import BeautifulSoup
import time
def crawl(url, depth):
    """Recursively crawl ``url`` and the absolute links found on each page.

    Args:
        url: Address of the page to fetch.
        depth: Number of link levels to follow. Nothing is fetched when it
            is zero or negative.

    Returns:
        None. Progress and failures are reported with ``print``.
    """
    # "<= 0" rather than "== 0": a negative depth would never hit the base
    # case and the recursion would go on without bound.
    if depth <= 0:
        return

    try:
        # The timeout keeps one unresponsive server from stalling the crawl;
        # a timeout is reported like any other request failure.
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # Check for HTTP errors
    except requests.RequestException as e:
        print(f"Failed to retrieve {url}: {e}")
        return

    soup = BeautifulSoup(response.text, 'html.parser')
    print(f"Crawling: {url}")

    # Find all links in the HTML; only hrefs starting with "http" are kept,
    # and the set removes duplicates within this page.
    links = {
        anchor['href']
        for anchor in soup.find_all('a', href=True)
        if anchor['href'].startswith('http')
    }

    # Recursively crawl each link
    # NOTE(review): pages reachable through several paths are fetched again
    # on each path; there is no cross-page "visited" tracking.
    for link in links:
        time.sleep(1)  # Be polite and avoid overwhelming the server
        crawl(link, depth - 1)
if __name__ == "__main__":
    # Ask for the starting page and the number of link levels to follow
    start_url = input("Enter the URL to crawl: ")
    depth_text = input("Enter the crawl depth: ")
    crawl_depth = int(depth_text)
    crawl(start_url, crawl_depth)
@@ -0,0 +1,230 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "946b7d7c-1e3a-4421-83ac-48c77a022c18",
"metadata": {},
"source": [
"# Practical-1.1\n",
"\n",
"Problem Statement: Write a program to Compute Similarity between two text documents.\n",
"\n",
"Code from InformationRetrieval (SPPU - Final Year - Computer Engineering - Content) repository on KSKA Git: https://git.kska.io/sppu-be-comp-content/InformationRetrieval/\n",
"\n",
"---"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "efe12052-a191-4760-9a75-a08d82b3d334",
"metadata": {},
"outputs": [],
"source": [
"# Import libraries\n",
"from nltk.corpus import stopwords\n",
"from nltk.tokenize import word_tokenize\n",
"import numpy as np\n",
"import nltk"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "c8efc1cd-5732-4853-8c92-a03b92ccb9af",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt to /home/nonroot/nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n",
"[nltk_data] Downloading package stopwords to\n",
"[nltk_data] /home/nonroot/nltk_data...\n",
"[nltk_data] Package stopwords is already up-to-date!\n",
"[nltk_data] Downloading package punkt_tab to\n",
"[nltk_data] /home/nonroot/nltk_data...\n",
"[nltk_data] Package punkt_tab is already up-to-date!\n"
]
},
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Download necessary NLTK data\n",
"nltk.download(\"punkt\")\n",
"nltk.download(\"stopwords\")\n",
"nltk.download('punkt_tab')"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "fe3bdfe7-91bd-4fcc-96d8-57fcf173605c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Contents of text1.txt:\n",
"This is a sample document. It contains text for testing the similarity.\n",
"\n",
"\n",
"Contents of text2.txt:\n",
"This document is a sample. It includes text to test the similarity.\n",
"\n",
"\n"
]
}
],
"source": [
"# Print contents of the two documents\n",
"def print_file_content(file):\n",
" with open(file, 'r') as f:\n",
" content = f.read()\n",
" print(content)\n",
"\n",
"print(\"Contents of text1.txt:\")\n",
"print_file_content(\"text1.txt\")\n",
"print(\"Contents of text2.txt:\")\n",
"print_file_content(\"text2.txt\")"
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "d9e3ad9f-3b5b-4e2d-a62f-6dce24484392",
"metadata": {},
"outputs": [],
"source": [
"def process(file):\n",
" # Read the file\n",
" raw = open(file).read()\n",
" \n",
" # Tokenize the raw text\n",
" tokens = word_tokenize(raw)\n",
" words = [w.lower() for w in tokens]\n",
" \n",
" # Stem the tokens\n",
" porter = nltk.PorterStemmer()\n",
" stemmed_tokens = [porter.stem(t) for t in words]\n",
"\n",
" # Removing stop words\n",
" stop_words = set(stopwords.words('english'))\n",
" filtered_tokens = [w for w in stemmed_tokens if w not in stop_words]\n",
" \n",
" # Count words\n",
" count = nltk.defaultdict(int)\n",
" for word in filtered_tokens:\n",
" count[word] += 1\n",
" \n",
" return count"
]
},
{
"cell_type": "code",
"execution_count": 36,
"id": "56f17214-bc46-4eaf-aeed-ce387212c9b1",
"metadata": {},
"outputs": [],
"source": [
"def cos_sim(a, b):\n",
" dot_product = np.dot(a, b)\n",
" norm_a = np.linalg.norm(a)\n",
" norm_b = np.linalg.norm(b)\n",
" \n",
" return dot_product / (norm_a * norm_b)"
]
},
{
"cell_type": "code",
"execution_count": 37,
"id": "6c1c3993-9909-4cb7-aaa5-a69714667afd",
"metadata": {},
"outputs": [],
"source": [
"def getSimilarity(dict1, dict2):\n",
" all_words_list = []\n",
" \n",
" # Collect all unique words from both dictionaries\n",
" for key in dict1:\n",
" all_words_list.append(key)\n",
" \n",
" for key in dict2:\n",
" all_words_list.append(key)\n",
" \n",
" all_words_list_size = len(all_words_list)\n",
" v1 = np.zeros(all_words_list_size, dtype=int) # Changed np.int to int\n",
" v2 = np.zeros(all_words_list_size, dtype=int) # Changed np.int to int\n",
" \n",
" # Create vectors for the dictionaries\n",
" for i, key in enumerate(all_words_list):\n",
" v1[i] = dict1.get(key, 0)\n",
" v2[i] = dict2.get(key, 0)\n",
" \n",
" return cos_sim(v1, v2)"
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "13f1e8f5-c8a1-4415-8901-641aa0e2cb5b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Similarity between two text documents: 0.9523809523809523\n"
]
}
],
"source": [
"if __name__ == '__main__':\n",
" dict1 = process(\"text1.txt\")\n",
" dict2 = process(\"text2.txt\")\n",
" \n",
" print(\"Similarity between two text documents:\", getSimilarity(dict1, dict2))"
]
},
{
"cell_type": "markdown",
"id": "a32301be-d57c-4892-b0b3-094a05f61f9a",
"metadata": {},
"source": [
"---"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.20"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
+2
View File
@@ -0,0 +1,2 @@
This is a sample document. It contains text for testing the similarity.
+2
View File
@@ -0,0 +1,2 @@
This document is a sample. It includes text to test the similarity.
@@ -0,0 +1,616 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "af1d39a1-915d-44e2-b06f-49777bfe4cf6",
"metadata": {},
"source": [
"# Practical-1.2\n",
"\n",
"Problem Statement: Implement Page Rank Algorithm.\n",
"\n",
"Code from InformationRetrieval (SPPU - Final Year - Computer Engineering - Content) repository on KSKA Git: https://git.kska.io/sppu-be-comp-content/InformationRetrieval/\n",
"\n",
"---"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "fcd4c298-e888-44ee-93d9-b9d3f3a9b05f",
"metadata": {},
"outputs": [],
"source": [
"# Import libraries\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "6d446fd6-e2ab-46d4-b9ee-ea1baa3e0b76",
"metadata": {},
"outputs": [],
"source": [
"# Constants for PageRank\n",
"threshold = 1e-13\n",
"beta = 0.85"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "25966376-d37f-41ef-a1ca-adbdf5831bd3",
"metadata": {},
"outputs": [],
"source": [
"# Spider Trap Network represented as adjacency matrix\n",
"A = [\n",
" [0, 0, 1, 0],\n",
" [1, 0, 0, 0],\n",
" [1, 1, 0, 0],\n",
" [1, 1, 0, 1]\n",
"]\n",
"\n",
"# Convert adjacency matrix to a numpy array\n",
"arr = np.array(A, dtype=float)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "e9932efe-ba91-4bd8-9e1b-aa96ea1fbc5b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Summation of columns: [3.0, 2.0, 1.0, 1.0]\n"
]
}
],
"source": [
"# Calculate summation of columns\n",
"s = []\n",
"for i in range(len(A)):\n",
" s.append(np.sum(arr[:, i]))\n",
"\n",
"print(\"Summation of columns: \", s)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "5f41e472-4f23-4a83-ac92-737581dd566c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Column stochastic probability matrix, M:\n",
"[[0. 0. 1. 0. ]\n",
" [0.33333333 0. 0. 0. ]\n",
" [0.33333333 0.5 0. 0. ]\n",
" [0.33333333 0.5 0. 1. ]]\n"
]
}
],
"source": [
"# Create the column stochastic probability matrix, M\n",
"M = arr.copy()\n",
"for j in range(len(A)):\n",
" if s[j] != 0: # Prevent division by zero\n",
" M[:, j] = M[:, j] / s[j]\n",
"\n",
"print(\"Column stochastic probability matrix, M:\")\n",
"print(M)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "e0c63b43-1825-4edb-873b-bab9d2e2f3d3",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Initial rank vector:\n",
"[[0.25]\n",
" [0.25]\n",
" [0.25]\n",
" [0.25]]\n"
]
}
],
"source": [
"# Initialize rank vector\n",
"r = (1.0 + np.zeros([len(M), 1])) / len(M)\n",
"print(\"Initial rank vector:\")\n",
"print(r)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "f540571b-5fd7-4ced-a8a5-7daeb4625f18",
"metadata": {},
"outputs": [],
"source": [
"# Calculate the uniform rank contribution\n",
"uniformR = (1.0 - beta) * r\n",
"r_prev = r.copy()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "b0d7f809-f901-4bf0-9676-ea4ea976a33a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Iteration: 1\n",
"The rank vector: \n",
"[[0.25 ]\n",
" [0.10833333]\n",
" [0.21458333]\n",
" [0.42708333]]\n",
"Iteration: 2\n",
"The rank vector: \n",
"[[0.21989583]\n",
" [0.10833333]\n",
" [0.154375 ]\n",
" [0.51739583]]\n",
"Iteration: 3\n",
"The rank vector: \n",
"[[0.16871875]\n",
" [0.09980382]\n",
" [0.14584549]\n",
" [0.58563194]]\n",
"Iteration: 4\n",
"The rank vector: \n",
"[[0.16146866]\n",
" [0.08530365]\n",
" [0.12772027]\n",
" [0.62550742]]\n",
"Iteration: 5\n",
"The rank vector: \n",
"[[0.14606223]\n",
" [0.08324945]\n",
" [0.1195035 ]\n",
" [0.65118481]]\n",
"Iteration: 6\n",
"The rank vector: \n",
"[[0.13907798]\n",
" [0.0788843 ]\n",
" [0.11426532]\n",
" [0.66777241]]\n",
"Iteration: 7\n",
"The rank vector: \n",
"[[0.13462552]\n",
" [0.07690543]\n",
" [0.11043125]\n",
" [0.6780378 ]]\n",
"Iteration: 8\n",
"The rank vector: \n",
"[[0.13136657]\n",
" [0.0756439 ]\n",
" [0.1083287 ]\n",
" [0.68466083]]\n",
"Iteration: 9\n",
"The rank vector: \n",
"[[0.1295794 ]\n",
" [0.07472053]\n",
" [0.10686918]\n",
" [0.68883089]]\n",
"Iteration: 10\n",
"The rank vector: \n",
"[[0.12833881]\n",
" [0.07421416]\n",
" [0.10597039]\n",
" [0.69147664]]\n",
"Iteration: 11\n",
"The rank vector: \n",
"[[0.12757483]\n",
" [0.07386266]\n",
" [0.10540368]\n",
" [0.69315883]]\n",
"Iteration: 12\n",
"The rank vector: \n",
"[[0.12709313]\n",
" [0.0736462 ]\n",
" [0.10503783]\n",
" [0.69422284]]\n",
"Iteration: 13\n",
"The rank vector: \n",
"[[0.12678216]\n",
" [0.07350972]\n",
" [0.10480936]\n",
" [0.69489877]]\n",
"Iteration: 14\n",
"The rank vector: \n",
"[[0.12658795]\n",
" [0.07342161]\n",
" [0.10466324]\n",
" [0.69532719]]\n",
"Iteration: 15\n",
"The rank vector: \n",
"[[0.12646376]\n",
" [0.07336659]\n",
" [0.10457077]\n",
" [0.69559889]]\n",
"Iteration: 16\n",
"The rank vector: \n",
"[[0.12638516]\n",
" [0.0733314 ]\n",
" [0.1045122 ]\n",
" [0.69577125]]\n",
"Iteration: 17\n",
"The rank vector: \n",
"[[0.12633537]\n",
" [0.07330913]\n",
" [0.10447497]\n",
" [0.69588053]]\n",
"Iteration: 18\n",
"The rank vector: \n",
"[[0.12630373]\n",
" [0.07329502]\n",
" [0.1044514 ]\n",
" [0.69594985]]\n",
"Iteration: 19\n",
"The rank vector: \n",
"[[0.12628369]\n",
" [0.07328606]\n",
" [0.10443644]\n",
" [0.69599382]]\n",
"Iteration: 20\n",
"The rank vector: \n",
"[[0.12627097]\n",
" [0.07328038]\n",
" [0.10442695]\n",
" [0.6960217 ]]\n",
"Iteration: 21\n",
"The rank vector: \n",
"[[0.12626291]\n",
" [0.07327678]\n",
" [0.10442094]\n",
" [0.69603938]]\n",
"Iteration: 22\n",
"The rank vector: \n",
"[[0.1262578 ]\n",
" [0.07327449]\n",
" [0.10441712]\n",
" [0.69605059]]\n",
"Iteration: 23\n",
"The rank vector: \n",
"[[0.12625455]\n",
" [0.07327304]\n",
" [0.1044147 ]\n",
" [0.6960577 ]]\n",
"Iteration: 24\n",
"The rank vector: \n",
"[[0.1262525 ]\n",
" [0.07327212]\n",
" [0.10441317]\n",
" [0.69606221]]\n",
"Iteration: 25\n",
"The rank vector: \n",
"[[0.12625119]\n",
" [0.07327154]\n",
" [0.10441219]\n",
" [0.69606508]]\n",
"Iteration: 26\n",
"The rank vector: \n",
"[[0.12625036]\n",
" [0.07327117]\n",
" [0.10441158]\n",
" [0.69606689]]\n",
"Iteration: 27\n",
"The rank vector: \n",
"[[0.12624984]\n",
" [0.07327094]\n",
" [0.10441118]\n",
" [0.69606804]]\n",
"Iteration: 28\n",
"The rank vector: \n",
"[[0.12624951]\n",
" [0.07327079]\n",
" [0.10441094]\n",
" [0.69606877]]\n",
"Iteration: 29\n",
"The rank vector: \n",
"[[0.1262493 ]\n",
" [0.07327069]\n",
" [0.10441078]\n",
" [0.69606923]]\n",
"Iteration: 30\n",
"The rank vector: \n",
"[[0.12624916]\n",
" [0.07327063]\n",
" [0.10441068]\n",
" [0.69606953]]\n",
"Iteration: 31\n",
"The rank vector: \n",
"[[0.12624908]\n",
" [0.0732706 ]\n",
" [0.10441062]\n",
" [0.69606971]]\n",
"Iteration: 32\n",
"The rank vector: \n",
"[[0.12624902]\n",
" [0.07327057]\n",
" [0.10441057]\n",
" [0.69606983]]\n",
"Iteration: 33\n",
"The rank vector: \n",
"[[0.12624899]\n",
" [0.07327056]\n",
" [0.10441055]\n",
" [0.69606991]]\n",
"Iteration: 34\n",
"The rank vector: \n",
"[[0.12624897]\n",
" [0.07327055]\n",
" [0.10441053]\n",
" [0.69606995]]\n",
"Iteration: 35\n",
"The rank vector: \n",
"[[0.12624895]\n",
" [0.07327054]\n",
" [0.10441052]\n",
" [0.69606998]]\n",
"Iteration: 36\n",
"The rank vector: \n",
"[[0.12624894]\n",
" [0.07327054]\n",
" [0.10441052]\n",
" [0.69607 ]]\n",
"Iteration: 37\n",
"The rank vector: \n",
"[[0.12624894]\n",
" [0.07327053]\n",
" [0.10441051]\n",
" [0.69607001]]\n",
"Iteration: 38\n",
"The rank vector: \n",
"[[0.12624894]\n",
" [0.07327053]\n",
" [0.10441051]\n",
" [0.69607002]]\n",
"Iteration: 39\n",
"The rank vector: \n",
"[[0.12624893]\n",
" [0.07327053]\n",
" [0.10441051]\n",
" [0.69607003]]\n",
"Iteration: 40\n",
"The rank vector: \n",
"[[0.12624893]\n",
" [0.07327053]\n",
" [0.10441051]\n",
" [0.69607003]]\n",
"Iteration: 41\n",
"The rank vector: \n",
"[[0.12624893]\n",
" [0.07327053]\n",
" [0.10441051]\n",
" [0.69607003]]\n",
"Iteration: 42\n",
"The rank vector: \n",
"[[0.12624893]\n",
" [0.07327053]\n",
" [0.10441051]\n",
" [0.69607003]]\n",
"Iteration: 43\n",
"The rank vector: \n",
"[[0.12624893]\n",
" [0.07327053]\n",
" [0.10441051]\n",
" [0.69607003]]\n",
"Iteration: 44\n",
"The rank vector: \n",
"[[0.12624893]\n",
" [0.07327053]\n",
" [0.10441051]\n",
" [0.69607003]]\n",
"Iteration: 45\n",
"The rank vector: \n",
"[[0.12624893]\n",
" [0.07327053]\n",
" [0.10441051]\n",
" [0.69607003]]\n",
"Iteration: 46\n",
"The rank vector: \n",
"[[0.12624893]\n",
" [0.07327053]\n",
" [0.10441051]\n",
" [0.69607003]]\n",
"Iteration: 47\n",
"The rank vector: \n",
"[[0.12624893]\n",
" [0.07327053]\n",
" [0.10441051]\n",
" [0.69607003]]\n",
"Iteration: 48\n",
"The rank vector: \n",
"[[0.12624893]\n",
" [0.07327053]\n",
" [0.10441051]\n",
" [0.69607004]]\n",
"Iteration: 49\n",
"The rank vector: \n",
"[[0.12624893]\n",
" [0.07327053]\n",
" [0.10441051]\n",
" [0.69607004]]\n",
"Iteration: 50\n",
"The rank vector: \n",
"[[0.12624893]\n",
" [0.07327053]\n",
" [0.10441051]\n",
" [0.69607004]]\n",
"Iteration: 51\n",
"The rank vector: \n",
"[[0.12624893]\n",
" [0.07327053]\n",
" [0.10441051]\n",
" [0.69607004]]\n",
"Iteration: 52\n",
"The rank vector: \n",
"[[0.12624893]\n",
" [0.07327053]\n",
" [0.10441051]\n",
" [0.69607004]]\n",
"Iteration: 53\n",
"The rank vector: \n",
"[[0.12624893]\n",
" [0.07327053]\n",
" [0.10441051]\n",
" [0.69607004]]\n",
"Iteration: 54\n",
"The rank vector: \n",
"[[0.12624893]\n",
" [0.07327053]\n",
" [0.10441051]\n",
" [0.69607004]]\n",
"Iteration: 55\n",
"The rank vector: \n",
"[[0.12624893]\n",
" [0.07327053]\n",
" [0.10441051]\n",
" [0.69607004]]\n",
"Iteration: 56\n",
"The rank vector: \n",
"[[0.12624893]\n",
" [0.07327053]\n",
" [0.10441051]\n",
" [0.69607004]]\n",
"Iteration: 57\n",
"The rank vector: \n",
"[[0.12624893]\n",
" [0.07327053]\n",
" [0.10441051]\n",
" [0.69607004]]\n",
"Iteration: 58\n",
"The rank vector: \n",
"[[0.12624893]\n",
" [0.07327053]\n",
" [0.10441051]\n",
" [0.69607004]]\n",
"Iteration: 59\n",
"The rank vector: \n",
"[[0.12624893]\n",
" [0.07327053]\n",
" [0.10441051]\n",
" [0.69607004]]\n",
"Iteration: 60\n",
"The rank vector: \n",
"[[0.12624893]\n",
" [0.07327053]\n",
" [0.10441051]\n",
" [0.69607004]]\n",
"Iteration: 61\n",
"The rank vector: \n",
"[[0.12624893]\n",
" [0.07327053]\n",
" [0.10441051]\n",
" [0.69607004]]\n",
"Iteration: 62\n",
"The rank vector: \n",
"[[0.12624893]\n",
" [0.07327053]\n",
" [0.10441051]\n",
" [0.69607004]]\n",
"Iteration: 63\n",
"The rank vector: \n",
"[[0.12624893]\n",
" [0.07327053]\n",
" [0.10441051]\n",
" [0.69607004]]\n",
"Iteration: 64\n",
"The rank vector: \n",
"[[0.12624893]\n",
" [0.07327053]\n",
" [0.10441051]\n",
" [0.69607004]]\n",
"Iteration: 65\n",
"The rank vector: \n",
"[[0.12624893]\n",
" [0.07327053]\n",
" [0.10441051]\n",
" [0.69607004]]\n"
]
}
],
"source": [
"# PageRank iterations\n",
"for i in range(1, 1001):\n",
" print(\"Iteration: \", i)\n",
" r = beta * np.matmul(M, r_prev) + uniformR\n",
" print(\"The rank vector: \")\n",
" print(r)\n",
"\n",
" diff = np.sum(abs(r - r_prev))\n",
" if diff < threshold:\n",
" break\n",
" r_prev = r.copy()"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "9fddbce3-0f30-4912-bfaa-f71a2d00d385",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The final rank vector: \n",
"[0.12624893 0.07327053 0.10441051 0.69607004]\n"
]
}
],
"source": [
"# Display the final rank vector\n",
"print(\"The final rank vector: \")\n",
"print(r[:, 0])"
]
},
{
"cell_type": "markdown",
"id": "bcbaa397-957c-4e79-b68a-e2070ee11baf",
"metadata": {},
"source": [
"---"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.20"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
+144
View File
@@ -0,0 +1,144 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "ca2da52f-4a43-4db5-bf5d-54bd3506f81e",
"metadata": {},
"source": [
"# Code-1.3\n",
"\n",
"Problem Statement: Write a program for Pre-processing of a Text Document: stop word removal.\n",
"\n",
"Code from InformationRetrieval (SPPU - Final Year - Computer Engineering - Content) repository on KSKA Git: https://git.kska.io/sppu-be-comp-content/InformationRetrieval/\n",
"\n",
"---"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "f9085aa3-6fc3-432c-8a96-5e6dcb89a900",
"metadata": {},
"outputs": [],
"source": [
"# Import libraries\n",
"import nltk\n",
"from nltk.corpus import stopwords\n",
"from nltk.tokenize import word_tokenize"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "81c78019-0857-4e4a-8235-8d2db97de214",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt to /home/nonroot/nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n",
"[nltk_data] Downloading package stopwords to\n",
"[nltk_data] /home/nonroot/nltk_data...\n",
"[nltk_data] Package stopwords is already up-to-date!\n"
]
},
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Download NLTK Resources\n",
"nltk.download('punkt')\n",
"nltk.download('stopwords')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "58c711bf-c052-4314-8103-5f6ce43d41c0",
"metadata": {},
"outputs": [],
"source": [
"# Stop word removal function\n",
"def remove_stop_words(text):\n",
" # Tokenizing the text into words\n",
" words = word_tokenize(text)\n",
" \n",
" # Defining the English stop words\n",
" stop_words = set(stopwords.words('english'))\n",
" \n",
" # Removing stop words from the text\n",
" filtered_words = [word for word in words if word.lower() not in stop_words]\n",
" \n",
" return ' '.join(filtered_words)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "fb409348-1737-48ac-baad-7a9024914b57",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Original Text:\n",
"This is an example of a text document that needs stop word removal\n",
"\n",
"Preprocessed Text:\n",
"example text document needs stop word removal\n"
]
}
],
"source": [
"# Main function\n",
"if __name__ == \"__main__\":\n",
" input_text = \"This is an example of a text document that needs stop word removal\"\n",
" preprocessed_text = remove_stop_words(input_text)\n",
" print(\"Original Text:\")\n",
" print(input_text)\n",
" print(\"\\nPreprocessed Text:\")\n",
" print(preprocessed_text)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "54566bef-20a0-494b-9299-500417834bfd",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.20"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
+13
View File
@@ -12,6 +12,18 @@ This repository contains essential resources for the Information Retrieval cours
### Codes ### Codes
1. [Code-1.1 (Document Similarity)](Codes/Code-1.1.py)
2. [Code-1.2 (Page Rank Algorithm)](Codes/Code-1.2.py)
3. [Code-1.3 (Stopword Removal)](Codes/Code-1.3.py)
4. [Code-1.4 (Hadoop)](Codes/Code-1.4/)
5. [Code-1.5 (Simple Web Crawler)](Codes/Code-1.5.py)
### Notebooks
1. [Code-1.1 (Document Similarity)](Notebooks/Code-1.1/)
2. [Code-1.2 (Page Rank Algorithm)](Notebooks/Code-1.2%20%28Page%20Rank%20Algorithm%29.ipynb)
3. [Code-1.3 (Stopword Removal)](Notebooks/Code-1.3%20%28Stopword%20Removal%29.ipynb)
### Practical ### Practical
1. [Practical-1.1](Practical/Practical-1.1/) 1. [Practical-1.1](Practical/Practical-1.1/)
@@ -25,6 +37,7 @@ This repository contains essential resources for the Information Retrieval cours
- [END-SEM](Question%20Papers/END-SEM) - [END-SEM](Question%20Papers/END-SEM)
### [IN-SEM PYQ Answers](Notes/IN-SEM%20PYQ%20Answers) ### [IN-SEM PYQ Answers](Notes/IN-SEM%20PYQ%20Answers)
### [END-SEM PYQ Answers](Notes/END-SEM%20PYQ%20Answers)
--- ---