Upload end-sem pyq for IR, November-December 2025. Provided by Ayush Kalaskar.

Added May-June 2025 IR pyq. Provided by Afan Shaikh.
Added end-sem pyq answers for unit 6. Collaborative work by Ayush Kalaskar and Himanshu Patil.
2026-03-22 02:12:40 +05:30 · 2025-12-07 22:57:13 +05:30 · 2025-12-02 23:40:00 +05:30 · 2025-12-02 23:15:51 +05:30 · 2025-12-02 15:10:31 +05:30 · 2025-12-02 15:09:54 +05:30
32 changed files with 1340 additions and 0 deletions
@@ -0,0 +1,66 @@
 from nltk.corpus import stopwords
 from nltk.tokenize import word_tokenize
 import numpy as np
 import nltk
 # Download necessary NLTK data
 nltk.download("punkt")
 nltk.download("stopwords")
 # Recent NLTK releases make word_tokenize load its models from the
 # "punkt_tab" resource; without it tokenization raises LookupError.
 nltk.download("punkt_tab")
 def process(file):
    """Build a term-frequency table for the text file at *file*.

    The text is tokenized with ``word_tokenize``, lowercased, stripped of
    English stop words and then reduced to Porter stems.

    Args:
        file: Path of the text file to read.

    Returns:
        A ``defaultdict(int)`` mapping each remaining stem to the number of
        times it occurs in the document.
    """
    # Imported locally so the function does not rely on nltk happening to
    # re-export ``defaultdict`` in its namespace.
    from collections import defaultdict

    # Read the file; the context manager closes the handle even on error.
    with open(file) as handle:
        raw = handle.read()
    # Tokenize the raw text and normalise case
    words = [w.lower() for w in word_tokenize(raw)]
    # Remove stop words BEFORE stemming: the stemmer rewrites some of them
    # (e.g. "this" -> "thi", "was" -> "wa"), so filtering afterwards lets
    # those stop words slip through.
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [w for w in words if w not in stop_words]
    # Stem the surviving tokens and count them
    porter = nltk.PorterStemmer()
    count = defaultdict(int)
    for word in filtered_tokens:
        count[porter.stem(word)] += 1
    return count
 def cos_sim(a, b):
    """Return the cosine similarity of the vectors *a* and *b*.

    Args:
        a: First vector (any sequence/array accepted by ``np.dot``).
        b: Second vector, of the same length as *a*.

    Returns:
        ``dot(a, b) / (|a| * |b|)``. When either vector has zero norm the
        ratio is undefined, so ``0.0`` is returned instead of ``nan``.
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    # Guard the 0/0 case (e.g. an empty or all-zero document vector), which
    # would otherwise yield nan together with a RuntimeWarning.
    if norm_product == 0:
        return 0.0
    return np.dot(a, b) / norm_product
 def getSimilarity(dict1, dict2):
    """Return the cosine similarity of two term-frequency dictionaries.

    Args:
        dict1: Mapping of term -> count for the first document.
        dict2: Mapping of term -> count for the second document.

    Returns:
        The value of ``cos_sim`` applied to the two count vectors built over
        the combined vocabulary of both documents.
    """
    # Build the vocabulary with every word exactly once. Appending the keys
    # of both dictionaries blindly would give shared words two vector slots
    # and overweight them in the similarity.
    all_words_list = list(dict1)
    all_words_list.extend(key for key in dict2 if key not in dict1)
    # Create vectors for the dictionaries. The builtin ``int`` is used as
    # dtype because the ``np.int`` alias no longer exists in current NumPy.
    v1 = np.array([dict1.get(key, 0) for key in all_words_list], dtype=int)
    v2 = np.array([dict2.get(key, 0) for key in all_words_list], dtype=int)
    return cos_sim(v1, v2)
 if __name__ == '__main__':
    # Build one term-frequency table per document, then report their similarity.
    freq_first, freq_second = (process(path) for path in ("text1.txt", "text2.txt"))
    score = getSimilarity(freq_first, freq_second)
    print("Similarity between two text documents:", score)
@@ -0,0 +1,57 @@
 import numpy as np
 # PageRank settings: convergence tolerance and damping factor
 threshold = 1e-13
 beta = 0.85
 # Adjacency matrix of the example network, which contains a spider trap
 A = [
    [0, 0, 1, 0],
    [1, 0, 0, 0],
    [1, 1, 0, 0],
    [1, 1, 0, 1]
 ]
 # Floating-point array view of the adjacency matrix
 arr = np.array(A, dtype=float)
 # Sum of the entries in each column
 s = [np.sum(arr[:, i]) for i in range(len(A))]
 print("Summation of columns: ", s)
 # Normalise each column by its sum to obtain the column stochastic matrix M
 M = arr.copy()
 for j, column_total in enumerate(s):
    if column_total == 0:  # an all-zero column is left as is (no division by zero)
        continue
    M[:, j] /= column_total
 print("Column stochastic probability matrix, M:")
 print(M)
 # Start from a uniform rank vector (column vector, one entry per node)
 r = np.ones([len(M), 1]) / len(M)
 print("Initial rank vector:")
 print(r)
 # Constant share that every node receives on each iteration
 uniformR = r * (1.0 - beta)
 r_prev = r.copy()
 # Iterate until the L1 change between successive vectors drops below the
 # threshold, or 1000 iterations have been performed
 for i in range(1, 1001):
    print("Iteration: ", i)
    r = beta * (M @ r_prev) + uniformR
    print("The rank vector: ")
    print(r)
    diff = np.abs(r - r_prev).sum()
    converged = diff < threshold
    if converged:
        break
    r_prev = r.copy()
 # Show the converged ranks as a flat vector
 print("The final rank vector: ")
 print(r.ravel())
@@ -0,0 +1,30 @@
 # Import libraries
 import nltk
 from nltk.corpus import stopwords
 from nltk.tokenize import word_tokenize
 # Ensure you have the necessary NLTK resources downloaded
 nltk.download('punkt')
 nltk.download('stopwords')
 # Recent NLTK releases make word_tokenize load its models from the
 # 'punkt_tab' resource; without it tokenization raises LookupError.
 nltk.download('punkt_tab')
 def remove_stop_words(text):
    """Return *text* with English stop words removed.

    The text is split with ``word_tokenize``; every token whose lowercase
    form is an NLTK English stop word is dropped, and the remaining tokens
    are joined back together with single spaces.
    """
    tokens = word_tokenize(text)
    english_stops = set(stopwords.words('english'))
    kept = []
    for token in tokens:
        if token.lower() in english_stops:
            continue  # stop word: leave it out of the result
        kept.append(token)
    return ' '.join(kept)
 # Demonstration: show a sentence before and after stop word removal
 if __name__ == "__main__":
    input_text = "This is an example of a text document that needs stop word removal."
    preprocessed_text = remove_stop_words(input_text)
    sections = (
        ("Original Text:", input_text),
        ("\nPreprocessed Text:", preprocessed_text),
    )
    for heading, body in sections:
        print(heading)
        print(body)
@@ -0,0 +1,31 @@
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.IntWritable;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.mapreduce.Job;
 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
 /**
  * Entry point that configures and submits the "Character Count" MapReduce job.
  */
 public class CharacterCountDriver {
    /**
     * Expects exactly two arguments: the input path and the output path.
     * Exits with -1 on wrong usage, 0 when the job succeeds and 1 otherwise.
     */
    public static void main(String[] args) throws Exception {
        final int expectedArgCount = 2;
        if (args.length != expectedArgCount) {
            System.err.println("Usage: CharacterCountDriver <input path> <output path>");
            System.exit(-1);
        }

        Configuration configuration = new Configuration();
        Job countJob = Job.getInstance(configuration, "Character Count");
        countJob.setJarByClass(CharacterCountDriver.class);

        // The reducer also serves as the combiner.
        countJob.setMapperClass(CharacterCountMapper.class);
        countJob.setCombinerClass(CharacterCountReducer.class);
        countJob.setReducerClass(CharacterCountReducer.class);

        // Output records are (Text, IntWritable) pairs.
        countJob.setOutputKeyClass(Text.class);
        countJob.setOutputValueClass(IntWritable.class);

        Path inputPath = new Path(args[0]);
        FileInputFormat.addInputPath(countJob, inputPath);
        Path outputPath = new Path(args[1]);
        FileOutputFormat.setOutputPath(countJob, outputPath);

        boolean succeeded = countJob.waitForCompletion(true);
        int exitCode;
        if (succeeded) {
            exitCode = 0;
        } else {
            exitCode = 1;
        }
        System.exit(exitCode);
    }
 }
@@ -0,0 +1,21 @@
 import java.io.IOException;
 import org.apache.hadoop.io.IntWritable;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.mapreduce.Mapper;
 /**
  * Mapper that emits (character, 1) for every alphabetic character of a line.
  */
 public class CharacterCountMapper extends Mapper<Object, Text, Text, IntWritable> {
    // Reused writables so no new objects are allocated per emitted record.
    private static final IntWritable ONE = new IntWritable(1);
    private final Text outputKey = new Text();

    /**
     * Lowercases the incoming line and writes one (character, 1) pair per
     * alphabetic character; all other characters are skipped.
     */
    @Override
    protected void map(Object key, Text value, Context context) throws IOException, InterruptedException {
        // NOTE(review): toLowerCase() without a Locale uses the JVM default locale.
        String lowered = value.toString().toLowerCase();
        char[] symbols = lowered.toCharArray();
        for (int index = 0; index < symbols.length; index++) {
            char symbol = symbols[index];
            if (!Character.isAlphabetic(symbol)) {
                continue;
            }
            outputKey.set(String.valueOf(symbol));
            context.write(outputKey, ONE);
        }
    }
 }
@@ -0,0 +1,19 @@
 import java.io.IOException;
 import org.apache.hadoop.io.IntWritable;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.mapreduce.Reducer;
 /**
  * Reducer that adds up all counts received for a character.
  */
 public class CharacterCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    // Reused output writable holding the total for the current key.
    private final IntWritable total = new IntWritable();

    /**
     * Sums every value supplied for {@code key} and writes (key, sum).
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int runningSum = 0;
        for (IntWritable partial : values) {
            runningSum = runningSum + partial.get();
        }
        total.set(runningSum);
        context.write(key, total);
    }
 }
@@ -0,0 +1,74 @@
 ### List of Commands
 1. **Create a Directory for Your Project**:
   ```bash
   mkdir ~/hadoop_char_count
   cd ~/hadoop_char_count
   ```
 2. **Compile the Java Files**:
   ```bash
   javac -classpath $(hadoop classpath) -d . CharacterCountMapper.java CharacterCountReducer.java CharacterCountDriver.java
   ```
 3. **Create the JAR File**:
   ```bash
   jar cvf CharacterCount.jar *.class
   ```
 4. **Create Input Directory in HDFS** (if needed):
   ```bash
   hdfs dfs -mkdir -p /user/hduser/input
   ```
 5. **Upload Input File to HDFS**:
   ```bash
   hdfs dfs -put /path/to/your/local/input.txt /user/hduser/input/
   ```
 6. **Run the MapReduce Job**:
   ```bash
   hadoop jar CharacterCount.jar CharacterCountDriver /user/hduser/input /user/hduser/output
   ```
 7. **Remove Existing Output Directory** (only if the job in step 6 fails because the output directory already exists; then re-run step 6):
   ```bash
   hdfs dfs -rm -r /user/hduser/output
   ```
 8. **List Contents of the Output Directory**:
   ```bash
   hdfs dfs -ls /user/hduser/output
   ```
 9. **View the Output File**:
   ```bash
   hdfs dfs -cat /user/hduser/output/part-r-00000
   ```
 10. **View Output with `more` or `less`**:
    ```bash
    hdfs dfs -cat /user/hduser/output/part-r-00000 | more
    ```
    or
    ```bash
    hdfs dfs -cat /user/hduser/output/part-r-00000 | less
    ```
 11. **Copy Output to Local File System (Optional)**:
    ```bash
    hdfs dfs -get /user/hduser/output/part-r-00000 /path/to/local/directory/
    ```
 ---
@@ -0,0 +1,35 @@
 # pip install requests beautifulsoup4
 import requests
 from bs4 import BeautifulSoup
 import time
 def crawl(url, depth, visited=None):
    """Recursively crawl pages starting at *url*, following absolute links.

    Args:
        url: Address of the page to fetch.
        depth: Number of link levels still to follow; ``0`` fetches nothing.
        visited: Optional dict mapping each URL already handled to the
            largest remaining depth it was crawled with. It is shared across
            the recursion so pages are not fetched again needlessly and link
            cycles terminate early. Callers normally omit it.

    Returns:
        None. Progress and failures are reported with ``print``.
    """
    if depth == 0:
        return
    if visited is None:
        visited = {}
    # Skip a page that was already crawled at least this deep; a revisit
    # with more depth left is still allowed so no reachable page is lost.
    if visited.get(url, 0) >= depth:
        return
    visited[url] = depth
    try:
        # A timeout keeps one unresponsive server from hanging the crawl.
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # Check for HTTP errors
    except requests.RequestException as e:
        print(f"Failed to retrieve {url}: {e}")
        return
    soup = BeautifulSoup(response.text, 'html.parser')
    print(f"Crawling: {url}")
    # Find all links in the HTML (only absolute http/https style hrefs)
    links = set()
    for link in soup.find_all('a', href=True):
        full_url = link['href']
        if full_url.startswith('http'):
            links.add(full_url)
    # Recursively crawl each link
    for link in links:
        # Do not wait for links the recursive call would skip immediately.
        if visited.get(link, 0) >= depth - 1:
            continue
        time.sleep(1)  # Be polite and avoid overwhelming the server
        crawl(link, depth - 1, visited)
 if __name__ == "__main__":
    # Ask for the starting page first, then for how many link levels to follow.
    seed_url = input("Enter the URL to crawl: ")
    depth_text = input("Enter the crawl depth: ")
    max_depth = int(depth_text)
    crawl(seed_url, max_depth)
@@ -0,0 +1,230 @@
 {
 "cells": [
  {
   "cell_type": "markdown",
   "id": "946b7d7c-1e3a-4421-83ac-48c77a022c18",
   "metadata": {},
   "source": [
    "# Practical-1.1\n",
    "\n",
    "Problem Statement: Write a program to Compute Similarity between two text documents.\n",
    "\n",
    "Code from InformationRetrieval (SPPU - Final Year - Computer Engineering - Content) repository on KSKA Git: https://git.kska.io/sppu-be-comp-content/InformationRetrieval/\n",
    "\n",
    "---"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "efe12052-a191-4760-9a75-a08d82b3d334",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import libraries\n",
    "from nltk.corpus import stopwords\n",
    "from nltk.tokenize import word_tokenize\n",
    "import numpy as np\n",
    "import nltk"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "c8efc1cd-5732-4853-8c92-a03b92ccb9af",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package punkt to /home/nonroot/nltk_data...\n",
      "[nltk_data]   Package punkt is already up-to-date!\n",
      "[nltk_data] Downloading package stopwords to\n",
      "[nltk_data]     /home/nonroot/nltk_data...\n",
      "[nltk_data]   Package stopwords is already up-to-date!\n",
      "[nltk_data] Downloading package punkt_tab to\n",
      "[nltk_data]     /home/nonroot/nltk_data...\n",
      "[nltk_data]   Package punkt_tab is already up-to-date!\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Download necessary NLTK data\n",
    "nltk.download(\"punkt\")\n",
    "nltk.download(\"stopwords\")\n",
    "nltk.download('punkt_tab')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "fe3bdfe7-91bd-4fcc-96d8-57fcf173605c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Contents of text1.txt:\n",
      "This is a sample document. It contains text for testing the similarity.\n",
      "\n",
      "\n",
      "Contents of text2.txt:\n",
      "This document is a sample. It includes text to test the similarity.\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Print contents of the two documents\n",
    "def print_file_content(file):\n",
    "    with open(file, 'r') as f:\n",
    "        content = f.read()\n",
    "        print(content)\n",
    "\n",
    "print(\"Contents of text1.txt:\")\n",
    "print_file_content(\"text1.txt\")\n",
    "print(\"Contents of text2.txt:\")\n",
    "print_file_content(\"text2.txt\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "d9e3ad9f-3b5b-4e2d-a62f-6dce24484392",
   "metadata": {},
   "outputs": [],
   "source": [
    "def process(file):\n",
    "    # Read the file\n",
    "    raw = open(file).read()\n",
    "    \n",
    "    # Tokenize the raw text\n",
    "    tokens = word_tokenize(raw)\n",
    "    words = [w.lower() for w in tokens]\n",
    "    \n",
    "    # Stem the tokens\n",
    "    porter = nltk.PorterStemmer()\n",
    "    stemmed_tokens = [porter.stem(t) for t in words]\n",
    "\n",
    "    # Removing stop words\n",
    "    stop_words = set(stopwords.words('english'))\n",
    "    filtered_tokens = [w for w in stemmed_tokens if w not in stop_words]\n",
    "    \n",
    "    # Count words\n",
    "    count = nltk.defaultdict(int)\n",
    "    for word in filtered_tokens:\n",
    "        count[word] += 1\n",
    "        \n",
    "    return count"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "56f17214-bc46-4eaf-aeed-ce387212c9b1",
   "metadata": {},
   "outputs": [],
   "source": [
    "def cos_sim(a, b):\n",
    "    dot_product = np.dot(a, b)\n",
    "    norm_a = np.linalg.norm(a)\n",
    "    norm_b = np.linalg.norm(b)\n",
    "    \n",
    "    return dot_product / (norm_a * norm_b)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "6c1c3993-9909-4cb7-aaa5-a69714667afd",
   "metadata": {},
   "outputs": [],
   "source": [
    "def getSimilarity(dict1, dict2):\n",
    "    all_words_list = []\n",
    "    \n",
    "    # Collect all unique words from both dictionaries\n",
    "    for key in dict1:\n",
    "        all_words_list.append(key)\n",
    "        \n",
    "    for key in dict2:\n",
    "        all_words_list.append(key)\n",
    "        \n",
    "    all_words_list_size = len(all_words_list)\n",
    "    v1 = np.zeros(all_words_list_size, dtype=int)  # Changed np.int to int\n",
    "    v2 = np.zeros(all_words_list_size, dtype=int)  # Changed np.int to int\n",
    "    \n",
    "    # Create vectors for the dictionaries\n",
    "    for i, key in enumerate(all_words_list):\n",
    "        v1[i] = dict1.get(key, 0)\n",
    "        v2[i] = dict2.get(key, 0)\n",
    "        \n",
    "    return cos_sim(v1, v2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "13f1e8f5-c8a1-4415-8901-641aa0e2cb5b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Similarity between two text documents: 0.9523809523809523\n"
     ]
    }
   ],
   "source": [
    "if __name__ == '__main__':\n",
    "    dict1 = process(\"text1.txt\")\n",
    "    dict2 = process(\"text2.txt\")\n",
    "    \n",
    "    print(\"Similarity between two text documents:\", getSimilarity(dict1, dict2))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a32301be-d57c-4892-b0b3-094a05f61f9a",
   "metadata": {},
   "source": [
    "---"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.20"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }
@@ -0,0 +1,2 @@
 This is a sample document. It contains text for testing the similarity.
@@ -0,0 +1,2 @@
 This document is a sample. It includes text to test the similarity.
@@ -0,0 +1,616 @@
 {
 "cells": [
  {
   "cell_type": "markdown",
   "id": "af1d39a1-915d-44e2-b06f-49777bfe4cf6",
   "metadata": {},
   "source": [
    "# Practical-1.2\n",
    "\n",
    "Problem Statement: Implement Page Rank Algorithm.\n",
    "\n",
    "Code from InformationRetrieval (SPPU - Final Year - Computer Engineering - Content) repository on KSKA Git: https://git.kska.io/sppu-be-comp-content/InformationRetrieval/\n",
    "\n",
    "---"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "fcd4c298-e888-44ee-93d9-b9d3f3a9b05f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import libraries\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "6d446fd6-e2ab-46d4-b9ee-ea1baa3e0b76",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Constants for PageRank\n",
    "threshold = 1e-13\n",
    "beta = 0.85"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "25966376-d37f-41ef-a1ca-adbdf5831bd3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Spider Trap Network represented as adjacency matrix\n",
    "A = [\n",
    "    [0, 0, 1, 0],\n",
    "    [1, 0, 0, 0],\n",
    "    [1, 1, 0, 0],\n",
    "    [1, 1, 0, 1]\n",
    "]\n",
    "\n",
    "# Convert adjacency matrix to a numpy array\n",
    "arr = np.array(A, dtype=float)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "e9932efe-ba91-4bd8-9e1b-aa96ea1fbc5b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Summation of columns:  [3.0, 2.0, 1.0, 1.0]\n"
     ]
    }
   ],
   "source": [
    "# Calculate summation of columns\n",
    "s = []\n",
    "for i in range(len(A)):\n",
    "    s.append(np.sum(arr[:, i]))\n",
    "\n",
    "print(\"Summation of columns: \", s)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "5f41e472-4f23-4a83-ac92-737581dd566c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Column stochastic probability matrix, M:\n",
      "[[0.         0.         1.         0.        ]\n",
      " [0.33333333 0.         0.         0.        ]\n",
      " [0.33333333 0.5        0.         0.        ]\n",
      " [0.33333333 0.5        0.         1.        ]]\n"
     ]
    }
   ],
   "source": [
    "# Create the column stochastic probability matrix, M\n",
    "M = arr.copy()\n",
    "for j in range(len(A)):\n",
    "    if s[j] != 0:  # Prevent division by zero\n",
    "        M[:, j] = M[:, j] / s[j]\n",
    "\n",
    "print(\"Column stochastic probability matrix, M:\")\n",
    "print(M)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "e0c63b43-1825-4edb-873b-bab9d2e2f3d3",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Initial rank vector:\n",
      "[[0.25]\n",
      " [0.25]\n",
      " [0.25]\n",
      " [0.25]]\n"
     ]
    }
   ],
   "source": [
    "# Initialize rank vector\n",
    "r = (1.0 + np.zeros([len(M), 1])) / len(M)\n",
    "print(\"Initial rank vector:\")\n",
    "print(r)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "f540571b-5fd7-4ced-a8a5-7daeb4625f18",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Calculate the uniform rank contribution\n",
    "uniformR = (1.0 - beta) * r\n",
    "r_prev = r.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "b0d7f809-f901-4bf0-9676-ea4ea976a33a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Iteration:  1\n",
      "The rank vector: \n",
      "[[0.25      ]\n",
      " [0.10833333]\n",
      " [0.21458333]\n",
      " [0.42708333]]\n",
      "Iteration:  2\n",
      "The rank vector: \n",
      "[[0.21989583]\n",
      " [0.10833333]\n",
      " [0.154375  ]\n",
      " [0.51739583]]\n",
      "Iteration:  3\n",
      "The rank vector: \n",
      "[[0.16871875]\n",
      " [0.09980382]\n",
      " [0.14584549]\n",
      " [0.58563194]]\n",
      "Iteration:  4\n",
      "The rank vector: \n",
      "[[0.16146866]\n",
      " [0.08530365]\n",
      " [0.12772027]\n",
      " [0.62550742]]\n",
      "Iteration:  5\n",
      "The rank vector: \n",
      "[[0.14606223]\n",
      " [0.08324945]\n",
      " [0.1195035 ]\n",
      " [0.65118481]]\n",
      "Iteration:  6\n",
      "The rank vector: \n",
      "[[0.13907798]\n",
      " [0.0788843 ]\n",
      " [0.11426532]\n",
      " [0.66777241]]\n",
      "Iteration:  7\n",
      "The rank vector: \n",
      "[[0.13462552]\n",
      " [0.07690543]\n",
      " [0.11043125]\n",
      " [0.6780378 ]]\n",
      "Iteration:  8\n",
      "The rank vector: \n",
      "[[0.13136657]\n",
      " [0.0756439 ]\n",
      " [0.1083287 ]\n",
      " [0.68466083]]\n",
      "Iteration:  9\n",
      "The rank vector: \n",
      "[[0.1295794 ]\n",
      " [0.07472053]\n",
      " [0.10686918]\n",
      " [0.68883089]]\n",
      "Iteration:  10\n",
      "The rank vector: \n",
      "[[0.12833881]\n",
      " [0.07421416]\n",
      " [0.10597039]\n",
      " [0.69147664]]\n",
      "Iteration:  11\n",
      "The rank vector: \n",
      "[[0.12757483]\n",
      " [0.07386266]\n",
      " [0.10540368]\n",
      " [0.69315883]]\n",
      "Iteration:  12\n",
      "The rank vector: \n",
      "[[0.12709313]\n",
      " [0.0736462 ]\n",
      " [0.10503783]\n",
      " [0.69422284]]\n",
      "Iteration:  13\n",
      "The rank vector: \n",
      "[[0.12678216]\n",
      " [0.07350972]\n",
      " [0.10480936]\n",
      " [0.69489877]]\n",
      "Iteration:  14\n",
      "The rank vector: \n",
      "[[0.12658795]\n",
      " [0.07342161]\n",
      " [0.10466324]\n",
      " [0.69532719]]\n",
      "Iteration:  15\n",
      "The rank vector: \n",
      "[[0.12646376]\n",
      " [0.07336659]\n",
      " [0.10457077]\n",
      " [0.69559889]]\n",
      "Iteration:  16\n",
      "The rank vector: \n",
      "[[0.12638516]\n",
      " [0.0733314 ]\n",
      " [0.1045122 ]\n",
      " [0.69577125]]\n",
      "Iteration:  17\n",
      "The rank vector: \n",
      "[[0.12633537]\n",
      " [0.07330913]\n",
      " [0.10447497]\n",
      " [0.69588053]]\n",
      "Iteration:  18\n",
      "The rank vector: \n",
      "[[0.12630373]\n",
      " [0.07329502]\n",
      " [0.1044514 ]\n",
      " [0.69594985]]\n",
      "Iteration:  19\n",
      "The rank vector: \n",
      "[[0.12628369]\n",
      " [0.07328606]\n",
      " [0.10443644]\n",
      " [0.69599382]]\n",
      "Iteration:  20\n",
      "The rank vector: \n",
      "[[0.12627097]\n",
      " [0.07328038]\n",
      " [0.10442695]\n",
      " [0.6960217 ]]\n",
      "Iteration:  21\n",
      "The rank vector: \n",
      "[[0.12626291]\n",
      " [0.07327678]\n",
      " [0.10442094]\n",
      " [0.69603938]]\n",
      "Iteration:  22\n",
      "The rank vector: \n",
      "[[0.1262578 ]\n",
      " [0.07327449]\n",
      " [0.10441712]\n",
      " [0.69605059]]\n",
      "Iteration:  23\n",
      "The rank vector: \n",
      "[[0.12625455]\n",
      " [0.07327304]\n",
      " [0.1044147 ]\n",
      " [0.6960577 ]]\n",
      "Iteration:  24\n",
      "The rank vector: \n",
      "[[0.1262525 ]\n",
      " [0.07327212]\n",
      " [0.10441317]\n",
      " [0.69606221]]\n",
      "Iteration:  25\n",
      "The rank vector: \n",
      "[[0.12625119]\n",
      " [0.07327154]\n",
      " [0.10441219]\n",
      " [0.69606508]]\n",
      "Iteration:  26\n",
      "The rank vector: \n",
      "[[0.12625036]\n",
      " [0.07327117]\n",
      " [0.10441158]\n",
      " [0.69606689]]\n",
      "Iteration:  27\n",
      "The rank vector: \n",
      "[[0.12624984]\n",
      " [0.07327094]\n",
      " [0.10441118]\n",
      " [0.69606804]]\n",
      "Iteration:  28\n",
      "The rank vector: \n",
      "[[0.12624951]\n",
      " [0.07327079]\n",
      " [0.10441094]\n",
      " [0.69606877]]\n",
      "Iteration:  29\n",
      "The rank vector: \n",
      "[[0.1262493 ]\n",
      " [0.07327069]\n",
      " [0.10441078]\n",
      " [0.69606923]]\n",
      "Iteration:  30\n",
      "The rank vector: \n",
      "[[0.12624916]\n",
      " [0.07327063]\n",
      " [0.10441068]\n",
      " [0.69606953]]\n",
      "Iteration:  31\n",
      "The rank vector: \n",
      "[[0.12624908]\n",
      " [0.0732706 ]\n",
      " [0.10441062]\n",
      " [0.69606971]]\n",
      "Iteration:  32\n",
      "The rank vector: \n",
      "[[0.12624902]\n",
      " [0.07327057]\n",
      " [0.10441057]\n",
      " [0.69606983]]\n",
      "Iteration:  33\n",
      "The rank vector: \n",
      "[[0.12624899]\n",
      " [0.07327056]\n",
      " [0.10441055]\n",
      " [0.69606991]]\n",
      "Iteration:  34\n",
      "The rank vector: \n",
      "[[0.12624897]\n",
      " [0.07327055]\n",
      " [0.10441053]\n",
      " [0.69606995]]\n",
      "Iteration:  35\n",
      "The rank vector: \n",
      "[[0.12624895]\n",
      " [0.07327054]\n",
      " [0.10441052]\n",
      " [0.69606998]]\n",
      "Iteration:  36\n",
      "The rank vector: \n",
      "[[0.12624894]\n",
      " [0.07327054]\n",
      " [0.10441052]\n",
      " [0.69607   ]]\n",
      "Iteration:  37\n",
      "The rank vector: \n",
      "[[0.12624894]\n",
      " [0.07327053]\n",
      " [0.10441051]\n",
      " [0.69607001]]\n",
      "Iteration:  38\n",
      "The rank vector: \n",
      "[[0.12624894]\n",
      " [0.07327053]\n",
      " [0.10441051]\n",
      " [0.69607002]]\n",
      "Iteration:  39\n",
      "The rank vector: \n",
      "[[0.12624893]\n",
      " [0.07327053]\n",
      " [0.10441051]\n",
      " [0.69607003]]\n",
      "Iteration:  40\n",
      "The rank vector: \n",
      "[[0.12624893]\n",
      " [0.07327053]\n",
      " [0.10441051]\n",
      " [0.69607003]]\n",
      "Iteration:  41\n",
      "The rank vector: \n",
      "[[0.12624893]\n",
      " [0.07327053]\n",
      " [0.10441051]\n",
      " [0.69607003]]\n",
      "Iteration:  42\n",
      "The rank vector: \n",
      "[[0.12624893]\n",
      " [0.07327053]\n",
      " [0.10441051]\n",
      " [0.69607003]]\n",
      "Iteration:  43\n",
      "The rank vector: \n",
      "[[0.12624893]\n",
      " [0.07327053]\n",
      " [0.10441051]\n",
      " [0.69607003]]\n",
      "Iteration:  44\n",
      "The rank vector: \n",
      "[[0.12624893]\n",
      " [0.07327053]\n",
      " [0.10441051]\n",
      " [0.69607003]]\n",
      "Iteration:  45\n",
      "The rank vector: \n",
      "[[0.12624893]\n",
      " [0.07327053]\n",
      " [0.10441051]\n",
      " [0.69607003]]\n",
      "Iteration:  46\n",
      "The rank vector: \n",
      "[[0.12624893]\n",
      " [0.07327053]\n",
      " [0.10441051]\n",
      " [0.69607003]]\n",
      "Iteration:  47\n",
      "The rank vector: \n",
      "[[0.12624893]\n",
      " [0.07327053]\n",
      " [0.10441051]\n",
      " [0.69607003]]\n",
      "Iteration:  48\n",
      "The rank vector: \n",
      "[[0.12624893]\n",
      " [0.07327053]\n",
      " [0.10441051]\n",
      " [0.69607004]]\n",
      "Iteration:  49\n",
      "The rank vector: \n",
      "[[0.12624893]\n",
      " [0.07327053]\n",
      " [0.10441051]\n",
      " [0.69607004]]\n",
      "Iteration:  50\n",
      "The rank vector: \n",
      "[[0.12624893]\n",
      " [0.07327053]\n",
      " [0.10441051]\n",
      " [0.69607004]]\n",
      "Iteration:  51\n",
      "The rank vector: \n",
      "[[0.12624893]\n",
      " [0.07327053]\n",
      " [0.10441051]\n",
      " [0.69607004]]\n",
      "Iteration:  52\n",
      "The rank vector: \n",
      "[[0.12624893]\n",
      " [0.07327053]\n",
      " [0.10441051]\n",
      " [0.69607004]]\n",
      "Iteration:  53\n",
      "The rank vector: \n",
      "[[0.12624893]\n",
      " [0.07327053]\n",
      " [0.10441051]\n",
      " [0.69607004]]\n",
      "Iteration:  54\n",
      "The rank vector: \n",
      "[[0.12624893]\n",
      " [0.07327053]\n",
      " [0.10441051]\n",
      " [0.69607004]]\n",
      "Iteration:  55\n",
      "The rank vector: \n",
      "[[0.12624893]\n",
      " [0.07327053]\n",
      " [0.10441051]\n",
      " [0.69607004]]\n",
      "Iteration:  56\n",
      "The rank vector: \n",
      "[[0.12624893]\n",
      " [0.07327053]\n",
      " [0.10441051]\n",
      " [0.69607004]]\n",
      "Iteration:  57\n",
      "The rank vector: \n",
      "[[0.12624893]\n",
      " [0.07327053]\n",
      " [0.10441051]\n",
      " [0.69607004]]\n",
      "Iteration:  58\n",
      "The rank vector: \n",
      "[[0.12624893]\n",
      " [0.07327053]\n",
      " [0.10441051]\n",
      " [0.69607004]]\n",
      "Iteration:  59\n",
      "The rank vector: \n",
      "[[0.12624893]\n",
      " [0.07327053]\n",
      " [0.10441051]\n",
      " [0.69607004]]\n",
      "Iteration:  60\n",
      "The rank vector: \n",
      "[[0.12624893]\n",
      " [0.07327053]\n",
      " [0.10441051]\n",
      " [0.69607004]]\n",
      "Iteration:  61\n",
      "The rank vector: \n",
      "[[0.12624893]\n",
      " [0.07327053]\n",
      " [0.10441051]\n",
      " [0.69607004]]\n",
      "Iteration:  62\n",
      "The rank vector: \n",
      "[[0.12624893]\n",
      " [0.07327053]\n",
      " [0.10441051]\n",
      " [0.69607004]]\n",
      "Iteration:  63\n",
      "The rank vector: \n",
      "[[0.12624893]\n",
      " [0.07327053]\n",
      " [0.10441051]\n",
      " [0.69607004]]\n",
      "Iteration:  64\n",
      "The rank vector: \n",
      "[[0.12624893]\n",
      " [0.07327053]\n",
      " [0.10441051]\n",
      " [0.69607004]]\n",
      "Iteration:  65\n",
      "The rank vector: \n",
      "[[0.12624893]\n",
      " [0.07327053]\n",
      " [0.10441051]\n",
      " [0.69607004]]\n"
     ]
    }
   ],
   "source": [
    "# PageRank iterations\n",
    "for i in range(1, 1001):\n",
    "    print(\"Iteration: \", i)\n",
    "    r = beta * np.matmul(M, r_prev) + uniformR\n",
    "    print(\"The rank vector: \")\n",
    "    print(r)\n",
    "\n",
    "    diff = np.sum(abs(r - r_prev))\n",
    "    if diff < threshold:\n",
    "        break\n",
    "    r_prev = r.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "9fddbce3-0f30-4912-bfaa-f71a2d00d385",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The final rank vector: \n",
      "[0.12624893 0.07327053 0.10441051 0.69607004]\n"
     ]
    }
   ],
   "source": [
    "# Display the final rank vector\n",
    "print(\"The final rank vector: \")\n",
    "print(r[:, 0])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bcbaa397-957c-4e79-b68a-e2070ee11baf",
   "metadata": {},
   "source": [
    "---"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.20"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }
@@ -0,0 +1,144 @@
 {
 "cells": [
  {
   "cell_type": "markdown",
   "id": "ca2da52f-4a43-4db5-bf5d-54bd3506f81e",
   "metadata": {},
   "source": [
    "# Code-1.3\n",
    "\n",
    "Problem Statement: Write a program for Pre-processing of a Text Document: stop word removal.\n",
    "\n",
    "Code from InformationRetrieval (SPPU - Final Year - Computer Engineering - Content) repository on KSKA Git: https://git.kska.io/sppu-be-comp-content/InformationRetrieval/\n",
    "\n",
    "---"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "f9085aa3-6fc3-432c-8a96-5e6dcb89a900",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import libraries\n",
    "import nltk\n",
    "from nltk.corpus import stopwords\n",
    "from nltk.tokenize import word_tokenize"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "81c78019-0857-4e4a-8235-8d2db97de214",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package punkt to /home/nonroot/nltk_data...\n",
      "[nltk_data]   Package punkt is already up-to-date!\n",
      "[nltk_data] Downloading package stopwords to\n",
      "[nltk_data]     /home/nonroot/nltk_data...\n",
      "[nltk_data]   Package stopwords is already up-to-date!\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Download NLTK Resources\n",
    "nltk.download('punkt')\n",
    "nltk.download('stopwords')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "58c711bf-c052-4314-8103-5f6ce43d41c0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Stop word removal function\n",
    "def remove_stop_words(text):\n",
    "    # Tokenizing the text into words\n",
    "    words = word_tokenize(text)\n",
    "    \n",
    "    # Defining the English stop words\n",
    "    stop_words = set(stopwords.words('english'))\n",
    "    \n",
    "    # Removing stop words from the text\n",
    "    filtered_words = [word for word in words if word.lower() not in stop_words]\n",
    "    \n",
    "    return ' '.join(filtered_words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "fb409348-1737-48ac-baad-7a9024914b57",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Original Text:\n",
      "This is an example of a text document that needs stop word removal\n",
      "\n",
      "Preprocessed Text:\n",
      "example text document needs stop word removal\n"
     ]
    }
   ],
   "source": [
    "# Main function\n",
    "if __name__ == \"__main__\":\n",
    "    input_text = \"This is an example of a text document that needs stop word removal\"\n",
    "    preprocessed_text = remove_stop_words(input_text)\n",
    "    print(\"Original Text:\")\n",
    "    print(input_text)\n",
    "    print(\"\\nPreprocessed Text:\")\n",
    "    print(preprocessed_text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "54566bef-20a0-494b-9299-500417834bfd",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.20"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }
@@ -12,6 +12,18 @@ This repository contains essential resources for the Information Retrieval cours
 ### Codes
 1. [Code-1.1 (Document Similarity)](Codes/Code-1.1.py)
 2. [Code-1.2 (Page Rank Algorithm)](Codes/Code-1.2.py)
 3. [Code-1.3 (Stopword Removal)](Codes/Code-1.3.py)
 4. [Code-1.4 (Hadoop)](Codes/Code-1.4/)
 5. [Code-1.5 (Simple Web Crawler)](Codes/Code-1.5.py)
 ### Notebooks
 1. [Code-1.1 (Document Similarity)](Notebooks/Code-1.1/)
 2. [Code-1.2 (Page Rank Algorithm)](Notebooks/Code-1.2%20%28Page%20Rank%20Algorithm%29.ipynb)
 3. [Code-1.3 (Stopword Removal)](Notebooks/Code-1.3%20%28Stopword%20Removal%29.ipynb)
 ### Practical
 1. [Practical-1.1](Practical/Practical-1.1/)
@@ -25,6 +37,7 @@ This repository contains essential resources for the Information Retrieval cours
 - [END-SEM](Question%20Papers/END-SEM)
 ### [IN-SEM PYQ Answers](Notes/IN-SEM%20PYQ%20Answers)
 ### [END-SEM PYQ Answers](Notes/END-SEM%20PYQ%20Answers)
 ---
Author	SHA1	Message	Date
notkshitij	cca0f46476	Upload end-sem pyq for IR, november-december 2025. Provided by Ayush Kalaskar.	2026-03-22 02:12:40 +05:30
notkshitij	1190444a91	Added may june 2025 IR pyq. Provided by Afan Shaikh.	2025-12-07 22:57:13 +05:30
notkshitij	4538044099	Added end-sem pyq answers for unit 6. Collaborative work by Ayush Kalaskar and Himanshu Patil.	2025-12-02 23:40:00 +05:30
notkshitij	e635221d58	Added end-sem pyq answers for unit 5. Collaborative work by Ayush Kalaskar and Himanshu Patil.	2025-12-02 23:15:51 +05:30
notkshitij	29826dad3d	Added end-sem pyq answers link.	2025-12-02 15:10:31 +05:30
notkshitij	5283defd66	Added end-sem pyq answers for unit 4. Collaborative work by Ayush Kalaskar and Himanshu Patil.	2025-12-02 15:09:54 +05:30
notkshitij	1f2742dcec	Added end-sem pyq answers. Collaborative work by Ayush Kalaskar and Himanshu Patil.	2025-12-02 14:27:13 +05:30
notkshitij	c1f113c632	Added softcopy for hadoop practical.	2025-10-12 23:47:31 +05:30
notkshitij	658a087c64	Added code for hadoop.	2025-10-12 23:38:15 +05:30
notkshitij	9038747b35	Added links to all the notebooks	2025-10-12 22:57:35 +05:30
notkshitij	68587b84e0	Added jupyter notebooks for 1.1, 1.2, 1.3.	2025-10-12 22:55:54 +05:30
notkshitij	a4f5326402	Added links to all the codes in readme	2025-10-12 22:54:43 +05:30
notkshitij	4c84c01a65	Added codes 1.1, 1.2, 1.3 and 1.5	2025-10-12 22:51:57 +05:30
notkshitij	0ac0a2859b	Added handout and write-up for practical 1.4. Write up by Salvi.	2025-10-12 22:51:30 +05:30
notkshitij	3a5cbc4b22	Added softcopies for practical 1.1, 1.2, 1.3 and 1.5	2025-10-12 22:36:12 +05:30
notkshitij	7ea5113352	Added write-up for practical 1.5.	2025-09-29 23:18:28 +05:30
notkshitij	69799f1c1b	Added formatted handouts for all practical.	2025-09-29 23:18:05 +05:30
		`@@ -0,0 +1,2 @@`
							`This is a sample document. It contains text for testing the similarity.`
		`@@ -0,0 +1,2 @@`
							`This document is a sample. It includes text to test the similarity.`