Compare commits
17 Commits
eac1ded89f
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
|
cca0f46476
|
|||
|
1190444a91
|
|||
|
4538044099
|
|||
|
e635221d58
|
|||
|
29826dad3d
|
|||
|
5283defd66
|
|||
|
1f2742dcec
|
|||
|
c1f113c632
|
|||
|
658a087c64
|
|||
|
9038747b35
|
|||
|
68587b84e0
|
|||
|
a4f5326402
|
|||
|
4c84c01a65
|
|||
|
0ac0a2859b
|
|||
|
3a5cbc4b22
|
|||
|
7ea5113352
|
|||
|
69799f1c1b
|
@@ -0,0 +1,66 @@
|
|||||||
|
from nltk.corpus import stopwords
|
||||||
|
from nltk.tokenize import word_tokenize
|
||||||
|
import numpy as np
|
||||||
|
import nltk
|
||||||
|
|
||||||
|
# Download the NLTK data this script relies on. Recent NLTK releases load the
# word tokenizer models from "punkt_tab" instead of "punkt", so both are
# requested to keep word_tokenize working across versions.
for resource in ("punkt", "punkt_tab", "stopwords"):
    nltk.download(resource)
|
||||||
|
|
||||||
|
def process(file):
    """Build a table of stemmed word counts for a text file.

    The text is tokenized with ``word_tokenize``, lower-cased, stripped of
    English stop words, and Porter-stemmed; every remaining stem is counted.

    Args:
        file: Path of the text file to read.

    Returns:
        A ``defaultdict(int)`` mapping each stem to its number of occurrences.
    """
    from collections import defaultdict

    # Use a context manager so the file handle is closed as soon as it is read.
    with open(file) as handle:
        raw = handle.read()

    # Tokenize the raw text and normalise case
    words = [token.lower() for token in word_tokenize(raw)]

    # Drop stop words *before* stemming: the stemmer can rewrite a stop word
    # (for example "this" is typically reduced to "thi"), after which it would
    # no longer match the stop-word list and would be counted as content.
    stop_words = set(stopwords.words('english'))
    content_words = [word for word in words if word not in stop_words]

    # Stem the remaining tokens and count them
    porter = nltk.PorterStemmer()
    count = defaultdict(int)
    for word in content_words:
        count[porter.stem(word)] += 1

    return count
|
||||||
|
|
||||||
|
def cos_sim(a, b):
    """Return the cosine similarity of two vectors.

    Args:
        a: First vector (anything accepted by ``np.dot`` / ``np.linalg.norm``).
        b: Second vector, with the same length as ``a``.

    Returns:
        ``dot(a, b) / (norm(a) * norm(b))``, or ``0.0`` when either vector has
        zero norm (the ratio is undefined there and would otherwise be NaN).
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)

    # A zero-length or all-zero vector has no direction; report "no similarity"
    # instead of dividing by zero.
    if denominator == 0:
        return 0.0

    return np.dot(a, b) / denominator
|
||||||
|
|
||||||
|
def getSimilarity(dict1, dict2):
    """Return the cosine similarity of two word-count dictionaries.

    Both dictionaries are projected onto a shared vocabulary (the union of
    their keys, each word appearing exactly once) and the resulting count
    vectors are compared with ``cos_sim``.

    Args:
        dict1: Mapping of word -> count for the first document.
        dict2: Mapping of word -> count for the second document.

    Returns:
        The value of ``cos_sim`` for the two count vectors.
    """
    # Collect every word once, keeping first-seen order. Listing the keys of
    # both dictionaries back to back would repeat shared words and give them
    # double weight in the comparison.
    vocabulary = list(dict.fromkeys([*dict1, *dict2]))

    # Create vectors for the dictionaries; words missing from one document
    # count as 0. The builtin float is used because the np.int alias no longer
    # exists in current NumPy releases.
    v1 = np.array([dict1.get(word, 0) for word in vocabulary], dtype=float)
    v2 = np.array([dict2.get(word, 0) for word in vocabulary], dtype=float)

    return cos_sim(v1, v2)
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Build the word-count table of each document (text1 first, then text2)
    # and report how similar the two tables are.
    word_counts = [process(path) for path in ("text1.txt", "text2.txt")]

    print("Similarity between two text documents:", getSimilarity(*word_counts))
|
||||||
|
|
||||||
@@ -0,0 +1,57 @@
|
|||||||
|
import numpy as np
|
||||||
|
|
||||||
|
# PageRank parameters: convergence tolerance and damping factor
threshold = 1e-13
beta = 0.85

# Example spider-trap network, given as an adjacency matrix
A = [
    [0, 0, 1, 0],
    [1, 0, 0, 0],
    [1, 1, 0, 0],
    [1, 1, 0, 1],
]

# Floating-point copy of the adjacency matrix
arr = np.array(A, dtype=float)

# Sum of every column of the adjacency matrix
s = [np.sum(arr[:, i]) for i in range(len(A))]

print("Summation of columns: ", s)

# Column stochastic probability matrix M: each non-empty column is divided by
# its own sum; all-zero columns are skipped to avoid dividing by zero.
M = arr.copy()
for j, column_total in enumerate(s):
    if column_total == 0:
        continue
    M[:, j] = M[:, j] / column_total

print("Column stochastic probability matrix, M:")
print(M)

# Initial rank vector: a column vector with equal rank for every node
node_count = len(M)
r = np.ones((node_count, 1)) / node_count
print("Initial rank vector:")
print(r)

# Constant term added on every iteration, scaled by (1 - beta)
uniformR = (1.0 - beta) * r
r_prev = r.copy()

# Iterate until the total absolute change between two successive rank vectors
# falls below the threshold, or 1000 iterations have been run.
for i in range(1, 1001):
    print("Iteration: ", i)
    r = beta * (M @ r_prev) + uniformR
    print("The rank vector: ")
    print(r)

    diff = np.abs(r - r_prev).sum()
    if diff < threshold:
        break
    r_prev = r.copy()

# Show the final ranks as a flat vector
print("The final rank vector: ")
print(r[:, 0])
|
||||||
@@ -0,0 +1,30 @@
|
|||||||
|
# Import libraries
|
||||||
|
import nltk
|
||||||
|
from nltk.corpus import stopwords
|
||||||
|
from nltk.tokenize import word_tokenize
|
||||||
|
|
||||||
|
# Ensure the necessary NLTK resources are downloaded. Recent NLTK releases
# load the word tokenizer models from "punkt_tab" instead of "punkt", so both
# are requested to keep word_tokenize working across versions.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)
|
||||||
|
|
||||||
|
def remove_stop_words(text):
    """Return ``text`` with English stop words removed.

    The text is split with ``word_tokenize``; a token is dropped when its
    lower-cased form is in NLTK's English stop-word list. Surviving tokens
    keep their original casing and are joined with single spaces.

    Args:
        text: The string to filter.

    Returns:
        The remaining tokens joined by one space each.
    """
    tokens = word_tokenize(text)
    english_stops = set(stopwords.words('english'))

    kept = []
    for token in tokens:
        if token.lower() in english_stops:
            continue
        kept.append(token)

    return ' '.join(kept)
|
||||||
|
|
||||||
|
# Example usage
|
||||||
|
if __name__ == "__main__":
    # Filter a sample sentence, then show it before and after filtering.
    input_text = "This is an example of a text document that needs stop word removal."
    preprocessed_text = remove_stop_words(input_text)

    print("Original Text:", input_text, sep="\n")
    print("\nPreprocessed Text:", preprocessed_text, sep="\n")
|
||||||
|
|
||||||
@@ -0,0 +1,31 @@
|
|||||||
|
import org.apache.hadoop.conf.Configuration;
|
||||||
|
import org.apache.hadoop.fs.Path;
|
||||||
|
import org.apache.hadoop.io.IntWritable;
|
||||||
|
import org.apache.hadoop.io.Text;
|
||||||
|
import org.apache.hadoop.mapreduce.Job;
|
||||||
|
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
|
||||||
|
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
|
||||||
|
|
||||||
|
public class CharacterCountDriver {
|
||||||
|
public static void main(String[] args) throws Exception {
|
||||||
|
if (args.length != 2) {
|
||||||
|
System.err.println("Usage: CharacterCountDriver <input path> <output path>");
|
||||||
|
System.exit(-1);
|
||||||
|
}
|
||||||
|
|
||||||
|
Configuration conf = new Configuration();
|
||||||
|
Job job = Job.getInstance(conf, "Character Count");
|
||||||
|
job.setJarByClass(CharacterCountDriver.class);
|
||||||
|
job.setMapperClass(CharacterCountMapper.class);
|
||||||
|
job.setCombinerClass(CharacterCountReducer.class);
|
||||||
|
job.setReducerClass(CharacterCountReducer.class);
|
||||||
|
job.setOutputKeyClass(Text.class);
|
||||||
|
job.setOutputValueClass(IntWritable.class);
|
||||||
|
|
||||||
|
FileInputFormat.addInputPath(job, new Path(args[0]));
|
||||||
|
FileOutputFormat.setOutputPath(job, new Path(args[1]));
|
||||||
|
|
||||||
|
System.exit(job.waitForCompletion(true) ? 0 : 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
@@ -0,0 +1,21 @@
|
|||||||
|
import java.io.IOException;
|
||||||
|
import org.apache.hadoop.io.IntWritable;
|
||||||
|
import org.apache.hadoop.io.Text;
|
||||||
|
import org.apache.hadoop.mapreduce.Mapper;
|
||||||
|
|
||||||
|
public class CharacterCountMapper extends Mapper<Object, Text, Text, IntWritable> {
|
||||||
|
private final static IntWritable one = new IntWritable(1);
|
||||||
|
private Text character = new Text();
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected void map(Object key, Text value, Context context) throws IOException, InterruptedException {
|
||||||
|
String line = value.toString().toLowerCase();
|
||||||
|
for (char c : line.toCharArray()) {
|
||||||
|
if (Character.isAlphabetic(c)) {
|
||||||
|
character.set(String.valueOf(c));
|
||||||
|
context.write(character, one);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
@@ -0,0 +1,19 @@
|
|||||||
|
import java.io.IOException;
|
||||||
|
import org.apache.hadoop.io.IntWritable;
|
||||||
|
import org.apache.hadoop.io.Text;
|
||||||
|
import org.apache.hadoop.mapreduce.Reducer;
|
||||||
|
|
||||||
|
public class CharacterCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
|
||||||
|
private IntWritable result = new IntWritable();
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
|
||||||
|
int count = 0;
|
||||||
|
for (IntWritable val : values) {
|
||||||
|
count += val.get();
|
||||||
|
}
|
||||||
|
result.set(count);
|
||||||
|
context.write(key, result);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
@@ -0,0 +1,74 @@
|
|||||||
|
### List of Commands
|
||||||
|
|
||||||
|
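1. **Create a Directory for Your Project** (then copy `CharacterCountMapper.java`, `CharacterCountReducer.java`, and `CharacterCountDriver.java` into it):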
|
||||||
|
|
||||||
|
```bash
|
||||||
|
mkdir ~/hadoop_char_count
|
||||||
|
cd ~/hadoop_char_count
|
||||||
|
```
|
||||||
|
|
||||||
|
2. **Compile the Java Files**:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
javac -classpath $(hadoop classpath) -d . CharacterCountMapper.java CharacterCountReducer.java CharacterCountDriver.java
|
||||||
|
```
|
||||||
|
|
||||||
|
3. **Create the JAR File**:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
jar cvf CharacterCount.jar *.class
|
||||||
|
```
|
||||||
|
|
||||||
|
4. **Create Input Directory in HDFS** (if needed):
|
||||||
|
|
||||||
|
```bash
|
||||||
|
hdfs dfs -mkdir -p /user/hduser/input
|
||||||
|
```
|
||||||
|
|
||||||
|
5. **Upload Input File to HDFS**:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
hdfs dfs -put /path/to/your/local/input.txt /user/hduser/input/
|
||||||
|
```
|
||||||
|
|
||||||
|
6. **Run the MapReduce Job**:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
hadoop jar CharacterCount.jar CharacterCountDriver /user/hduser/input /user/hduser/output
|
||||||
|
```
|
||||||
|
|
||||||
|
7. **Remove Existing Output Directory** (needed before re-running the job from step 6, because the job fails if the output directory already exists):
|
||||||
|
|
||||||
|
```bash
|
||||||
|
hdfs dfs -rm -r /user/hduser/output
|
||||||
|
```
|
||||||
|
|
||||||
|
8. **List Contents of the Output Directory**:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
hdfs dfs -ls /user/hduser/output
|
||||||
|
```
|
||||||
|
|
||||||
|
9. **View the Output File**:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
hdfs dfs -cat /user/hduser/output/part-r-00000
|
||||||
|
```
|
||||||
|
|
||||||
|
10. **View Output with `more` or `less`**:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
hdfs dfs -cat /user/hduser/output/part-r-00000 | more
|
||||||
|
```
|
||||||
|
or
|
||||||
|
```bash
|
||||||
|
hdfs dfs -cat /user/hduser/output/part-r-00000 | less
|
||||||
|
```
|
||||||
|
|
||||||
|
11. **Copy Output to Local File System (Optional)**:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
hdfs dfs -get /user/hduser/output/part-r-00000 /path/to/local/directory/
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
@@ -0,0 +1,35 @@
|
|||||||
|
# pip install requests beautifulsoup4
|
||||||
|
|
||||||
|
import requests
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import time
|
||||||
|
|
||||||
|
def crawl(url, depth, visited=None, timeout=10):
    """Recursively crawl ``url`` and the absolute links found on it.

    Prints ``Crawling: <url>`` for every page fetched successfully and a
    ``Failed to retrieve`` message for every page that could not be fetched.

    Args:
        url: Address of the page to fetch.
        depth: How many link levels to follow; nothing is fetched when this
            is zero or negative.
        visited: Optional dict mapping each URL already handled to the largest
            ``depth`` it was crawled with. It is shared across the recursion so
            a page is not fetched again unless it is reached with more
            remaining depth than before. A fresh dict is created when omitted.
        timeout: Seconds to wait for the server before giving up on a request.

    Returns:
        None.
    """
    # "<= 0" rather than "== 0": a negative depth would otherwise count down
    # forever and never stop the recursion.
    if depth <= 0:
        return

    if visited is None:
        visited = {}

    # Skip pages already crawled at least this deeply.
    if visited.get(url, 0) >= depth:
        return
    visited[url] = depth

    try:
        # Without a timeout an unresponsive server would block the crawl forever.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Check for HTTP errors
    except requests.RequestException as e:
        print(f"Failed to retrieve {url}: {e}")
        return

    soup = BeautifulSoup(response.text, 'html.parser')
    print(f"Crawling: {url}")

    # Find all absolute links in the HTML (a set removes duplicates on the page)
    links = set()
    for anchor in soup.find_all('a', href=True):
        href = anchor['href']
        if href.startswith('http'):
            links.add(href)

    # Recursively crawl each link
    for link in links:
        # Nothing would be fetched for this link (no depth left, or already
        # crawled deeply enough), so do not pay the politeness delay for it.
        if visited.get(link, 0) >= depth - 1:
            continue
        time.sleep(1)  # Be polite and avoid overwhelming the server
        crawl(link, depth - 1, visited, timeout)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Ask for the starting URL first, then for the number of link levels to
    # follow, and start crawling from there.
    seed_url = input("Enter the URL to crawl: ")
    max_depth = int(input("Enter the crawl depth: "))
    crawl(seed_url, max_depth)
|
||||||
@@ -0,0 +1,230 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "946b7d7c-1e3a-4421-83ac-48c77a022c18",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Practical-1.1\n",
|
||||||
|
"\n",
|
||||||
|
"Problem Statement: Write a program to Compute Similarity between two text documents.\n",
|
||||||
|
"\n",
|
||||||
|
"Code from InformationRetrieval (SPPU - Final Year - Computer Engineering - Content) repository on KSKA Git: https://git.kska.io/sppu-be-comp-content/InformationRetrieval/\n",
|
||||||
|
"\n",
|
||||||
|
"---"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 29,
|
||||||
|
"id": "efe12052-a191-4760-9a75-a08d82b3d334",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Import libraries\n",
|
||||||
|
"from nltk.corpus import stopwords\n",
|
||||||
|
"from nltk.tokenize import word_tokenize\n",
|
||||||
|
"import numpy as np\n",
|
||||||
|
"import nltk"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 21,
|
||||||
|
"id": "c8efc1cd-5732-4853-8c92-a03b92ccb9af",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"[nltk_data] Downloading package punkt to /home/nonroot/nltk_data...\n",
|
||||||
|
"[nltk_data] Package punkt is already up-to-date!\n",
|
||||||
|
"[nltk_data] Downloading package stopwords to\n",
|
||||||
|
"[nltk_data] /home/nonroot/nltk_data...\n",
|
||||||
|
"[nltk_data] Package stopwords is already up-to-date!\n",
|
||||||
|
"[nltk_data] Downloading package punkt_tab to\n",
|
||||||
|
"[nltk_data] /home/nonroot/nltk_data...\n",
|
||||||
|
"[nltk_data] Package punkt_tab is already up-to-date!\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"True"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 21,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Download necessary NLTK data\n",
|
||||||
|
"nltk.download(\"punkt\")\n",
|
||||||
|
"nltk.download(\"stopwords\")\n",
|
||||||
|
"nltk.download('punkt_tab')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 34,
|
||||||
|
"id": "fe3bdfe7-91bd-4fcc-96d8-57fcf173605c",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Contents of text1.txt:\n",
|
||||||
|
"This is a sample document. It contains text for testing the similarity.\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"Contents of text2.txt:\n",
|
||||||
|
"This document is a sample. It includes text to test the similarity.\n",
|
||||||
|
"\n",
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Print contents of the two documents\n",
|
||||||
|
"def print_file_content(file):\n",
|
||||||
|
" with open(file, 'r') as f:\n",
|
||||||
|
" content = f.read()\n",
|
||||||
|
" print(content)\n",
|
||||||
|
"\n",
|
||||||
|
"print(\"Contents of text1.txt:\")\n",
|
||||||
|
"print_file_content(\"text1.txt\")\n",
|
||||||
|
"print(\"Contents of text2.txt:\")\n",
|
||||||
|
"print_file_content(\"text2.txt\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 35,
|
||||||
|
"id": "d9e3ad9f-3b5b-4e2d-a62f-6dce24484392",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def process(file):\n",
|
||||||
|
" # Read the file\n",
|
||||||
|
" raw = open(file).read()\n",
|
||||||
|
" \n",
|
||||||
|
" # Tokenize the raw text\n",
|
||||||
|
" tokens = word_tokenize(raw)\n",
|
||||||
|
" words = [w.lower() for w in tokens]\n",
|
||||||
|
" \n",
|
||||||
|
" # Stem the tokens\n",
|
||||||
|
" porter = nltk.PorterStemmer()\n",
|
||||||
|
" stemmed_tokens = [porter.stem(t) for t in words]\n",
|
||||||
|
"\n",
|
||||||
|
" # Removing stop words\n",
|
||||||
|
" stop_words = set(stopwords.words('english'))\n",
|
||||||
|
" filtered_tokens = [w for w in stemmed_tokens if w not in stop_words]\n",
|
||||||
|
" \n",
|
||||||
|
" # Count words\n",
|
||||||
|
" count = nltk.defaultdict(int)\n",
|
||||||
|
" for word in filtered_tokens:\n",
|
||||||
|
" count[word] += 1\n",
|
||||||
|
" \n",
|
||||||
|
" return count"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 36,
|
||||||
|
"id": "56f17214-bc46-4eaf-aeed-ce387212c9b1",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def cos_sim(a, b):\n",
|
||||||
|
" dot_product = np.dot(a, b)\n",
|
||||||
|
" norm_a = np.linalg.norm(a)\n",
|
||||||
|
" norm_b = np.linalg.norm(b)\n",
|
||||||
|
" \n",
|
||||||
|
" return dot_product / (norm_a * norm_b)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 37,
|
||||||
|
"id": "6c1c3993-9909-4cb7-aaa5-a69714667afd",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def getSimilarity(dict1, dict2):\n",
|
||||||
|
" all_words_list = []\n",
|
||||||
|
" \n",
|
||||||
|
" # Collect all unique words from both dictionaries\n",
|
||||||
|
" for key in dict1:\n",
|
||||||
|
" all_words_list.append(key)\n",
|
||||||
|
" \n",
|
||||||
|
" for key in dict2:\n",
|
||||||
|
" all_words_list.append(key)\n",
|
||||||
|
" \n",
|
||||||
|
" all_words_list_size = len(all_words_list)\n",
|
||||||
|
" v1 = np.zeros(all_words_list_size, dtype=int) # Changed np.int to int\n",
|
||||||
|
" v2 = np.zeros(all_words_list_size, dtype=int) # Changed np.int to int\n",
|
||||||
|
" \n",
|
||||||
|
" # Create vectors for the dictionaries\n",
|
||||||
|
" for i, key in enumerate(all_words_list):\n",
|
||||||
|
" v1[i] = dict1.get(key, 0)\n",
|
||||||
|
" v2[i] = dict2.get(key, 0)\n",
|
||||||
|
" \n",
|
||||||
|
" return cos_sim(v1, v2)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 38,
|
||||||
|
"id": "13f1e8f5-c8a1-4415-8901-641aa0e2cb5b",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Similarity between two text documents: 0.9523809523809523\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"if __name__ == '__main__':\n",
|
||||||
|
" dict1 = process(\"text1.txt\")\n",
|
||||||
|
" dict2 = process(\"text2.txt\")\n",
|
||||||
|
" \n",
|
||||||
|
" print(\"Similarity between two text documents:\", getSimilarity(dict1, dict2))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "a32301be-d57c-4892-b0b3-094a05f61f9a",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"---"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.8.20"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
||||||
@@ -0,0 +1,2 @@
|
|||||||
|
This is a sample document. It contains text for testing the similarity.
|
||||||
|
|
||||||
@@ -0,0 +1,2 @@
|
|||||||
|
This document is a sample. It includes text to test the similarity.
|
||||||
|
|
||||||
@@ -0,0 +1,616 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "af1d39a1-915d-44e2-b06f-49777bfe4cf6",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Practical-1.2\n",
|
||||||
|
"\n",
|
||||||
|
"Problem Statement: Implement Page Rank Algorithm.\n",
|
||||||
|
"\n",
|
||||||
|
"Code from InformationRetrieval (SPPU - Final Year - Computer Engineering - Content) repository on KSKA Git: https://git.kska.io/sppu-be-comp-content/InformationRetrieval/\n",
|
||||||
|
"\n",
|
||||||
|
"---"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 12,
|
||||||
|
"id": "fcd4c298-e888-44ee-93d9-b9d3f3a9b05f",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Import libraries\n",
|
||||||
|
"import numpy as np"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 13,
|
||||||
|
"id": "6d446fd6-e2ab-46d4-b9ee-ea1baa3e0b76",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Constants for PageRank\n",
|
||||||
|
"threshold = 1e-13\n",
|
||||||
|
"beta = 0.85"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 14,
|
||||||
|
"id": "25966376-d37f-41ef-a1ca-adbdf5831bd3",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Spider Trap Network represented as adjacency matrix\n",
|
||||||
|
"A = [\n",
|
||||||
|
" [0, 0, 1, 0],\n",
|
||||||
|
" [1, 0, 0, 0],\n",
|
||||||
|
" [1, 1, 0, 0],\n",
|
||||||
|
" [1, 1, 0, 1]\n",
|
||||||
|
"]\n",
|
||||||
|
"\n",
|
||||||
|
"# Convert adjacency matrix to a numpy array\n",
|
||||||
|
"arr = np.array(A, dtype=float)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 15,
|
||||||
|
"id": "e9932efe-ba91-4bd8-9e1b-aa96ea1fbc5b",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Summation of columns: [3.0, 2.0, 1.0, 1.0]\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Calculate summation of columns\n",
|
||||||
|
"s = []\n",
|
||||||
|
"for i in range(len(A)):\n",
|
||||||
|
" s.append(np.sum(arr[:, i]))\n",
|
||||||
|
"\n",
|
||||||
|
"print(\"Summation of columns: \", s)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 16,
|
||||||
|
"id": "5f41e472-4f23-4a83-ac92-737581dd566c",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Column stochastic probability matrix, M:\n",
|
||||||
|
"[[0. 0. 1. 0. ]\n",
|
||||||
|
" [0.33333333 0. 0. 0. ]\n",
|
||||||
|
" [0.33333333 0.5 0. 0. ]\n",
|
||||||
|
" [0.33333333 0.5 0. 1. ]]\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Create the column stochastic probability matrix, M\n",
|
||||||
|
"M = arr.copy()\n",
|
||||||
|
"for j in range(len(A)):\n",
|
||||||
|
" if s[j] != 0: # Prevent division by zero\n",
|
||||||
|
" M[:, j] = M[:, j] / s[j]\n",
|
||||||
|
"\n",
|
||||||
|
"print(\"Column stochastic probability matrix, M:\")\n",
|
||||||
|
"print(M)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 17,
|
||||||
|
"id": "e0c63b43-1825-4edb-873b-bab9d2e2f3d3",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Initial rank vector:\n",
|
||||||
|
"[[0.25]\n",
|
||||||
|
" [0.25]\n",
|
||||||
|
" [0.25]\n",
|
||||||
|
" [0.25]]\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Initialize rank vector\n",
|
||||||
|
"r = (1.0 + np.zeros([len(M), 1])) / len(M)\n",
|
||||||
|
"print(\"Initial rank vector:\")\n",
|
||||||
|
"print(r)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 18,
|
||||||
|
"id": "f540571b-5fd7-4ced-a8a5-7daeb4625f18",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Calculate the uniform rank contribution\n",
|
||||||
|
"uniformR = (1.0 - beta) * r\n",
|
||||||
|
"r_prev = r.copy()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 19,
|
||||||
|
"id": "b0d7f809-f901-4bf0-9676-ea4ea976a33a",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Iteration: 1\n",
|
||||||
|
"The rank vector: \n",
|
||||||
|
"[[0.25 ]\n",
|
||||||
|
" [0.10833333]\n",
|
||||||
|
" [0.21458333]\n",
|
||||||
|
" [0.42708333]]\n",
|
||||||
|
"Iteration: 2\n",
|
||||||
|
"The rank vector: \n",
|
||||||
|
"[[0.21989583]\n",
|
||||||
|
" [0.10833333]\n",
|
||||||
|
" [0.154375 ]\n",
|
||||||
|
" [0.51739583]]\n",
|
||||||
|
"Iteration: 3\n",
|
||||||
|
"The rank vector: \n",
|
||||||
|
"[[0.16871875]\n",
|
||||||
|
" [0.09980382]\n",
|
||||||
|
" [0.14584549]\n",
|
||||||
|
" [0.58563194]]\n",
|
||||||
|
"Iteration: 4\n",
|
||||||
|
"The rank vector: \n",
|
||||||
|
"[[0.16146866]\n",
|
||||||
|
" [0.08530365]\n",
|
||||||
|
" [0.12772027]\n",
|
||||||
|
" [0.62550742]]\n",
|
||||||
|
"Iteration: 5\n",
|
||||||
|
"The rank vector: \n",
|
||||||
|
"[[0.14606223]\n",
|
||||||
|
" [0.08324945]\n",
|
||||||
|
" [0.1195035 ]\n",
|
||||||
|
" [0.65118481]]\n",
|
||||||
|
"Iteration: 6\n",
|
||||||
|
"The rank vector: \n",
|
||||||
|
"[[0.13907798]\n",
|
||||||
|
" [0.0788843 ]\n",
|
||||||
|
" [0.11426532]\n",
|
||||||
|
" [0.66777241]]\n",
|
||||||
|
"Iteration: 7\n",
|
||||||
|
"The rank vector: \n",
|
||||||
|
"[[0.13462552]\n",
|
||||||
|
" [0.07690543]\n",
|
||||||
|
" [0.11043125]\n",
|
||||||
|
" [0.6780378 ]]\n",
|
||||||
|
"Iteration: 8\n",
|
||||||
|
"The rank vector: \n",
|
||||||
|
"[[0.13136657]\n",
|
||||||
|
" [0.0756439 ]\n",
|
||||||
|
" [0.1083287 ]\n",
|
||||||
|
" [0.68466083]]\n",
|
||||||
|
"Iteration: 9\n",
|
||||||
|
"The rank vector: \n",
|
||||||
|
"[[0.1295794 ]\n",
|
||||||
|
" [0.07472053]\n",
|
||||||
|
" [0.10686918]\n",
|
||||||
|
" [0.68883089]]\n",
|
||||||
|
"Iteration: 10\n",
|
||||||
|
"The rank vector: \n",
|
||||||
|
"[[0.12833881]\n",
|
||||||
|
" [0.07421416]\n",
|
||||||
|
" [0.10597039]\n",
|
||||||
|
" [0.69147664]]\n",
|
||||||
|
"Iteration: 11\n",
|
||||||
|
"The rank vector: \n",
|
||||||
|
"[[0.12757483]\n",
|
||||||
|
" [0.07386266]\n",
|
||||||
|
" [0.10540368]\n",
|
||||||
|
" [0.69315883]]\n",
|
||||||
|
"Iteration: 12\n",
|
||||||
|
"The rank vector: \n",
|
||||||
|
"[[0.12709313]\n",
|
||||||
|
" [0.0736462 ]\n",
|
||||||
|
" [0.10503783]\n",
|
||||||
|
" [0.69422284]]\n",
|
||||||
|
"Iteration: 13\n",
|
||||||
|
"The rank vector: \n",
|
||||||
|
"[[0.12678216]\n",
|
||||||
|
" [0.07350972]\n",
|
||||||
|
" [0.10480936]\n",
|
||||||
|
" [0.69489877]]\n",
|
||||||
|
"Iteration: 14\n",
|
||||||
|
"The rank vector: \n",
|
||||||
|
"[[0.12658795]\n",
|
||||||
|
" [0.07342161]\n",
|
||||||
|
" [0.10466324]\n",
|
||||||
|
" [0.69532719]]\n",
|
||||||
|
"Iteration: 15\n",
|
||||||
|
"The rank vector: \n",
|
||||||
|
"[[0.12646376]\n",
|
||||||
|
" [0.07336659]\n",
|
||||||
|
" [0.10457077]\n",
|
||||||
|
" [0.69559889]]\n",
|
||||||
|
"Iteration: 16\n",
|
||||||
|
"The rank vector: \n",
|
||||||
|
"[[0.12638516]\n",
|
||||||
|
" [0.0733314 ]\n",
|
||||||
|
" [0.1045122 ]\n",
|
||||||
|
" [0.69577125]]\n",
|
||||||
|
"Iteration: 17\n",
|
||||||
|
"The rank vector: \n",
|
||||||
|
"[[0.12633537]\n",
|
||||||
|
" [0.07330913]\n",
|
||||||
|
" [0.10447497]\n",
|
||||||
|
" [0.69588053]]\n",
|
||||||
|
"Iteration: 18\n",
|
||||||
|
"The rank vector: \n",
|
||||||
|
"[[0.12630373]\n",
|
||||||
|
" [0.07329502]\n",
|
||||||
|
" [0.1044514 ]\n",
|
||||||
|
" [0.69594985]]\n",
|
||||||
|
"Iteration: 19\n",
|
||||||
|
"The rank vector: \n",
|
||||||
|
"[[0.12628369]\n",
|
||||||
|
" [0.07328606]\n",
|
||||||
|
" [0.10443644]\n",
|
||||||
|
" [0.69599382]]\n",
|
||||||
|
"Iteration: 20\n",
|
||||||
|
"The rank vector: \n",
|
||||||
|
"[[0.12627097]\n",
|
||||||
|
" [0.07328038]\n",
|
||||||
|
" [0.10442695]\n",
|
||||||
|
" [0.6960217 ]]\n",
|
||||||
|
"Iteration: 21\n",
|
||||||
|
"The rank vector: \n",
|
||||||
|
"[[0.12626291]\n",
|
||||||
|
" [0.07327678]\n",
|
||||||
|
" [0.10442094]\n",
|
||||||
|
" [0.69603938]]\n",
|
||||||
|
"Iteration: 22\n",
|
||||||
|
"The rank vector: \n",
|
||||||
|
"[[0.1262578 ]\n",
|
||||||
|
" [0.07327449]\n",
|
||||||
|
" [0.10441712]\n",
|
||||||
|
" [0.69605059]]\n",
|
||||||
|
"Iteration: 23\n",
|
||||||
|
"The rank vector: \n",
|
||||||
|
"[[0.12625455]\n",
|
||||||
|
" [0.07327304]\n",
|
||||||
|
" [0.1044147 ]\n",
|
||||||
|
" [0.6960577 ]]\n",
|
||||||
|
"Iteration: 24\n",
|
||||||
|
"The rank vector: \n",
|
||||||
|
"[[0.1262525 ]\n",
|
||||||
|
" [0.07327212]\n",
|
||||||
|
" [0.10441317]\n",
|
||||||
|
" [0.69606221]]\n",
|
||||||
|
"Iteration: 25\n",
|
||||||
|
"The rank vector: \n",
|
||||||
|
"[[0.12625119]\n",
|
||||||
|
" [0.07327154]\n",
|
||||||
|
" [0.10441219]\n",
|
||||||
|
" [0.69606508]]\n",
|
||||||
|
"Iteration: 26\n",
|
||||||
|
"The rank vector: \n",
|
||||||
|
"[[0.12625036]\n",
|
||||||
|
" [0.07327117]\n",
|
||||||
|
" [0.10441158]\n",
|
||||||
|
" [0.69606689]]\n",
|
||||||
|
"Iteration: 27\n",
|
||||||
|
"The rank vector: \n",
|
||||||
|
"[[0.12624984]\n",
|
||||||
|
" [0.07327094]\n",
|
||||||
|
" [0.10441118]\n",
|
||||||
|
" [0.69606804]]\n",
|
||||||
|
"Iteration: 28\n",
|
||||||
|
"The rank vector: \n",
|
||||||
|
"[[0.12624951]\n",
|
||||||
|
" [0.07327079]\n",
|
||||||
|
" [0.10441094]\n",
|
||||||
|
" [0.69606877]]\n",
|
||||||
|
"Iteration: 29\n",
|
||||||
|
"The rank vector: \n",
|
||||||
|
"[[0.1262493 ]\n",
|
||||||
|
" [0.07327069]\n",
|
||||||
|
" [0.10441078]\n",
|
||||||
|
" [0.69606923]]\n",
|
||||||
|
"Iteration: 30\n",
|
||||||
|
"The rank vector: \n",
|
||||||
|
"[[0.12624916]\n",
|
||||||
|
" [0.07327063]\n",
|
||||||
|
" [0.10441068]\n",
|
||||||
|
" [0.69606953]]\n",
|
||||||
|
"Iteration: 31\n",
|
||||||
|
"The rank vector: \n",
|
||||||
|
"[[0.12624908]\n",
|
||||||
|
" [0.0732706 ]\n",
|
||||||
|
" [0.10441062]\n",
|
||||||
|
" [0.69606971]]\n",
|
||||||
|
"Iteration: 32\n",
|
||||||
|
"The rank vector: \n",
|
||||||
|
"[[0.12624902]\n",
|
||||||
|
" [0.07327057]\n",
|
||||||
|
" [0.10441057]\n",
|
||||||
|
" [0.69606983]]\n",
|
||||||
|
"Iteration: 33\n",
|
||||||
|
"The rank vector: \n",
|
||||||
|
"[[0.12624899]\n",
|
||||||
|
" [0.07327056]\n",
|
||||||
|
" [0.10441055]\n",
|
||||||
|
" [0.69606991]]\n",
|
||||||
|
"Iteration: 34\n",
|
||||||
|
"The rank vector: \n",
|
||||||
|
"[[0.12624897]\n",
|
||||||
|
" [0.07327055]\n",
|
||||||
|
" [0.10441053]\n",
|
||||||
|
" [0.69606995]]\n",
|
||||||
|
"Iteration: 35\n",
|
||||||
|
"The rank vector: \n",
|
||||||
|
"[[0.12624895]\n",
|
||||||
|
" [0.07327054]\n",
|
||||||
|
" [0.10441052]\n",
|
||||||
|
" [0.69606998]]\n",
|
||||||
|
"Iteration: 36\n",
|
||||||
|
"The rank vector: \n",
|
||||||
|
"[[0.12624894]\n",
|
||||||
|
" [0.07327054]\n",
|
||||||
|
" [0.10441052]\n",
|
||||||
|
" [0.69607 ]]\n",
|
||||||
|
"Iteration: 37\n",
|
||||||
|
"The rank vector: \n",
|
||||||
|
"[[0.12624894]\n",
|
||||||
|
" [0.07327053]\n",
|
||||||
|
" [0.10441051]\n",
|
||||||
|
" [0.69607001]]\n",
|
||||||
|
"Iteration: 38\n",
|
||||||
|
"The rank vector: \n",
|
||||||
|
"[[0.12624894]\n",
|
||||||
|
" [0.07327053]\n",
|
||||||
|
" [0.10441051]\n",
|
||||||
|
" [0.69607002]]\n",
|
||||||
|
"Iteration: 39\n",
|
||||||
|
"The rank vector: \n",
|
||||||
|
"[[0.12624893]\n",
|
||||||
|
" [0.07327053]\n",
|
||||||
|
" [0.10441051]\n",
|
||||||
|
" [0.69607003]]\n",
|
||||||
|
"Iteration: 40\n",
|
||||||
|
"The rank vector: \n",
|
||||||
|
"[[0.12624893]\n",
|
||||||
|
" [0.07327053]\n",
|
||||||
|
" [0.10441051]\n",
|
||||||
|
" [0.69607003]]\n",
|
||||||
|
"Iteration: 41\n",
|
||||||
|
"The rank vector: \n",
|
||||||
|
"[[0.12624893]\n",
|
||||||
|
" [0.07327053]\n",
|
||||||
|
" [0.10441051]\n",
|
||||||
|
" [0.69607003]]\n",
|
||||||
|
"Iteration: 42\n",
|
||||||
|
"The rank vector: \n",
|
||||||
|
"[[0.12624893]\n",
|
||||||
|
" [0.07327053]\n",
|
||||||
|
" [0.10441051]\n",
|
||||||
|
" [0.69607003]]\n",
|
||||||
|
"Iteration: 43\n",
|
||||||
|
"The rank vector: \n",
|
||||||
|
"[[0.12624893]\n",
|
||||||
|
" [0.07327053]\n",
|
||||||
|
" [0.10441051]\n",
|
||||||
|
" [0.69607003]]\n",
|
||||||
|
"Iteration: 44\n",
|
||||||
|
"The rank vector: \n",
|
||||||
|
"[[0.12624893]\n",
|
||||||
|
" [0.07327053]\n",
|
||||||
|
" [0.10441051]\n",
|
||||||
|
" [0.69607003]]\n",
|
||||||
|
"Iteration: 45\n",
|
||||||
|
"The rank vector: \n",
|
||||||
|
"[[0.12624893]\n",
|
||||||
|
" [0.07327053]\n",
|
||||||
|
" [0.10441051]\n",
|
||||||
|
" [0.69607003]]\n",
|
||||||
|
"Iteration: 46\n",
|
||||||
|
"The rank vector: \n",
|
||||||
|
"[[0.12624893]\n",
|
||||||
|
" [0.07327053]\n",
|
||||||
|
" [0.10441051]\n",
|
||||||
|
" [0.69607003]]\n",
|
||||||
|
"Iteration: 47\n",
|
||||||
|
"The rank vector: \n",
|
||||||
|
"[[0.12624893]\n",
|
||||||
|
" [0.07327053]\n",
|
||||||
|
" [0.10441051]\n",
|
||||||
|
" [0.69607003]]\n",
|
||||||
|
"Iteration: 48\n",
|
||||||
|
"The rank vector: \n",
|
||||||
|
"[[0.12624893]\n",
|
||||||
|
" [0.07327053]\n",
|
||||||
|
" [0.10441051]\n",
|
||||||
|
" [0.69607004]]\n",
|
||||||
|
"Iteration: 49\n",
|
||||||
|
"The rank vector: \n",
|
||||||
|
"[[0.12624893]\n",
|
||||||
|
" [0.07327053]\n",
|
||||||
|
" [0.10441051]\n",
|
||||||
|
" [0.69607004]]\n",
|
||||||
|
"Iteration: 50\n",
|
||||||
|
"The rank vector: \n",
|
||||||
|
"[[0.12624893]\n",
|
||||||
|
" [0.07327053]\n",
|
||||||
|
" [0.10441051]\n",
|
||||||
|
" [0.69607004]]\n",
|
||||||
|
"Iteration: 51\n",
|
||||||
|
"The rank vector: \n",
|
||||||
|
"[[0.12624893]\n",
|
||||||
|
" [0.07327053]\n",
|
||||||
|
" [0.10441051]\n",
|
||||||
|
" [0.69607004]]\n",
|
||||||
|
"Iteration: 52\n",
|
||||||
|
"The rank vector: \n",
|
||||||
|
"[[0.12624893]\n",
|
||||||
|
" [0.07327053]\n",
|
||||||
|
" [0.10441051]\n",
|
||||||
|
" [0.69607004]]\n",
|
||||||
|
"Iteration: 53\n",
|
||||||
|
"The rank vector: \n",
|
||||||
|
"[[0.12624893]\n",
|
||||||
|
" [0.07327053]\n",
|
||||||
|
" [0.10441051]\n",
|
||||||
|
" [0.69607004]]\n",
|
||||||
|
"Iteration: 54\n",
|
||||||
|
"The rank vector: \n",
|
||||||
|
"[[0.12624893]\n",
|
||||||
|
" [0.07327053]\n",
|
||||||
|
" [0.10441051]\n",
|
||||||
|
" [0.69607004]]\n",
|
||||||
|
"Iteration: 55\n",
|
||||||
|
"The rank vector: \n",
|
||||||
|
"[[0.12624893]\n",
|
||||||
|
" [0.07327053]\n",
|
||||||
|
" [0.10441051]\n",
|
||||||
|
" [0.69607004]]\n",
|
||||||
|
"Iteration: 56\n",
|
||||||
|
"The rank vector: \n",
|
||||||
|
"[[0.12624893]\n",
|
||||||
|
" [0.07327053]\n",
|
||||||
|
" [0.10441051]\n",
|
||||||
|
" [0.69607004]]\n",
|
||||||
|
"Iteration: 57\n",
|
||||||
|
"The rank vector: \n",
|
||||||
|
"[[0.12624893]\n",
|
||||||
|
" [0.07327053]\n",
|
||||||
|
" [0.10441051]\n",
|
||||||
|
" [0.69607004]]\n",
|
||||||
|
"Iteration: 58\n",
|
||||||
|
"The rank vector: \n",
|
||||||
|
"[[0.12624893]\n",
|
||||||
|
" [0.07327053]\n",
|
||||||
|
" [0.10441051]\n",
|
||||||
|
" [0.69607004]]\n",
|
||||||
|
"Iteration: 59\n",
|
||||||
|
"The rank vector: \n",
|
||||||
|
"[[0.12624893]\n",
|
||||||
|
" [0.07327053]\n",
|
||||||
|
" [0.10441051]\n",
|
||||||
|
" [0.69607004]]\n",
|
||||||
|
"Iteration: 60\n",
|
||||||
|
"The rank vector: \n",
|
||||||
|
"[[0.12624893]\n",
|
||||||
|
" [0.07327053]\n",
|
||||||
|
" [0.10441051]\n",
|
||||||
|
" [0.69607004]]\n",
|
||||||
|
"Iteration: 61\n",
|
||||||
|
"The rank vector: \n",
|
||||||
|
"[[0.12624893]\n",
|
||||||
|
" [0.07327053]\n",
|
||||||
|
" [0.10441051]\n",
|
||||||
|
" [0.69607004]]\n",
|
||||||
|
"Iteration: 62\n",
|
||||||
|
"The rank vector: \n",
|
||||||
|
"[[0.12624893]\n",
|
||||||
|
" [0.07327053]\n",
|
||||||
|
" [0.10441051]\n",
|
||||||
|
" [0.69607004]]\n",
|
||||||
|
"Iteration: 63\n",
|
||||||
|
"The rank vector: \n",
|
||||||
|
"[[0.12624893]\n",
|
||||||
|
" [0.07327053]\n",
|
||||||
|
" [0.10441051]\n",
|
||||||
|
" [0.69607004]]\n",
|
||||||
|
"Iteration: 64\n",
|
||||||
|
"The rank vector: \n",
|
||||||
|
"[[0.12624893]\n",
|
||||||
|
" [0.07327053]\n",
|
||||||
|
" [0.10441051]\n",
|
||||||
|
" [0.69607004]]\n",
|
||||||
|
"Iteration: 65\n",
|
||||||
|
"The rank vector: \n",
|
||||||
|
"[[0.12624893]\n",
|
||||||
|
" [0.07327053]\n",
|
||||||
|
" [0.10441051]\n",
|
||||||
|
" [0.69607004]]\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# PageRank iterations\n",
|
||||||
|
"for i in range(1, 1001):\n",
|
||||||
|
" print(\"Iteration: \", i)\n",
|
||||||
|
" r = beta * np.matmul(M, r_prev) + uniformR\n",
|
||||||
|
" print(\"The rank vector: \")\n",
|
||||||
|
" print(r)\n",
|
||||||
|
"\n",
|
||||||
|
" diff = np.sum(abs(r - r_prev))\n",
|
||||||
|
" if diff < threshold:\n",
|
||||||
|
" break\n",
|
||||||
|
" r_prev = r.copy()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 20,
|
||||||
|
"id": "9fddbce3-0f30-4912-bfaa-f71a2d00d385",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"The final rank vector: \n",
|
||||||
|
"[0.12624893 0.07327053 0.10441051 0.69607004]\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Display the final rank vector\n",
|
||||||
|
"print(\"The final rank vector: \")\n",
|
||||||
|
"print(r[:, 0])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "bcbaa397-957c-4e79-b68a-e2070ee11baf",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"---"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.8.20"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
||||||
@@ -0,0 +1,144 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "ca2da52f-4a43-4db5-bf5d-54bd3506f81e",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Code-1.3\n",
|
||||||
|
"\n",
|
||||||
|
"Problem Statement: Write a program for Pre-processing of a Text Document: stop word removal.\n",
|
||||||
|
"\n",
|
||||||
|
"Code from InformationRetrieval (SPPU - Final Year - Computer Engineering - Content) repository on KSKA Git: https://git.kska.io/sppu-be-comp-content/InformationRetrieval/\n",
|
||||||
|
"\n",
|
||||||
|
"---"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"id": "f9085aa3-6fc3-432c-8a96-5e6dcb89a900",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Import libraries\n",
|
||||||
|
"import nltk\n",
|
||||||
|
"from nltk.corpus import stopwords\n",
|
||||||
|
"from nltk.tokenize import word_tokenize"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"id": "81c78019-0857-4e4a-8235-8d2db97de214",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"[nltk_data] Downloading package punkt to /home/nonroot/nltk_data...\n",
|
||||||
|
"[nltk_data] Package punkt is already up-to-date!\n",
|
||||||
|
"[nltk_data] Downloading package stopwords to\n",
|
||||||
|
"[nltk_data] /home/nonroot/nltk_data...\n",
|
||||||
|
"[nltk_data] Package stopwords is already up-to-date!\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"True"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Download NLTK Resources\n",
|
||||||
|
"nltk.download('punkt')\n",
|
||||||
|
"nltk.download('stopwords')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"id": "58c711bf-c052-4314-8103-5f6ce43d41c0",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Stop word removal function\n",
|
||||||
|
"def remove_stop_words(text):\n",
|
||||||
|
" # Tokenizing the text into words\n",
|
||||||
|
" words = word_tokenize(text)\n",
|
||||||
|
" \n",
|
||||||
|
" # Defining the English stop words\n",
|
||||||
|
" stop_words = set(stopwords.words('english'))\n",
|
||||||
|
" \n",
|
||||||
|
" # Removing stop words from the text\n",
|
||||||
|
" filtered_words = [word for word in words if word.lower() not in stop_words]\n",
|
||||||
|
" \n",
|
||||||
|
" return ' '.join(filtered_words)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"id": "fb409348-1737-48ac-baad-7a9024914b57",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Original Text:\n",
|
||||||
|
"This is an example of a text document that needs stop word removal\n",
|
||||||
|
"\n",
|
||||||
|
"Preprocessed Text:\n",
|
||||||
|
"example text document needs stop word removal\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Main function\n",
|
||||||
|
"if __name__ == \"__main__\":\n",
|
||||||
|
" input_text = \"This is an example of a text document that needs stop word removal\"\n",
|
||||||
|
" preprocessed_text = remove_stop_words(input_text)\n",
|
||||||
|
" print(\"Original Text:\")\n",
|
||||||
|
" print(input_text)\n",
|
||||||
|
" print(\"\\nPreprocessed Text:\")\n",
|
||||||
|
" print(preprocessed_text)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "54566bef-20a0-494b-9299-500417834bfd",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.8.20"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
||||||
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
Binary file not shown.
@@ -12,6 +12,18 @@ This repository contains essential resources for the Information Retrieval cours
|
|||||||
|
|
||||||
### Codes
|
### Codes
|
||||||
|
|
||||||
|
1. [Code-1.1 (Document Similarity)](Codes/Code-1.1.py)
|
||||||
|
2. [Code-1.2 (Page Rank Algorithm)](Codes/Code-1.2.py)
|
||||||
|
3. [Code-1.3 (Stopword Removal)](Codes/Code-1.3.py)
|
||||||
|
4. [Code-1.4 (Hadoop)](Codes/Code-1.4/)
|
||||||
|
5. [Code-1.5 (Simple Web Crawler)](Codes/Code-1.5.py)
|
||||||
|
|
||||||
|
### Notebooks
|
||||||
|
|
||||||
|
1. [Code-1.1 (Document Similarity)](Notebooks/Code-1.1/)
|
||||||
|
2. [Code-1.2 (Page Rank Algorithm)](Notebooks/Code-1.2%20%28Page%20Rank%20Algorithm%29.ipynb)
|
||||||
|
3. [Code-1.3 (Stopword Removal)](Notebooks/Code-1.3%20%28Stopword%20Removal%29.ipynb)
|
||||||
|
|
||||||
### Practical
|
### Practical
|
||||||
|
|
||||||
1. [Practical-1.1](Practical/Practical-1.1/)
|
1. [Practical-1.1](Practical/Practical-1.1/)
|
||||||
@@ -25,6 +37,7 @@ This repository contains essential resources for the Information Retrieval cours
|
|||||||
- [END-SEM](Question%20Papers/END-SEM)
|
- [END-SEM](Question%20Papers/END-SEM)
|
||||||
|
|
||||||
### [IN-SEM PYQ Answers](Notes/IN-SEM%20PYQ%20Answers)
|
### [IN-SEM PYQ Answers](Notes/IN-SEM%20PYQ%20Answers)
|
||||||
|
### [END-SEM PYQ Answers](Notes/END-SEM%20PYQ%20Answers)
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user