From 4c84c01a65e4379fd5408795f47550e7ff277d6a Mon Sep 17 00:00:00 2001
From: Kshitij
Date: Sun, 12 Oct 2025 22:51:57 +0530
Subject: [PATCH] Added codes 1.1, 1.2, 1.3 and 1.5

---
Review note: the file contents below were revised during review; the blob
ids on the "index" lines still refer to the pre-review contents.

 Codes/Code-1.1.py | 70 +++++++++++++++++++++++++++++++++++++++++++++++
 Codes/Code-1.2.py | 56 +++++++++++++++++++++++++++++++++++++
 Codes/Code-1.3.py | 36 ++++++++++++++++++++++++
 Codes/Code-1.5.py | 62 +++++++++++++++++++++++++++++++++++++++++
 4 files changed, 224 insertions(+)
 create mode 100644 Codes/Code-1.1.py
 create mode 100644 Codes/Code-1.2.py
 create mode 100644 Codes/Code-1.3.py
 create mode 100644 Codes/Code-1.5.py

diff --git a/Codes/Code-1.1.py b/Codes/Code-1.1.py
new file mode 100644
index 0000000..5a6b349
--- /dev/null
+++ b/Codes/Code-1.1.py
@@ -0,0 +1,70 @@
+from collections import Counter
+
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize
+import numpy as np
+import nltk
+
+# Download necessary NLTK data
+nltk.download("punkt")
+nltk.download("punkt_tab")  # tokenizer tables used by NLTK 3.9 and later
+nltk.download("stopwords")
+
+
+def process(file):
+    """Return stemmed, stop-word-free term counts for a text file.
+
+    Args:
+        file: Path of the text file to read.
+
+    Returns:
+        A Counter mapping each stemmed term to its number of occurrences.
+    """
+    # Read the file, closing the handle even if reading fails
+    with open(file) as handle:
+        raw = handle.read()
+
+    # Tokenize the raw text
+    words = [token.lower() for token in word_tokenize(raw)]
+
+    # Remove stop words before stemming: stems such as "thi" (from "this")
+    # no longer match the stop word list
+    stop_words = set(stopwords.words('english'))
+    filtered_words = [w for w in words if w not in stop_words]
+
+    # Stem the remaining tokens and count them
+    porter = nltk.PorterStemmer()
+    return Counter(porter.stem(w) for w in filtered_words)
+
+
+def cos_sim(a, b):
+    """Return the cosine similarity of two equal-length vectors.
+
+    Returns 0.0 when either vector has a zero norm (empty or all zeros),
+    where the cosine is undefined and the division would produce NaN.
+    """
+    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
+    if norm_product == 0:
+        return 0.0
+
+    return np.dot(a, b) / norm_product
+
+
+def getSimilarity(dict1, dict2):
+    """Return the cosine similarity of two term-count dictionaries."""
+    # Each word must appear exactly once; listing a shared word twice would
+    # count it twice in the dot product and in both norms
+    all_words_list = sorted(set(dict1) | set(dict2))
+
+    # Create vectors for the dictionaries (np.int was removed in NumPy 1.24)
+    v1 = np.array([dict1.get(key, 0) for key in all_words_list], dtype=int)
+    v2 = np.array([dict2.get(key, 0) for key in all_words_list], dtype=int)
+
+    return cos_sim(v1, v2)
+
+
+if __name__ == '__main__':
+    dict1 = process("text1.txt")
+    dict2 = process("text2.txt")
+
+    print("Similarity between two text documents:", getSimilarity(dict1, dict2))
diff --git a/Codes/Code-1.2.py b/Codes/Code-1.2.py
new file mode 100644
index 0000000..47ef90b
--- /dev/null
+++ b/Codes/Code-1.2.py
@@ -0,0 +1,56 @@
+import numpy as np
+
+# Constants for PageRank
+threshold = 1e-13
+beta = 0.85
+
+# Spider Trap Network represented as adjacency matrix
+A = [
+    [0, 0, 1, 0],
+    [1, 0, 0, 0],
+    [1, 1, 0, 0],
+    [1, 1, 0, 1]
+]
+
+# Convert adjacency matrix to a numpy array
+arr = np.array(A, dtype=float)
+
+# Calculate summation of columns
+s = arr.sum(axis=0)
+
+print("Summation of columns: ", s)
+
+# Create the column stochastic probability matrix, M; columns that sum to
+# zero are left as zeros to prevent division by zero
+M = np.divide(arr, s, out=np.zeros_like(arr), where=s != 0)
+
+print("Column stochastic probability matrix, M:")
+print(M)
+
+# Initialize rank vector with equal rank for every node
+r = np.full((len(M), 1), 1.0 / len(M))
+print("Initial rank vector:")
+print(r)
+
+# Calculate the uniform rank contribution
+uniformR = (1.0 - beta) * r
+r_prev = r.copy()
+
+# PageRank iterations
+for i in range(1, 1001):
+    print("Iteration: ", i)
+    r = beta * np.matmul(M, r_prev) + uniformR
+    print("The rank vector: ")
+    print(r)
+
+    diff = np.sum(np.abs(r - r_prev))
+    if diff < threshold:
+        break
+    r_prev = r.copy()
+else:
+    # The loop ran out of iterations without meeting the threshold
+    print("Warning: PageRank did not converge within 1000 iterations")
+
+# Display the final rank vector
+print("The final rank vector: ")
+print(r[:, 0])
diff --git a/Codes/Code-1.3.py b/Codes/Code-1.3.py
new file mode 100644
index 0000000..3bedc17
--- /dev/null
+++ b/Codes/Code-1.3.py
@@ -0,0 +1,36 @@
+# Import libraries
+import nltk
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize
+
+# Ensure you have the necessary NLTK resources downloaded
+nltk.download('punkt')
+nltk.download('punkt_tab')  # tokenizer tables used by NLTK 3.9 and later
+nltk.download('stopwords')
+
+
+def remove_stop_words(text):
+    """Return text with its English stop words removed.
+
+    The text is tokenized into words, tokens whose lowercase form is an
+    English stop word are dropped, and the rest are joined with spaces.
+    """
+    # Defining the English stop words
+    stop_words = set(stopwords.words('english'))
+
+    # Removing stop words from the tokenized text
+    filtered_words = [
+        word for word in word_tokenize(text) if word.lower() not in stop_words
+    ]
+
+    return ' '.join(filtered_words)
+
+
+# Example usage
+if __name__ == "__main__":
+    input_text = "This is an example of a text document that needs stop word removal."
+    preprocessed_text = remove_stop_words(input_text)
+    print("Original Text:")
+    print(input_text)
+    print("\nPreprocessed Text:")
+    print(preprocessed_text)
diff --git a/Codes/Code-1.5.py b/Codes/Code-1.5.py
new file mode 100644
index 0000000..e82fd3f
--- /dev/null
+++ b/Codes/Code-1.5.py
@@ -0,0 +1,62 @@
+# pip install requests beautifulsoup4
+
+import time
+from urllib.parse import urldefrag, urljoin, urlparse
+
+import requests
+from bs4 import BeautifulSoup
+
+
+def crawl(url, depth, visited=None):
+    """Recursively crawl the pages reachable from url, printing each one.
+
+    Args:
+        url: Absolute URL of the page to fetch.
+        depth: Number of link levels to follow; nothing is fetched when it
+            is zero or negative.
+        visited: Dict mapping each URL to the largest depth it was crawled
+            with, shared across the recursion. Created when omitted.
+    """
+    if depth <= 0:
+        return
+    if visited is None:
+        visited = {}
+    # Skip pages already crawled with at least this much depth remaining
+    if visited.get(url, 0) >= depth:
+        return
+    visited[url] = depth
+
+    try:
+        # Without a timeout an unresponsive server would block forever
+        response = requests.get(url, timeout=10)
+        response.raise_for_status()  # Check for HTTP errors
+    except requests.RequestException as e:
+        print(f"Failed to retrieve {url}: {e}")
+        return
+
+    print(f"Crawling: {url}")
+    if depth == 1:
+        return  # Links found on this page would not be fetched
+
+    soup = BeautifulSoup(response.text, 'html.parser')
+
+    # Find all links in the HTML, resolving relative ones against the page's
+    # final URL and dropping fragments so "#section" variants are one page
+    links = set()
+    for link in soup.find_all('a', href=True):
+        full_url = urldefrag(urljoin(response.url, link['href'])).url
+        if urlparse(full_url).scheme in ('http', 'https'):
+            links.add(full_url)
+
+    # Recursively crawl each link not yet crawled to this depth
+    for link in links:
+        if visited.get(link, 0) >= depth - 1:
+            continue
+        time.sleep(1)  # Be polite and avoid overwhelming the server
+        crawl(link, depth - 1, visited)
+
+
+if __name__ == "__main__":
+    start_url = input("Enter the URL to crawl: ").strip()
+    crawl_depth = int(input("Enter the crawl depth: "))
+    crawl(start_url, crawl_depth)