Added codes 1.1, 1.2, 1.3 and 1.5

2025-10-12 22:51:57 +05:30
parent 0ac0a2859b
commit 4c84c01a65
4 changed files with 188 additions and 0 deletions
@@ -0,0 +1,66 @@
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize
+import numpy as np
+import nltk
+
+# Download the NLTK data used below. NLTK 3.9+ loads the tokenizer models
+# from "punkt_tab" rather than the older "punkt" package, so both are
+# requested to keep word_tokenize() working across NLTK releases.
+for resource in ("punkt", "punkt_tab", "stopwords"):
+    nltk.download(resource)
+
+def process(file):
+    """Build a table of stemmed word frequencies for a text file.
+
+    The text is tokenized with ``word_tokenize``, lower-cased, stripped of
+    English stop words and finally reduced to Porter stems.
+
+    Args:
+        file: Path of the text file to read.
+
+    Returns:
+        A ``collections.Counter`` mapping each stem to the number of times
+        it occurs; looking up a stem that never occurred yields 0.
+
+    Raises:
+        OSError: If the file cannot be opened or read.
+    """
+    from collections import Counter
+
+    # The context manager guarantees the handle is closed after reading.
+    with open(file) as handle:
+        raw = handle.read()
+
+    # Tokenize the raw text and normalize case
+    words = [token.lower() for token in word_tokenize(raw)]
+
+    # Remove stop words BEFORE stemming: the stop-word list holds unstemmed
+    # forms, and stems such as "thi" (from "this") would not match it.
+    stop_words = set(stopwords.words('english'))
+    content_words = [w for w in words if w not in stop_words]
+
+    # Stem the remaining tokens and count them
+    porter = nltk.PorterStemmer()
+    return Counter(porter.stem(w) for w in content_words)
+
+def cos_sim(a, b):
+    """Return the cosine similarity of two equal-length vectors.
+
+    Args:
+        a: First vector (any sequence or array accepted by ``np.dot``).
+        b: Second vector, with the same length as ``a``.
+
+    Returns:
+        ``dot(a, b) / (|a| * |b|)``, or 0.0 when either vector has zero
+        length, since the cosine is undefined there.
+    """
+    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
+
+    # Guard the 0/0 case, which would otherwise produce NaN and a warning.
+    if norm_product == 0:
+        return 0.0
+
+    return np.dot(a, b) / norm_product
+
+def getSimilarity(dict1, dict2):
+    """Return the cosine similarity of two word-frequency tables.
+
+    Args:
+        dict1: Mapping of word -> count for the first document.
+        dict2: Mapping of word -> count for the second document.
+
+    Returns:
+        The value of ``cos_sim`` for the two count vectors laid out over
+        the combined vocabulary of both mappings.
+    """
+    # Each distinct word gets exactly one dimension. dict.fromkeys removes
+    # the duplicates for words present in both tables (which would otherwise
+    # be counted twice) while keeping a deterministic order.
+    vocabulary = list(dict.fromkeys(list(dict1) + list(dict2)))
+
+    # Create vectors for the dictionaries. The builtin float is used as the
+    # dtype because the np.int alias no longer exists in NumPy >= 1.24.
+    v1 = np.array([dict1.get(word, 0) for word in vocabulary], dtype=float)
+    v2 = np.array([dict2.get(word, 0) for word in vocabulary], dtype=float)
+
+    return cos_sim(v1, v2)
+
+if __name__ == '__main__':
+    # Build one word-frequency table per document, then compare the two.
+    dict1, dict2 = (process(path) for path in ("text1.txt", "text2.txt"))
+    similarity = getSimilarity(dict1, dict2)
+
+    print("Similarity between two text documents:", similarity)
+
@@ -0,0 +1,57 @@
+import numpy as np
+
+# PageRank parameters: convergence tolerance and damping factor
+threshold = 1e-13
+beta = 0.85
+
+# Spider Trap Network represented as adjacency matrix
+# (column 3 has its only entry on the diagonal)
+A = [
+    [0, 0, 1, 0],
+    [1, 0, 0, 0],
+    [1, 1, 0, 0],
+    [1, 1, 0, 1]
+]
+
+# Floating-point numpy copy of the adjacency matrix
+arr = np.array(A, dtype=float)
+
+# Total of every column
+s = [np.sum(arr[:, col]) for col in range(len(A))]
+
+print("Summation of columns: ", s)
+
+# Column stochastic probability matrix, M: every non-empty column is scaled
+# by its total; all-zero columns are skipped to avoid dividing by zero
+M = arr.copy()
+for col, total in enumerate(s):
+    if total == 0:
+        continue
+    M[:, col] /= total
+
+print("Column stochastic probability matrix, M:")
+print(M)
+
+# Start from the uniform distribution, stored as a column vector
+r = np.ones([len(M), 1]) / len(M)
+print("Initial rank vector:")
+print(r)
+
+# Constant term added on every iteration
+uniformR = (1.0 - beta) * r
+r_prev = r.copy()
+
+# Iterate until the summed absolute change drops below the threshold,
+# giving up after 1000 rounds
+for iteration in range(1, 1001):
+    print("Iteration: ", iteration)
+    r = beta * (M @ r_prev) + uniformR
+    print("The rank vector: ")
+    print(r)
+
+    diff = np.abs(r - r_prev).sum()
+    if diff < threshold:
+        break
+    r_prev = r.copy()
+
+# Display the final rank vector as a flat array
+print("The final rank vector: ")
+print(r[:, 0])
@@ -0,0 +1,30 @@
+# Import libraries
+import nltk
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize
+
+# Ensure the necessary NLTK resources are downloaded. NLTK 3.9+ loads the
+# tokenizer models from 'punkt_tab' rather than the older 'punkt' package,
+# so both are requested to keep word_tokenize() working across releases.
+for resource in ('punkt', 'punkt_tab', 'stopwords'):
+    nltk.download(resource)
+
+def remove_stop_words(text):
+    """Return ``text`` with its English stop words dropped.
+
+    The text is split with ``word_tokenize``; every token whose lower-cased
+    form appears in NLTK's English stop-word list is discarded, and the
+    surviving tokens are joined back together with single spaces.
+    """
+    tokens = word_tokenize(text)
+    english_stops = set(stopwords.words('english'))
+
+    kept = []
+    for token in tokens:
+        # The comparison is case-insensitive; the kept token keeps its case.
+        if token.lower() in english_stops:
+            continue
+        kept.append(token)
+
+    return ' '.join(kept)
+
+# Demonstration on a sample sentence
+if __name__ == "__main__":
+    input_text = "This is an example of a text document that needs stop word removal."
+    preprocessed_text = remove_stop_words(input_text)
+
+    # Each heading is followed by its text on the next line.
+    print("Original Text:", input_text, sep="\n")
+    print("\nPreprocessed Text:", preprocessed_text, sep="\n")
+
@@ -0,0 +1,35 @@
+# pip install requests beautifulsoup4
+
+import requests
+from bs4 import BeautifulSoup
+import time
+
+def crawl(url, depth):
+    """Crawl ``url`` and the pages it links to, breadth-first.
+
+    Every page is fetched at most once. Relative links are resolved against
+    the page they appear on, URL fragments are ignored, and only http(s)
+    links are followed.
+
+    Args:
+        url: Address of the page to start from.
+        depth: Number of levels to fetch, counting the start page as the
+            first; a value of 0 or less crawls nothing.
+
+    Returns:
+        None. Progress and retrieval failures are printed.
+    """
+    from collections import deque
+    from urllib.parse import urldefrag, urljoin
+
+    # "<= 0" rather than "== 0" so a negative depth cannot crawl unbounded.
+    if depth <= 0:
+        return
+
+    seen = {url}
+    queue = deque([(url, depth)])
+    first_request = True
+
+    while queue:
+        current, remaining = queue.popleft()
+
+        # Be polite and avoid overwhelming the server: pause before every
+        # request except the very first one.
+        if not first_request:
+            time.sleep(1)
+        first_request = False
+
+        try:
+            # A timeout keeps one unresponsive host from stalling the crawl.
+            response = requests.get(current, timeout=10)
+            response.raise_for_status()  # Check for HTTP errors
+        except requests.RequestException as e:
+            print(f"Failed to retrieve {current}: {e}")
+            continue
+
+        print(f"Crawling: {current}")
+
+        # Links found on the last level would never be fetched, so the
+        # page does not need to be parsed.
+        if remaining == 1:
+            continue
+
+        # Find all links in the HTML and queue the ones not seen before
+        soup = BeautifulSoup(response.text, 'html.parser')
+        for link in soup.find_all('a', href=True):
+            full_url = urldefrag(urljoin(current, link['href'])).url
+            if not full_url.startswith(('http://', 'https://')):
+                continue
+            if full_url not in seen:
+                seen.add(full_url)
+                queue.append((full_url, remaining - 1))
+
+if __name__ == "__main__":
+    # Prompt for the starting page, then for the number of levels to follow.
+    seed_url = input("Enter the URL to crawl: ")
+    max_depth = int(input("Enter the crawl depth: "))
+    crawl(url=seed_url, depth=max_depth)