Added codes 1.1, 1.2, 1.3 and 1.5

2025-10-12 22:51:57 +05:30
parent 0ac0a2859b
commit 4c84c01a65
4 changed files with 188 additions and 0 deletions
@@ -0,0 +1,66 @@
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize
+import numpy as np
+import nltk
+
# Download the NLTK data used below. "punkt" serves older NLTK releases;
# NLTK >= 3.8.2 makes word_tokenize load the "punkt_tab" resource instead,
# so both are fetched to keep tokenization working on either version.
nltk.download("punkt")
nltk.download("punkt_tab")
nltk.download("stopwords")
+
def process(file):
    """Build a bag-of-words count for the text file at *file*.

    The text is tokenized with NLTK's ``word_tokenize``, lower-cased,
    filtered against the English stop-word list and then reduced to
    Porter stems.

    Stop words are removed *before* stemming: the NLTK stop-word list holds
    unstemmed forms, so stems such as "thi" (from "this") or "wa" (from
    "was") would otherwise slip past the filter.

    Args:
        file: Path of the text file to read (decoded as UTF-8).

    Returns:
        A ``defaultdict(int)`` mapping each remaining stem to the number of
        times it occurs in the file.
    """
    # Imported locally so the block does not rely on nltk re-exporting
    # collections.defaultdict as ``nltk.defaultdict``.
    from collections import defaultdict

    # The context manager guarantees the handle is closed, even on error.
    with open(file, encoding="utf-8") as handle:
        raw = handle.read()

    # Tokenize the raw text and normalize case.
    words = [token.lower() for token in word_tokenize(raw)]

    stop_words = set(stopwords.words('english'))
    porter = nltk.PorterStemmer()

    # Drop stop words first, then count the stem of every remaining token.
    # NOTE(review): punctuation tokens are not filtered here, same as before.
    count = defaultdict(int)
    for word in words:
        if word not in stop_words:
            count[porter.stem(word)] += 1

    return count
+
def cos_sim(a, b):
    """Return the cosine similarity of vectors *a* and *b*.

    Args:
        a: First vector (NumPy array or any sequence ``np.dot`` accepts).
        b: Second vector, same length as *a*.

    Returns:
        ``dot(a, b) / (||a|| * ||b||)``. When either vector has zero norm
        the cosine is undefined; ``0.0`` is returned in that case instead
        of dividing by zero (which would yield ``nan`` and a warning).
    """
    dot_product = np.dot(a, b)
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)

    denominator = norm_a * norm_b
    if denominator == 0:
        # At least one vector is all zeros: treat as "no similarity".
        return 0.0

    return dot_product / denominator
+
def getSimilarity(dict1, dict2):
    """Return the cosine similarity of two word-count dictionaries.

    Both dictionaries are projected onto their combined vocabulary and the
    resulting count vectors are compared with ``cos_sim``.

    Args:
        dict1: Mapping of word -> count for the first document.
        dict2: Mapping of word -> count for the second document.

    Returns:
        The cosine similarity of the two count vectors, or ``0.0`` when
        either dictionary is empty (an empty document has no direction to
        compare against).
    """
    if not dict1 or not dict2:
        return 0.0

    # Each word must get exactly one dimension. dict.fromkeys de-duplicates
    # while preserving first-seen order, so words present in both documents
    # are no longer counted twice (which skewed the similarity).
    vocabulary = list(dict.fromkeys([*dict1, *dict2]))

    # ``np.int`` was removed in NumPy 1.24; the builtin ``int`` is the
    # equivalent dtype.
    v1 = np.array([dict1.get(word, 0) for word in vocabulary], dtype=int)
    v2 = np.array([dict2.get(word, 0) for word in vocabulary], dtype=int)

    return cos_sim(v1, v2)
+
if __name__ == '__main__':
    # Count the stems of each document, then compare the two counts.
    first_counts = process("text1.txt")
    second_counts = process("text2.txt")
    score = getSimilarity(first_counts, second_counts)

    print("Similarity between two text documents:", score)
+