Added codes 1.1, 1.2, 1.3 and 1.5
This commit is contained in:
@@ -0,0 +1,66 @@
|
||||
from nltk.corpus import stopwords
|
||||
from nltk.tokenize import word_tokenize
|
||||
import numpy as np
|
||||
import nltk
|
||||
|
||||
# Download the NLTK data this script relies on: the Punkt tokenizer models
# used by word_tokenize and the stop-word lists read by stopwords.words.
nltk.download("punkt")
# Recent NLTK releases (3.9 and later) load the Punkt models from the
# "punkt_tab" package instead of "punkt"; requesting both keeps
# word_tokenize working regardless of the installed NLTK version.
nltk.download("punkt_tab")
nltk.download("stopwords")
|
||||
|
||||
def process(file):
    """Build a stemmed word-frequency table for a text file.

    The text is tokenized with NLTK, lower-cased, stripped of English stop
    words and finally reduced to Porter stems.

    Args:
        file: Path of the text file to read.

    Returns:
        A ``defaultdict(int)`` mapping each remaining stem to the number of
        times it occurs in the file.
    """
    # Imported locally so the block does not depend on NLTK happening to
    # re-export ``defaultdict`` from its top-level namespace.
    from collections import defaultdict

    # Read the file; the context manager guarantees the handle is closed.
    with open(file) as handle:
        raw = handle.read()

    # Tokenize the raw text and normalise case
    words = [token.lower() for token in word_tokenize(raw)]

    # Remove stop words BEFORE stemming: the stop-word list holds unstemmed
    # forms, and the Porter stemmer rewrites several of them (for example
    # "this" -> "thi", "was" -> "wa"), which would let them slip through a
    # filter applied after stemming.
    stop_words = set(stopwords.words('english'))
    content_words = [word for word in words if word not in stop_words]

    # Stem the remaining tokens and count them
    porter = nltk.PorterStemmer()
    count = defaultdict(int)
    for word in content_words:
        count[porter.stem(word)] += 1

    return count
|
||||
|
||||
def cos_sim(a, b):
    """Return the cosine similarity of two equally sized vectors.

    Args:
        a: First vector (array-like of numbers).
        b: Second vector (array-like of numbers), same length as ``a``.

    Returns:
        ``dot(a, b) / (norm(a) * norm(b))``. When either vector has zero
        norm the quotient is undefined, so ``0.0`` is returned instead of
        ``nan``.
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)

    # A zero norm would make this 0/0, yielding nan and a RuntimeWarning.
    if denominator == 0:
        return 0.0

    return np.dot(a, b) / denominator
|
||||
|
||||
def getSimilarity(dict1, dict2):
    """Return the cosine similarity between two word-count dictionaries.

    Args:
        dict1: Mapping of word -> occurrence count for the first document.
        dict2: Mapping of word -> occurrence count for the second document.

    Returns:
        The result of ``cos_sim`` applied to the two count vectors built
        over the combined vocabulary of both dictionaries.
    """
    # Collect all unique words from both dictionaries. dict.fromkeys drops
    # duplicates while keeping first-seen order, so a word present in both
    # documents contributes exactly one vector component.
    vocabulary = list(dict.fromkeys([*dict1, *dict2]))

    # Create vectors for the dictionaries. The builtin ``int`` is used as
    # the dtype because the ``np.int`` alias was removed in NumPy 1.24.
    v1 = np.array([dict1.get(word, 0) for word in vocabulary], dtype=int)
    v2 = np.array([dict2.get(word, 0) for word in vocabulary], dtype=int)

    return cos_sim(v1, v2)
|
||||
|
||||
if __name__ == '__main__':
    # Build the word-frequency table of each document, then report how
    # similar the two tables are.
    first_counts = process("text1.txt")
    second_counts = process("text2.txt")

    similarity = getSimilarity(first_counts, second_counts)
    print("Similarity between two text documents:", similarity)
|
||||
|
||||
Reference in New Issue
Block a user