Added codes 1.1, 1.2, 1.3 and 1.5
This commit is contained in:
@@ -0,0 +1,66 @@
|
|||||||
|
from nltk.corpus import stopwords
|
||||||
|
from nltk.tokenize import word_tokenize
|
||||||
|
import numpy as np
|
||||||
|
import nltk
|
||||||
|
|
||||||
|
# Download the NLTK data this script needs.
# "punkt" backs word_tokenize on older NLTK releases; newer releases
# (3.8.2+/3.9) look up "punkt_tab" instead, so both are requested to keep
# tokenization working regardless of the installed version.
nltk.download("punkt")
nltk.download("punkt_tab")
nltk.download("stopwords")
|
||||||
|
|
||||||
|
def process(file):
    """Return stemmed-term frequencies for the text file at *file*.

    The text is tokenized with ``word_tokenize``, lower-cased, stripped of
    NLTK's English stop words and then Porter-stemmed.

    Args:
        file: Path of the text file to read.

    Returns:
        A ``collections.Counter`` (a ``dict`` subclass) mapping each
        remaining stem to its number of occurrences. Missing stems read
        as 0.
    """
    from collections import Counter

    # Context manager so the handle is closed even if reading fails.
    with open(file) as handle:
        raw = handle.read()

    # Tokenize the raw text and normalize case.
    words = [token.lower() for token in word_tokenize(raw)]

    # Remove stop words *before* stemming: the Porter stemmer rewrites many
    # of them (e.g. "this" -> "thi"), after which they no longer match the
    # stop-word list and would leak into the counts.
    stop_words = set(stopwords.words('english'))
    content_words = [word for word in words if word not in stop_words]

    # Stem what is left and count the stems.
    porter = nltk.PorterStemmer()
    return Counter(porter.stem(word) for word in content_words)
|
||||||
|
|
||||||
|
def cos_sim(a, b):
    """Return the cosine similarity of vectors *a* and *b*.

    Args:
        a: Sequence or array of numbers (intended for 1-D vectors).
        b: Sequence or array of numbers with the same length as *a*.

    Returns:
        ``dot(a, b) / (norm(a) * norm(b))``. When either vector has zero
        norm the similarity is undefined; ``0.0`` is returned instead of
        dividing by zero (which would yield NaN and a RuntimeWarning).
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0:
        return 0.0
    return np.dot(a, b) / norm_product
|
||||||
|
|
||||||
|
def getSimilarity(dict1, dict2):
    """Return the cosine similarity of two word-count mappings.

    Args:
        dict1: Mapping of word -> count for the first document.
        dict2: Mapping of word -> count for the second document.

    Returns:
        The value of ``cos_sim`` applied to the two count vectors built over
        the combined vocabulary of both mappings.
    """
    # Each distinct word gets exactly ONE vector slot. Listing the keys of
    # both mappings back to back would give shared words two slots and so
    # double their weight in the dot product and the norms.
    # dict.fromkeys de-duplicates while keeping first-seen order.
    vocabulary = list(dict.fromkeys([*dict1, *dict2]))

    # Builtin ``int`` as dtype: the ``np.int`` alias was removed in
    # NumPy 1.24 and raises AttributeError there.
    v1 = np.array([dict1.get(word, 0) for word in vocabulary], dtype=int)
    v2 = np.array([dict2.get(word, 0) for word in vocabulary], dtype=int)

    return cos_sim(v1, v2)
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Build the word-count table of each document (text1 first, then text2)
    # and report how similar the two tables are.
    first_counts, second_counts = (
        process(path) for path in ("text1.txt", "text2.txt")
    )
    print(
        "Similarity between two text documents:",
        getSimilarity(first_counts, second_counts),
    )
|
||||||
|
|
||||||
@@ -0,0 +1,57 @@
|
|||||||
|
import numpy as np
|
||||||
|
|
||||||
|
# PageRank parameters: iteration stops once the summed absolute change
# between successive rank vectors drops below `threshold`; `beta` weights
# the link-following term against the uniform term.
threshold = 1e-13
beta = 0.85

# Spider-trap network as an adjacency matrix. Columns are normalized below,
# so column j describes where node j's rank flows; the last column only
# points back at itself.
A = [
    [0, 0, 1, 0],
    [1, 0, 0, 0],
    [1, 1, 0, 0],
    [1, 1, 0, 1],
]

# Floating-point copy of the adjacency matrix.
arr = np.array(A, dtype=float)

# Total of every column, kept as a plain list.
s = [np.sum(arr[:, col]) for col in range(len(A))]
print("Summation of columns: ", s)

# Column-stochastic probability matrix M: each column divided by its total.
# Columns that sum to zero are left untouched to avoid dividing by zero.
M = arr.copy()
for j, column_total in enumerate(s):
    if column_total != 0:
        M[:, j] /= column_total

print("Column stochastic probability matrix, M:")
print(M)

# Start from the uniform distribution, stored as a column vector.
r = np.ones((len(M), 1)) / len(M)
print("Initial rank vector:")
print(r)

# Constant uniform contribution added on every iteration.
uniformR = (1.0 - beta) * r
r_prev = r.copy()

# Power iteration, capped at 1000 rounds.
for i in range(1, 1001):
    print("Iteration: ", i)
    r = beta * (M @ r_prev) + uniformR
    print("The rank vector: ")
    print(r)

    # Converged once the total absolute change is below the threshold.
    diff = np.abs(r - r_prev).sum()
    if diff < threshold:
        break
    r_prev = r.copy()

# Show the result as a flat vector.
print("The final rank vector: ")
print(r.ravel())
|
||||||
@@ -0,0 +1,30 @@
|
|||||||
|
# Import libraries
|
||||||
|
import nltk
|
||||||
|
from nltk.corpus import stopwords
|
||||||
|
from nltk.tokenize import word_tokenize
|
||||||
|
|
||||||
|
# Ensure the necessary NLTK resources are downloaded.
# 'punkt' backs word_tokenize on older NLTK releases; newer releases
# (3.8.2+/3.9) look up 'punkt_tab' instead, so both are requested to keep
# tokenization working regardless of the installed version.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
|
||||||
|
|
||||||
|
def remove_stop_words(text):
    """Return *text* with English stop words removed.

    The text is split with ``word_tokenize``; every token whose lower-cased
    form is in NLTK's English stop-word list is dropped, and the surviving
    tokens (original casing kept) are joined with single spaces.
    """
    tokens = word_tokenize(text)
    english_stop_words = set(stopwords.words('english'))

    kept = []
    for token in tokens:
        if token.lower() in english_stop_words:
            continue
        kept.append(token)

    return ' '.join(kept)
|
||||||
|
|
||||||
|
# Demo: print a sample sentence before and after stop-word removal.
if __name__ == "__main__":
    sample = "This is an example of a text document that needs stop word removal."
    cleaned = remove_stop_words(sample)
    for line in ("Original Text:", sample, "\nPreprocessed Text:", cleaned):
        print(line)
|
||||||
|
|
||||||
@@ -0,0 +1,35 @@
|
|||||||
|
# pip install requests beautifulsoup4
|
||||||
|
|
||||||
|
import requests
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import time
|
||||||
|
|
||||||
|
def crawl(url, depth, visited=None):
    """Recursively crawl the pages reachable from *url*.

    Every page fetched is announced on standard output; fetch failures are
    printed and that branch is abandoned.

    Args:
        url: Address of the page to fetch.
        depth: Number of link levels to fetch, counting *url* itself.
            Values <= 0 fetch nothing.
        visited: Optional dict mapping URL -> largest remaining depth it was
            already crawled with. It is shared across the recursion so a
            page is not crawled again unless it is reached with more depth
            left than before. A fresh dict is created when omitted.

    Returns:
        None.
    """
    from urllib.parse import urldefrag, urljoin

    # "<= 0" rather than "== 0": a negative depth would otherwise never
    # reach the base case.
    if depth <= 0:
        return

    if visited is None:
        visited = {}
    # Skip pages already crawled at least this deeply.
    if visited.get(url, 0) >= depth:
        return
    visited[url] = depth

    try:
        # Without a timeout a stalled server blocks the crawl forever.
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # Check for HTTP errors
    except requests.RequestException as e:
        print(f"Failed to retrieve {url}: {e}")
        return

    print(f"Crawling: {url}")

    # Links found here would be crawled with depth 0, i.e. not fetched at
    # all, so skip the parsing and the per-link sleep entirely.
    if depth == 1:
        return

    soup = BeautifulSoup(response.text, 'html.parser')

    # Collect the distinct http(s) links on the page. Relative hrefs are
    # resolved against the page URL, and fragments are dropped because
    # "#section" variants point at the same document.
    links = set()
    for anchor in soup.find_all('a', href=True):
        absolute = urldefrag(urljoin(url, anchor['href'])).url
        if absolute.startswith(('http://', 'https://')):
            links.add(absolute)

    # Recursively crawl each link.
    for link in links:
        if visited.get(link, 0) >= depth - 1:
            continue  # already covered; don't sleep for nothing
        time.sleep(1)  # Be polite and avoid overwhelming the server
        crawl(link, depth - 1, visited)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Ask for the seed URL first, then the depth (must parse as an int),
    # and start crawling.
    crawl(
        input("Enter the URL to crawl: "),
        int(input("Enter the crawl depth: ")),
    )
|
||||||
Reference in New Issue
Block a user