# InformationRetrieval/Codes/Code-1.5.py

# pip install requests beautifulsoup4

import time
from collections import deque
from urllib.parse import urldefrag, urljoin, urlparse

import requests
from bs4 import BeautifulSoup

def _resolve_link(base_url, href):
    """Return *href* as an absolute, fragment-free http(s) URL, or None."""
    try:
        absolute = urldefrag(urljoin(base_url, href.strip())).url
        scheme = urlparse(absolute).scheme
    except ValueError:
        # Malformed hrefs (e.g. a broken IPv6 literal) are simply skipped.
        return None
    return absolute if scheme in ('http', 'https') else None


def crawl(url, depth, *, visited=None, timeout=10, delay=1):
    """Crawl pages reachable from *url*, following links up to *depth* levels.

    The start page counts as level 1, so ``depth=1`` fetches only *url* and
    ``depth=2`` also fetches the pages it links to. Pages are visited
    breadth-first, which guarantees each page is first reached with the
    largest remaining depth and therefore that no page within range is missed.
    Each successfully fetched page is reported with a ``Crawling: ...`` line;
    failed requests are reported and skipped.

    Args:
        url: Address of the page to start from.
        depth: Number of levels to crawl. Zero or negative values do nothing.
        visited: Optional set of already-seen URLs (fragments removed). It is
            updated in place, so it can be shared across several calls to
            avoid refetching pages.
        timeout: Seconds to wait for each HTTP request before giving up.
        delay: Seconds to pause between consecutive requests.

    Returns:
        None.
    """
    # "<= 0" rather than "== 0": a negative depth must not crawl without end.
    if depth <= 0:
        return
    if visited is None:
        visited = set()

    try:
        start = urldefrag(url).url
    except ValueError:
        start = url  # Let requests report the malformed URL below.
    if start in visited:
        return
    visited.add(start)

    pending = deque([(start, depth)])
    is_first_request = True

    while pending:
        page_url, remaining = pending.popleft()

        # Be polite: pause between real requests only, never before the
        # first one and never for links that will not be fetched.
        if not is_first_request:
            time.sleep(delay)
        is_first_request = False

        try:
            response = requests.get(page_url, timeout=timeout)
            response.raise_for_status()  # Check for HTTP errors
        except requests.RequestException as e:
            print(f"Failed to retrieve {page_url}: {e}")
            continue

        print(f"Crawling: {page_url}")

        # Links found at the last level would never be fetched, so skip
        # parsing the page altogether.
        if remaining == 1:
            continue

        # After redirects the final address may differ from the requested
        # one; remember it so it is not fetched again under its other name.
        base_url = response.url
        try:
            visited.add(urldefrag(base_url).url)
        except ValueError:
            pass

        soup = BeautifulSoup(response.text, 'html.parser')
        for anchor in soup.find_all('a', href=True):
            link = _resolve_link(base_url, anchor['href'])
            if link is None or link in visited:
                continue
            visited.add(link)
            pending.append((link, remaining - 1))

if __name__ == "__main__":
    # Interactive entry point: ask for a start URL and a depth, then crawl.
    # Surrounding whitespace from the prompt is not part of the URL.
    start_url = input("Enter the URL to crawl: ").strip()
    try:
        crawl_depth = int(input("Enter the crawl depth: "))
    except ValueError:
        # Non-numeric input: exit with a clear message instead of a traceback.
        raise SystemExit("Crawl depth must be an integer.") from None
    crawl(start_url, crawl_depth)