36 lines
987 B
Python
36 lines
987 B
Python
# pip install requests beautifulsoup4
|
|
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
import time
|
|
|
|
def crawl(url, depth, visited=None, *, timeout=10, delay=1):
    """Recursively crawl ``url`` and the absolute links found on each page.

    Each page is fetched with ``requests``; a successful fetch prints
    ``Crawling: <url>`` and a failed one prints ``Failed to retrieve ...``.
    Only ``<a href>`` values that are absolute ``http://`` / ``https://``
    URLs are followed; relative links are skipped.

    Args:
        url: Address of the page to fetch.
        depth: Number of levels to crawl, counting ``url`` itself as one.
            A value of zero or less crawls nothing.
        visited: Bookkeeping dict mapping each URL to the largest ``depth``
            it has already been crawled with. Callers normally omit it; it
            is shared across the recursion so pages that link to each other
            are not fetched over and over.
        timeout: Seconds to wait for the server before giving up on a
            request.
        delay: Seconds to pause before each follow-up request.

    Returns:
        None. Results are reported via ``print``.
    """
    # ``<=`` rather than ``==``: a negative depth must stop here, otherwise
    # the countdown never reaches zero and the recursion is unbounded.
    if depth <= 0:
        return

    if visited is None:
        visited = {}
    # Skip a URL already handled with at least this much remaining depth;
    # a revisit with *more* remaining depth is allowed so that no page
    # reachable within the limit is lost.
    if visited.get(url, 0) >= depth:
        return
    visited[url] = depth

    try:
        # Without a timeout an unresponsive server would block forever.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Treat HTTP error statuses as failures
    except requests.RequestException as e:
        print(f"Failed to retrieve {url}: {e}")
        return

    print(f"Crawling: {url}")

    child_depth = depth - 1
    if child_depth == 0:
        # Links on this page would not be fetched, so skip parsing and
        # the per-link delay entirely.
        return

    soup = BeautifulSoup(response.text, 'html.parser')

    # Collect unique absolute links. Matching on the full scheme prefix
    # avoids picking up relative hrefs that merely begin with "http".
    links = {
        anchor['href']
        for anchor in soup.find_all('a', href=True)
        if anchor['href'].startswith(('http://', 'https://'))
    }

    for link in links:
        if visited.get(link, 0) >= child_depth:
            continue  # Already covered; no request, so no need to wait
        time.sleep(delay)  # Be polite and avoid overwhelming the server
        crawl(link, child_depth, visited, timeout=timeout, delay=delay)
|
|
|
|
if __name__ == "__main__":
    # Script entry point: ask for a start URL and a depth, then crawl.
    # Strip surrounding whitespace so a pasted URL with a stray space or
    # newline is still usable.
    start_url = input("Enter the URL to crawl: ").strip()

    # Reject bad depth input with a short message instead of a traceback.
    try:
        crawl_depth = int(input("Enter the crawl depth: "))
    except ValueError:
        raise SystemExit("Crawl depth must be a whole number.") from None
    if crawl_depth < 0:
        raise SystemExit("Crawl depth must not be negative.")

    crawl(start_url, crawl_depth)
|