Added codes 1.1, 1.2, 1.3 and 1.5
This commit is contained in:
@@ -0,0 +1,35 @@
|
||||
# pip install requests beautifulsoup4
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
import time
|
||||
|
||||
def crawl(url, depth, visited=None, *, timeout=10):
    """Recursively crawl pages starting at ``url``, printing each page fetched.

    Args:
        url: Absolute URL of the page to fetch.
        depth: Number of link levels to fetch, counting this page as the
            first. Zero or a negative value fetches nothing.
        visited: Optional set of URLs that have already been crawled. It is
            updated in place and passed to the recursive calls so that no
            page is fetched twice. A new set is created when omitted.
        timeout: Seconds to wait on a request before treating it as failed.

    Returns:
        None. Progress and failures are reported with ``print``.
    """
    from urllib.parse import urldefrag, urljoin

    # ``<=`` rather than ``==`` so a negative depth cannot recurse forever.
    if depth <= 0:
        return
    if visited is None:
        visited = set()
    if url in visited:
        return
    # Mark before fetching so a failing URL is not retried via another page.
    visited.add(url)

    try:
        # Without a timeout a stalled server would block the crawl forever.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Check for HTTP errors
    except requests.RequestException as e:
        print(f"Failed to retrieve {url}: {e}")
        return

    print(f"Crawling: {url}")

    # At the last level no link would be fetched, so skip the parsing and
    # the per-link delay entirely.
    if depth == 1:
        return

    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all links in the HTML, resolved against the final (post-redirect)
    # URL so relative hrefs are followed too.
    links = set()
    for anchor in soup.find_all('a', href=True):
        try:
            full_url = urldefrag(urljoin(response.url, anchor['href'])).url
        except ValueError:
            # Malformed href (e.g. a broken IPv6 literal); nothing to crawl.
            continue
        # Keep web pages only; drops mailto:, javascript:, tel:, etc.
        if full_url.startswith(('http://', 'https://')):
            links.add(full_url)

    # Recursively crawl each link that has not been seen yet
    for link in links:
        if link in visited:
            continue
        time.sleep(1)  # Be polite and avoid overwhelming the server
        crawl(link, depth - 1, visited, timeout=timeout)
|
||||
|
||||
if __name__ == "__main__":
    # Interactive entry point: prompt for a start URL and a depth, then crawl.
    # Surrounding whitespace from the prompt would make the URL invalid.
    start_url = input("Enter the URL to crawl: ").strip()
    depth_text = input("Enter the crawl depth: ")
    try:
        crawl_depth = int(depth_text)
    except ValueError:
        # Exit with a readable message instead of a raw traceback.
        raise SystemExit(
            f"Invalid crawl depth {depth_text!r}: expected a whole number."
        ) from None
    crawl(start_url, crawl_depth)
|
||||
Reference in New Issue
Block a user