Before running the program, you need to install the following Python libraries:
requests: For sending HTTP requests to fetch the web pages.
BeautifulSoup (from bs4): For parsing and extracting data from HTML.
xml.etree.ElementTree (included in the Python standard library): For creating and handling XML files.
urllib.parse (also included in the standard library): For URL manipulation.

Follow these steps to install the prerequisites and Python on a Linux system:
Check whether Python is already installed:
python3 --version
Update the package index and install Python along with pip:
sudo apt update
sudo apt install python3 python3-pip
Install the required libraries:
pip3 install requests beautifulsoup4
Follow these steps to install Python and the required libraries on Termux:
pkg install python
pip install requests beautifulsoup4
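Before moving on, you can optionally confirm that the third-party libraries import correctly on either system. The short check below is only a sanity test and is not part of the crawler; run it with python3 on Linux or python on Termux:

import requests
import bs4

# If both imports succeed, the installation is complete.
print("requests and beautifulsoup4 are installed and importable")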
To create the Python script file, use a text editor on any system (nano, vim, or gedit on Linux, or any editor in Termux) and save the file with a .py extension, for example, web_crawler.py. Then add the following code:

import requests
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET
from urllib.parse import urljoin, urlparse
import os
from datetime import datetime
import threading
import time
# Target site
BASE_URL = "https://www.miralishahidi.ir"
visited_urls = set()
urls_to_visit = [BASE_URL]
lock = threading.Lock()
# Path to save the file in the current directory
download_path = os.path.join(os.getcwd(), "sitemap.xml")
# Number of concurrent threads
NUM_THREADS = 5

def is_valid(url):
    """Checks if the URL is related to your site and does not end in a PDF file."""
    parsed = urlparse(url)
    return (parsed.netloc == urlparse(BASE_URL).netloc) and not url.lower().endswith(".pdf")

def crawl(url):
    """Crawl a web page and extract its links."""
    # Check and mark the URL under the lock so two threads cannot crawl the same page
    with lock:
        if url in visited_urls:
            return
        visited_urls.add(url)
    print(f"Crawling URL: {url}")
    try:
        response = requests.get(url, timeout=5)
        response.raise_for_status()  # Check the response status
        soup = BeautifulSoup(response.text, "html.parser")
        with lock:
            for link in soup.find_all("a"):
                href = link.get("href")
                if href:
                    full_url = urljoin(BASE_URL, href)
                    if is_valid(full_url) and full_url not in visited_urls:
                        urls_to_visit.append(full_url)
        print(f"Successfully crawled: {url}")
    except requests.exceptions.RequestException as e:
        print(f"Failed to crawl {url}: {e}")

def worker():
    """Thread worker function."""
    while True:
        with lock:
            if not urls_to_visit:
                break
            url = urls_to_visit.pop(0)
        crawl(url)
        time.sleep(1)  # Add a delay between requests

def generate_sitemap(urls):
    """Generate a sitemap file in XML format."""
    try:
        # Ensure the save path is accessible
        os.makedirs(os.path.dirname(download_path), exist_ok=True)
        print("Generating sitemap.xml...")
        urlset = ET.Element("urlset", xmlns="http://www.sitemaps.org/schemas/sitemap/0.9")
        for url in urls:
            url_element = ET.Element("url")
            loc_element = ET.Element("loc")
            loc_element.text = url
            lastmod_element = ET.Element("lastmod")
            lastmod_element.text = datetime.now().strftime("%Y-%m-%d")
            changefreq_element = ET.Element("changefreq")
            changefreq_element.text = "weekly"
            priority_element = ET.Element("priority")
            priority_element.text = "0.8"
            url_element.append(loc_element)
            url_element.append(lastmod_element)
            url_element.append(changefreq_element)
            url_element.append(priority_element)
            urlset.append(url_element)
        tree = ET.ElementTree(urlset)
        tree.write(download_path, encoding="utf-8", xml_declaration=True)
        print(f"sitemap.xml generated successfully at {download_path}")
    except IOError as e:
        print(f"Failed to save sitemap.xml: {e}")

# Start crawling from the main site
print("Starting the crawl process with threading...")
threads = []
for _ in range(NUM_THREADS):
    thread = threading.Thread(target=worker)
    threads.append(thread)
    thread.start()

for thread in threads:
    thread.join()

# Generate the sitemap.xml file
print("Crawling finished. Generating sitemap...")
generate_sitemap(visited_urls)
To run the program:
python3 web_crawler.py
The script starts by defining the target URL and initializing the data structures needed for crawling and generating the sitemap. It then uses multiple worker threads to crawl the site concurrently, pausing one second between requests. Once crawling is finished, it generates a sitemap.xml file listing all the visited URLs.
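For reference, each entry the script writes follows the standard sitemap schema. A generated file will look roughly like the example below; the URL and date are placeholders, and the output is shown indented here for readability (ElementTree writes it without line breaks unless you add indentation yourself):

<?xml version='1.0' encoding='utf-8'?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url>
    <loc>https://www.miralishahidi.ir</loc>
    <lastmod>2024-01-01</lastmod>
    <changefreq>weekly</changefreq>
    <priority>0.8</priority>
  </url>
</urlset>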
After the program runs successfully, you’ll find a sitemap.xml file in the directory you ran the script from (the current working directory). This file can be submitted to search engines like Google to help them index your site more efficiently.
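Besides submitting the file manually (for example, through Google Search Console), a common convention is to reference it from your site's robots.txt so crawlers can discover it on their own. Assuming you upload sitemap.xml to the site root, that line would be:

Sitemap: https://www.miralishahidi.ir/sitemap.xml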