Web Crawler and Sitemap Generator

1. Prerequisites

Before running the program, you need to install the following Python libraries: requests (used to download pages) and beautifulsoup4 (which provides the BeautifulSoup HTML parser).

Installation on Linux

Follow these steps to install the prerequisites and Python on a Linux system:
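
On a Debian- or Ubuntu-based distribution, for example, the installation would look roughly like this (other distributions use their own package manager, but the pip step is the same):

sudo apt update
sudo apt install python3 python3-pip
pip3 install requests beautifulsoup4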

Installation on Termux (Android)

Follow these steps to install Python and the required libraries on Termux:
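
In Termux, Python (which also provides pip) is installed through the pkg package manager; the commands would look roughly like this:

pkg update
pkg install python
pip install requests beautifulsoup4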

2. Creating the Script File

To create the Python script file on any system:
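
For example, open a new file in any text editor, paste the program code from section 3 into it, and save it. The file name below (sitemap_crawler.py) is only an example; any name will work:

nano sitemap_crawler.py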

3. The Complete Program Code

import requests
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET
from urllib.parse import urljoin, urlparse
import os
from datetime import datetime
import threading
import time

# Target site
BASE_URL = "https://www.miralishahidi.ir"
visited_urls = set()
urls_to_visit = [BASE_URL]
lock = threading.Lock()
active_workers = 0  # Number of URLs currently being crawled by worker threads

# Path to save the file in the current directory
download_path = os.path.join(os.getcwd(), "sitemap.xml")

# Number of concurrent threads
NUM_THREADS = 5

def is_valid(url):
    """Checks if the URL is related to your site and does not end in a PDF file."""
    parsed = urlparse(url)
    return (parsed.netloc == urlparse(BASE_URL).netloc) and not url.lower().endswith(".pdf")

def crawl(url):
    """Fetch a page and queue the internal links it contains."""
    # Check and mark the URL under the lock so two threads never crawl the same page
    with lock:
        if url in visited_urls:
            return
        visited_urls.add(url)

    print(f"Crawling URL: {url}")
    
    try:
        response = requests.get(url, timeout=5)
        response.raise_for_status()  # Raise an exception for HTTP error responses
        soup = BeautifulSoup(response.text, "html.parser")
        
        with lock:
            for link in soup.find_all("a"):
                href = link.get("href")
                if href:
                    full_url = urljoin(BASE_URL, href)
                    if is_valid(full_url) and full_url not in visited_urls:
                        urls_to_visit.append(full_url)
        print(f"Successfully crawled: {url}")
    
    except requests.exceptions.RequestException as e:
        print(f"Failed to crawl {url}: {e}")

def worker():
    """Thread worker: pull URLs from the shared queue and crawl them."""
    global active_workers
    while True:
        url = None
        with lock:
            if urls_to_visit:
                url = urls_to_visit.pop(0)
                active_workers += 1
            elif active_workers == 0:
                break  # Queue is empty and no crawl is in progress: we are done
        if url is None:
            time.sleep(0.5)  # Queue momentarily empty; wait for other threads to add links
            continue
        crawl(url)
        with lock:
            active_workers -= 1
        time.sleep(1)  # Delay between requests to avoid overloading the server

def generate_sitemap(urls):
    """Generate a sitemap file in XML format."""
    try:
        # Ensure the save path is accessible
        os.makedirs(os.path.dirname(download_path), exist_ok=True)
        print("Generating sitemap.xml...")
        
        urlset = ET.Element("urlset", xmlns="http://www.sitemaps.org/schemas/sitemap/0.9")
        
        for url in urls:
            # SubElement creates each child element and attaches it to its parent
            url_element = ET.SubElement(urlset, "url")
            ET.SubElement(url_element, "loc").text = url
            ET.SubElement(url_element, "lastmod").text = datetime.now().strftime("%Y-%m-%d")
            ET.SubElement(url_element, "changefreq").text = "weekly"
            ET.SubElement(url_element, "priority").text = "0.8"
        
        tree = ET.ElementTree(urlset)
        tree.write(download_path, encoding="utf-8", xml_declaration=True)
        print(f"sitemap.xml generated successfully at {download_path}")
    
    except IOError as e:
        print(f"Failed to save sitemap.xml: {e}")

# Start crawling from the main site
print("Starting the crawl process with threading...")

threads = []
for _ in range(NUM_THREADS):
    thread = threading.Thread(target=worker)
    threads.append(thread)
    thread.start()

for thread in threads:
    thread.join()

# Generate the sitemap.xml file
print("Crawling finished. Generating sitemap...")
generate_sitemap(visited_urls)

4. Running the Program

To run the program:
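
From the directory where you saved the script, run it with Python (replace sitemap_crawler.py with the name you chose in step 2; on Termux the interpreter is invoked as python rather than python3):

python3 sitemap_crawler.py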

5. How It Works

The script defines the target URL and the shared data structures (a set of already-visited URLs and a queue of URLs still to visit), then starts several worker threads. Each worker repeatedly takes a URL from the queue, downloads the page, extracts the internal links that pass the is_valid filter, and adds any new ones back to the queue; a lock protects the shared state, and a one-second pause between requests keeps the crawl polite. When the queue is empty and no page is still being crawled, the workers exit and the script writes every visited URL into sitemap.xml.
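
To point the crawler at a different site or adjust how many pages it fetches in parallel, only the two constants near the top of the script need to be changed, for example:

BASE_URL = "https://www.example.com"  # the site to crawl (placeholder URL)
NUM_THREADS = 5                       # number of concurrent worker threads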

6. Output

After the program runs successfully, you’ll find a sitemap.xml file in the directory you ran the script from (the current working directory). This file can be submitted to search engines like Google to help them index your site more efficiently.
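
For reference, the generated file contains one <url> entry per crawled page, with lastmod set to the date the crawl ran. Shown pretty-printed here for readability (ElementTree actually writes it without line breaks), the output looks roughly like this:

<?xml version='1.0' encoding='utf-8'?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url>
    <loc>https://www.miralishahidi.ir</loc>
    <lastmod>2025-01-01</lastmod>
    <changefreq>weekly</changefreq>
    <priority>0.8</priority>
  </url>
</urlset>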