app.py
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 |
"""Generate an XML sitemap for a website from the links on its homepage."""

import xml.etree.ElementTree as ET
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

# Official sitemap protocol namespace (sitemaps.org).
SITEMAP_NS = "http://www.sitemaps.org/schemas/sitemap/0.9"


def get_all_links(url):
    """Return the set of same-site http(s) URLs linked from *url*.

    Only links whose host matches *url*'s host are kept: the sitemap
    protocol requires a sitemap to list only URLs of the site itself,
    so external links found on the page are excluded.

    Raises requests.HTTPError on a non-2xx response rather than
    silently building a sitemap from an error page.
    """
    # Timeout so a stalled server cannot hang the crawl forever.
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    base_host = urlparse(url).netloc
    links = set()
    for a_tag in soup.find_all("a", href=True):
        # Resolve relative hrefs against the page URL.
        full_url = urljoin(url, a_tag["href"])
        parsed = urlparse(full_url)
        # Keep only http(s) links that stay on the same host.
        if parsed.scheme in ("http", "https") and parsed.netloc == base_host:
            links.add(full_url)
    return links


def _build_sitemap_tree(urls):
    """Build an ElementTree <urlset> document from an iterable of URLs."""
    urlset = ET.Element("urlset", xmlns=SITEMAP_NS)
    # Sort for deterministic, diff-friendly output (sets have no order).
    for link in sorted(urls):
        url_element = ET.SubElement(urlset, "url")
        loc = ET.SubElement(url_element, "loc")
        loc.text = link
    return ET.ElementTree(urlset)


def create_sitemap(url, filename="sitemap.xml"):
    """Crawl *url* and write an XML sitemap of its same-site links to *filename*."""
    urls = get_all_links(url)
    print(urls)
    tree = _build_sitemap_tree(urls)
    tree.write(filename, encoding="UTF-8", xml_declaration=True)
    print("XML SITEMAP GENERATED YOU CAN CHECK")


if __name__ == "__main__":
    site_url = "https://freemediatools.com"
    create_sitemap(site_url, "sitemap.xml")