# sitemap_generator.py
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
import xml.etree.ElementTree as ET


class SitemapSpider(CrawlSpider):
    """Crawl a site and record every visited page URL into sitemap.xml."""

    name = 'sitemap_spider'
    allowed_domains = ['geeksforgeeks.org']  # Replace with the target site
    start_urls = ['https://www.geeksforgeeks.org/']  # Replace with the target URL

    SITEMAP_FILE = 'sitemap.xml'
    SITEMAP_NS = 'http://www.sitemaps.org/schemas/sitemap/0.9'

    rules = (
        Rule(LinkExtractor(), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        """Callback for every crawled page: record its URL in the sitemap."""
        self.save_url(response.url)

    def save_url(self, url):
        """Append *url* to the sitemap file, skipping duplicates.

        The XML tree is parsed (or created) once and cached on the
        instance; the original implementation re-parsed and rewrote the
        whole file for every URL, which is quadratic in crawl size.

        Parameters
        ----------
        url : str
            Absolute URL of a crawled page.
        """
        if not hasattr(self, '_sitemap_tree'):
            try:
                self._sitemap_tree = ET.parse(self.SITEMAP_FILE)
            except (FileNotFoundError, ET.ParseError):
                # No sitemap yet, or an empty/corrupt one: start fresh.
                root = ET.Element('urlset', xmlns=self.SITEMAP_NS)
                self._sitemap_tree = ET.ElementTree(root)

        root = self._sitemap_tree.getroot()

        # A previously written file is re-read with the default namespace
        # applied, so <loc> tags may be namespace-qualified; check both
        # forms to suppress duplicates from re-crawled pages.
        ns_loc = '{%s}loc' % self.SITEMAP_NS
        if any(el.text == url for el in root.iter() if el.tag in ('loc', ns_loc)):
            return

        url_elem = ET.SubElement(root, 'url')
        loc = ET.SubElement(url_elem, 'loc')
        loc.text = url

        # Persist after each addition so an interrupted crawl still
        # leaves a valid sitemap on disk.
        self._sitemap_tree.write(self.SITEMAP_FILE, encoding='UTF-8',
                                 xml_declaration=True)
        self.logger.info('URL saved: %s', url)