# app.py
import urllib.request
from html_table_parser.parser import HTMLTableParser
import pandas as pd
import os


def url_get_contents(url: str) -> bytes:
    """Fetch *url* over HTTP(S) and return the raw response body as bytes.

    Raises:
        urllib.error.URLError / HTTPError on network or HTTP failure.
    """
    req = urllib.request.Request(url=url)
    # Context manager guarantees the HTTP response is closed even if
    # read() raises — the original leaked the open connection.
    with urllib.request.urlopen(req) as resp:
        return resp.read()


# Create output directory (exist_ok makes reruns idempotent).
output_folder = "scraped_tables_excel"
os.makedirs(output_folder, exist_ok=True)

# URL to scrape
url = 'https://manuals.gfi.com/en/exinda/help/content/exos/sql-access/sql-urls.htm'
# NOTE(review): assumes the page is UTF-8 encoded — confirm against the
# response's Content-Type header if this ever garbles text.
xhtml = url_get_contents(url).decode('utf-8')

# Parse every <table> element out of the page; HTMLTableParser exposes
# them afterwards as p.tables (a list of row-lists).
p = HTMLTableParser()
p.feed(xhtml)

# Save each table as its own Excel workbook: table_1.xlsx, table_2.xlsx, ...
for idx, table in enumerate(p.tables, start=1):
    df = pd.DataFrame(table)
    excel_filename = os.path.join(output_folder, f'table_{idx}.xlsx')
    df.to_excel(excel_filename, index=False)
    print(f"Saved: {excel_filename}")