Welcome folks, in today's blog post we will be building a website crawler that scrapes all the links from a website, using a tkinter GUI and the beautifulsoup4 library. All the source code for the application is given below.
Get Started
In order to get started we need to install the following libraries using the pip
command as shown below. Note that tkinter ships with the standard Python installer, so it usually does not need to be installed separately (on some Linux distributions you may need the python3-tk system package). The crawler also uses requests and the lxml parser, so install those as well.
pip install requests
pip install bs4
pip install lxml
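If you want to confirm that everything installed correctly, a quick sanity check (just a sketch, run from your terminal) is to import everything the project uses:
python -c "import tkinter, requests, bs4, lxml; print('All imports OK')"
If this prints All imports OK, you are ready to go.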
After installing these libraries, create an app.py
file and paste the following code into it
app.py
import tkinter as tk
import threading

import Crawl


class Application(tk.Frame):
    def __init__(self, master=None):
        super().__init__(master)
        self.pack()
        self.create_widgets()

    def create_widgets(self):
        # Entry widget where the user types the URL to crawl
        self.insert = tk.Entry(self)
        self.insert.pack(side="top")
        self.insert.focus_set()

        # Button that starts the crawl
        self.hi_there = tk.Button(self)
        self.hi_there["text"] = "Click Here"
        self.hi_there["command"] = self.say_hi
        self.hi_there.pack(side="top")

        self.quit = tk.Button(
            self, text="QUIT", fg="red",
            command=root.destroy
        )
        self.quit.pack(side="bottom")

    def say_hi(self):
        print("Executed")
        # Run the crawler in a background daemon thread so the GUI
        # stays responsive while pages are being fetched
        process = threading.Thread(
            target=Crawl.Main, args=(self.insert.get(),)
        )
        process.daemon = True
        process.start()


root = tk.Tk()
app = Application(master=root)
app.mainloop()
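A quick note on the design: the button handler starts Crawl.Main in a separate thread so that the tkinter mainloop keeps redrawing the window while pages are downloaded. Because the thread is marked as a daemon, closing the window (or pressing QUIT) ends the program immediately instead of waiting for the crawl to finish.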
Now create a Crawl.py
file inside the same root directory and paste the following code into it
Crawl.py
import requests
from bs4 import BeautifulSoup


def Main(url_1):
    i = 1

    # Dumps the text content of every tag on the page into DataN.txt
    def Crawl(url_l, i):
        url = url_l
        source_code = requests.get(url)
        soup = BeautifulSoup(source_code.text, "lxml")
        with open("Data" + str(i) + ".txt", 'w', encoding='utf-8') as file:
            list_of_tags = set()
            for tag in soup.find_all():
                list_of_tags.add(tag.name)
            for tag in list_of_tags:
                file.write("-- " + tag.upper() + ' --\n')
                file.write("------------------\n")
                for link in soup.find_all(tag):
                    if link.string is not None:
                        file.write(link.string + '\n')
                file.write("------------------\n\n")

    # Collects every <a href="..."> on the page and appends the new ones
    # to Hyperlinks.txt
    def WriteLinks(url_l):
        url = url_l
        source_code = requests.get(url)
        soup = BeautifulSoup(source_code.text, "lxml")
        with open("Hyperlinks.txt", 'a') as file2:
            for tag in soup.find_all('a'):
                if tag.get("href") is not None:
                    if tag.get("href").startswith("http"):
                        string = tag.get("href")
                    else:
                        # Turn a relative link into an absolute one
                        # using the starting URL
                        string = \
                            lis[0].split('\n')[0].rstrip('/') + tag.get("href")
                    if string + "\n" not in set_of_urls:
                        set_of_urls.add(string + "\n")
                        file2.write(string + "\n")

    # Load the links collected in a previous run; start with an empty list
    # if the file does not exist yet
    try:
        with open("Hyperlinks.txt", 'r') as file2:
            lis = file2.readlines()
    except FileNotFoundError:
        lis = []

    # If the file is empty or belongs to a different starting URL,
    # reset it with the URL entered in the GUI
    try:
        if lis[0].split('\n')[0] != url_1:
            lis.clear()
            lis.append(url_1 + '\n')
            with open("Hyperlinks.txt", 'w') as file_new:
                file_new.write(url_1 + '\n')
    except IndexError:
        lis.append(url_1 + '\n')
        with open("Hyperlinks.txt", 'w') as file_new:
            file_new.write(url_1 + '\n')

    set_of_urls = set(lis)

    # prof.txt holds the keywords (one per line) used to filter links
    # into Refined_links.txt; skip the filtering if it is missing
    try:
        with open("prof.txt", 'r') as f:
            list_of_words = f.readlines()
    except FileNotFoundError:
        list_of_words = []

    # Only the URLs already in lis are visited in this pass; links found
    # now are written to Hyperlinks.txt and picked up on the next run
    for line in lis:
        print(line.split('\n')[0] + "\n")
        try:
            # Crawl(line.split('\n')[0], i)
            i = i + 1
            WriteLinks(line.split('\n')[0])
            for line1 in list_of_words:
                if line.split('\n')[0].find(line1.split('\n')[0]) != -1:
                    with open("Refined_links.txt", "a", encoding='utf-8') as writefile:
                        line_split = line.split('\n')[0]
                        line1_split = line1.split('\n')[0]
                        writefile.write(f'{line_split}\t{line1_split}\n')
                    break
        except Exception as e:
            print(e)
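Crawl.py looks for a prof.txt file in the same directory: one keyword per line, and any collected link whose URL contains one of those keywords is copied into Refined_links.txt. The snippet below is only a convenience sketch for creating a starter file; the file name prof.txt comes from the code above, but the keywords shown are placeholders you should replace with your own.
# make_prof.py (hypothetical helper) -- writes a starter keyword file
keywords = ["blog", "tutorial", "python"]  # placeholder keywords, use your own

with open("prof.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(keywords) + "\n")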
Now execute the app.py
script by typing the command below
python app.py
A small window will appear with an input box and two buttons. Type the URL you want to crawl into the input box and click the "Click Here" button; the crawler runs in the background, writes every link it finds to Hyperlinks.txt, and copies the keyword-matched links to Refined_links.txt. Clicking the button again (or re-running the script) with the same URL crawls the links collected in the previous pass.
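If you would rather test the crawler without the GUI first, Crawl.Main can also be called directly from a short script (a minimal sketch; the file name and the URL below are only examples):
# headless_test.py (hypothetical helper) -- runs one crawl pass and shows the result
import Crawl

Crawl.Main("https://example.com")  # replace with the site you want to crawl

with open("Hyperlinks.txt", encoding="utf-8") as f:
    print(f.read())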