Python 3 Tkinter GUI Script to Build a Crawler to Scrape all Links or URLs From Website Using BeautifulSoup4 Library Full Project For Beginners

Welcome folks, today in this blog post we will be building a website crawler that scrapes all the links from a website using the tkinter and beautifulsoup4 libraries. The full source code of the application is given below.

Get Started

In order to get started we need to install the following libraries using the pip command as shown below. Note that tkinter ships with Python itself and does not need to be installed through pip, but the script also uses the requests library and the lxml parser, so install those alongside beautifulsoup4.

pip install requests

pip install bs4

pip install lxml
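If you want to quickly confirm that everything installed correctly, this one-liner simply imports the three modules; it exits silently on success and raises an ImportError otherwise.

python -c "import requests, bs4, lxml"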

After installing these libraries, create an app.py file and copy-paste the following code into it.

app.py

import tkinter as tk
import threading

import Crawl


class Application(tk.Frame):
    def __init__(self, master=None):
        super().__init__(master)
        self.pack()
        self.create_widgets()

    def create_widgets(self):
        # Entry widget where the user types the URL to crawl
        self.insert = tk.Entry(self)
        self.insert.pack(side="top")
        self.insert.focus_set()

        # Button that starts the crawl
        self.hi_there = tk.Button(self)
        self.hi_there["text"] = "Click Here"
        self.hi_there["command"] = self.say_hi
        self.hi_there.pack(side="top")

        self.quit = tk.Button(
            self, text="QUIT", fg="red", command=self.master.destroy
        )

        self.quit.pack(side="bottom")

    def say_hi(self):
        print("Executed")
        # Run the crawler on a background thread so the GUI stays
        # responsive; a daemon thread exits when the window is closed
        process = threading.Thread(
            target=Crawl.Main, args=(self.insert.get(),)
        )
        process.daemon = True
        process.start()


root = tk.Tk()
app = Application(master=root)
app.mainloop()
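If you want to try the crawler without going through the GUI, a minimal sketch like the one below calls Crawl.Main directly; the file name and URL here are only placeholders, so substitute the site you actually want to crawl.

test_crawl.py

import Crawl

# Placeholder URL -- replace with the site you want to crawl
Crawl.Main("https://example.com")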

Now create a Crawl.py file inside the root directory and copy-paste the following code into it.

Crawl.py

import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup


def Main(url_1):
    i = 1

    def Crawl(url_l, i):
        # Dump the text content of every tag on the page into a numbered file
        url = url_l
        source_code = requests.get(url)
        soup = BeautifulSoup(source_code.text, "lxml")

        with open("Data" + str(i) + ".txt", 'w', encoding='utf-8') as file:
            list_of_tags = set()
            for tag in soup.find_all():
                list_of_tags.add(tag.name)

            for tag in list_of_tags:
                file.write("-- " + tag.upper() + ' --\n')
                file.write("------------------\n")
                for link in soup.find_all(tag):
                    if link.string is not None:
                        file.write(link.string + '\n')
                file.write("------------------\n\n")

    def WriteLinks(url_l):
        # Collect every <a> tag on the page and append unseen links
        # to Hyperlinks.txt
        url = url_l
        source_code = requests.get(url)
        soup = BeautifulSoup(source_code.text, "lxml")

        with open("Hyperlinks.txt", 'a') as file2:
            for tag in soup.find_all('a'):
                if tag.get("href") is not None:
                    # urljoin resolves relative links against the page URL
                    string = urljoin(url, tag.get("href"))

                    if string + "\n" not in set_of_urls:
                        set_of_urls.add(string + "\n")
                        file2.write(string + "\n")

    # Make sure Hyperlinks.txt exists so the first run does not crash
    open("Hyperlinks.txt", 'a').close()

    with open("Hyperlinks.txt", 'r') as file2:
        lis = file2.readlines()
        try:
            if lis[0].split('\n')[0] == url_1:
                pass
            else:
                # A different start URL was entered, so reset the link list
                lis.clear()
                lis.append(url_1 + '\n')
                with open("Hyperlinks.txt", 'w') as file_new:
                    file_new.write(url_1 + '\n')
        except IndexError:
            # The file was empty, so seed it with the start URL
            lis.append(url_1 + '\n')
            with open("Hyperlinks.txt", 'w') as file_new:
                file_new.write(url_1 + '\n')

        set_of_urls = set(lis)

        # prof.txt holds one keyword per line; links containing a keyword
        # get copied into Refined_links.txt
        try:
            with open("prof.txt", 'r') as f:
                list_of_words = f.readlines()
        except FileNotFoundError:
            list_of_words = []

        for line in lis:
            print(line.split('\n')[0] + "\n")
            try:
                # Uncomment to also dump each page's tags into DataN.txt
                # Crawl(line.split('\n')[0], i)
                i = i + 1
                WriteLinks(line.split('\n')[0])
                for line1 in list_of_words:
                    if line.split('\n')[0].find(line1.split('\n')[0]) != -1:
                        with open("Refined_links.txt",
                                  "a",
                                  encoding='utf-8') as writefile:

                            line_split = line.split('\n')[0]
                            line1_split = line1.split('\n')[0]
                            writefile.write(f'{line_split}\t{line1_split}\n')

                        break
            except Exception as e:
                print(e)
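One thing to note: Crawl.py looks for a prof.txt file in the same directory. It reads one keyword per line and copies any crawled link containing a keyword into Refined_links.txt (with the fallback above, a missing prof.txt simply means no refined links are written). The keywords below are just placeholder examples, so list whatever terms you want to filter on.

prof.txt

contact
about
blog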

Now run the Python script app.py by typing the below command, enter a starting URL in the entry box, and press the Click Here button to start the crawl.

python app.py

As the crawler runs, every link it discovers is appended to Hyperlinks.txt, and any link containing one of the keywords from prof.txt is also written to Refined_links.txt.
