Welcome, folks! In this blog post we will build an online YouTube comments scraper using Node.js and Puppeteer.
Live Demo
You can see a live demo of the web application here.
Features of Youtube Comments Scraper
- It scrapes all the comments of a YouTube video in no time.
- The efficiency of the scraper is quite good.
- It saves all the YouTube comments in a text file and downloads it.
Get Started
To get started, create an empty directory and initialize a Node.js project inside it by running this command:
npm init -y
Then install the following Node.js modules:
npm i express
npm i ejs
npm i nodemon
npm i puppeteer
After installing these modules, create an index.js file in your root directory and paste the following code into it:
index.js
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 |
/**
 * YouTube comments scraper server.
 *
 * Routes:
 *   GET  /youtubecommentsscraper  renders the form page.
 *   POST /getcomments             body: { url }. Scrapes the comments of the
 *                                 video with Puppeteer, writes them to a text
 *                                 file next to this script and answers with
 *                                 { path: <file name> }.
 *   GET  /download?path=<name>    sends a file produced by /getcomments as a
 *                                 download and deletes it afterwards.
 */
const puppeteer = require('puppeteer');
const fs = require('fs');
const express = require('express');
const path = require('path');

const app = express();

// Express (>= 4.16) ships its own JSON/urlencoded parsers, so the separate
// `body-parser` package (which the install steps never add) is not required.
app.use(express.json());
app.use(express.urlencoded({ extended: false }));
app.set('view engine', 'ejs');

const PORT = 5000;

// Only files named like the ones /getcomments generates may be downloaded.
const OUTPUT_FILE_PATTERN = /^\d+comments\.txt$/;

// Pixels scrolled after each comment; should be <= window.innerHeight.
const SCROLL_DISTANCE = 800;
// Pause (ms) after each scroll step.
const SCROLL_DELAY_MS = 2;

/**
 * Resolves after `ms` milliseconds. Used instead of Puppeteer's deprecated
 * `page.waitFor`, which is not available in current releases.
 * @param {number} ms
 * @returns {Promise<void>}
 */
const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

/**
 * Extracts the integer comment count from a label such as "1,234 Comments".
 * Every non-digit character (thousands separators included) is discarded.
 * @param {string} text
 * @returns {number} the parsed count, or 0 when the text has no digits.
 */
function parseCommentCount(text) {
  const digits = String(text).replace(/\D/g, '');
  return digits === '' ? 0 : Number.parseInt(digits, 10);
}

/**
 * Checks that `value` is an http(s) URL on a YouTube host, so the headless
 * browser is never pointed at an arbitrary address supplied by the client.
 * @param {*} value
 * @returns {boolean}
 */
function isYoutubeUrl(value) {
  let parsed;
  try {
    parsed = new URL(value);
  } catch (err) {
    return false;
  }
  if (parsed.protocol !== 'https:' && parsed.protocol !== 'http:') {
    return false;
  }
  const host = parsed.hostname.toLowerCase();
  return host === 'youtu.be' || host === 'youtube.com' || host.endsWith('.youtube.com');
}

/**
 * Returns the innerText of the first element matching `selector`.
 * The caller must have waited for the selector; a missing element throws.
 * @param {object} page - Puppeteer page.
 * @param {string} selector - CSS selector.
 * @returns {Promise<string>}
 */
async function getElText(page, selector) {
  return page.evaluate((sel) => document.querySelector(sel).innerText, selector);
}

/**
 * Loads the video page and collects "author:comment" entries until the
 * advertised comment count is reached or no further comment thread appears.
 * @param {object} page - Puppeteer page.
 * @param {string} url - video URL (already validated).
 * @returns {Promise<string[]>} one formatted entry per scraped comment.
 */
async function scrapeComments(page, url) {
  await page.setViewport({ width: 1280, height: 800 });
  // timeout: 0 disables the navigation timeout for slow page loads.
  await page.goto(url, { waitUntil: 'load', timeout: 0 });
  await page.waitForSelector('h1.title');

  // Scroll one screen down and pause before waiting for the comments section.
  await page.evaluate(() => {
    window.scrollBy(0, window.innerHeight);
  });
  await sleep(2000);
  await page.waitForSelector('#comments');

  const countSelector = '#count > yt-formatted-string';
  await page.waitForSelector(countSelector);
  const totalComments = parseCommentCount(await getElText(page, countSelector));
  console.log(`Comments reported by the page: ${totalComments}`);

  const comments = [];
  // nth-child is 1-based, so the loop is inclusive of totalComments.
  for (let i = 1; i <= totalComments; i++) {
    const authorSelector = `.style-scope:nth-child(${i}) > #comment > #body > #main > #header > #header-author > #author-text > .style-scope`;
    const commentSelector = `.style-scope:nth-child(${i}) > #comment > #body > #main > #expander #content-text`;

    try {
      await page.waitForSelector(commentSelector);
      await page.waitForSelector(authorSelector);
    } catch (err) {
      // The i-th thread never appeared; stop and keep what was collected.
      break;
    }

    const commentText = await getElText(page, commentSelector);
    const author = await getElText(page, authorSelector);

    // Scroll further down after each comment.
    await page.evaluate((y) => {
      document.scrollingElement.scrollBy(0, y);
    }, SCROLL_DISTANCE);
    await sleep(SCROLL_DELAY_MS);

    if (!commentText) {
      break;
    }
    // Trim leading/trailing whitespace on every line of the comment.
    const stripped = commentText.replace(/^\s+|\s+$/gm, '');
    comments.push(`${author}:${stripped}\n\n`);
  }
  return comments;
}

app.get('/youtubecommentsscraper', (req, res) => {
  res.render('youtubecommentsscraper', { title: 'Youtube Comments Scraper' });
});

app.get('/download', (req, res) => {
  const requested = typeof req.query.path === 'string' ? req.query.path : '';
  const fileName = path.basename(requested);

  // Reject anything that is not a bare, generated file name. Without this a
  // request such as ?path=index.js (or ../something) could read AND delete
  // arbitrary files, because the file is removed after it is sent.
  if (fileName !== requested || !OUTPUT_FILE_PATTERN.test(fileName)) {
    res.status(400).send('Invalid file name');
    return;
  }

  const fullPath = path.join(__dirname, fileName);
  res.download(fullPath, (err) => {
    if (err && !res.headersSent) {
      res.status(err.status || 500).send('Unable to download the requested file');
    }
    // Remove the temporary file exactly once, whether or not sending worked.
    fs.unlink(fullPath, (unlinkErr) => {
      if (unlinkErr && unlinkErr.code !== 'ENOENT') {
        console.error('Could not remove temporary file:', unlinkErr);
      }
    });
  });
});

app.post('/getcomments', async (req, res) => {
  const url = req.body ? req.body.url : undefined;
  if (!isYoutubeUrl(url)) {
    res.status(400).json({ error: 'A valid YouTube video URL is required' });
    return;
  }

  let browser;
  try {
    browser = await puppeteer.launch({ headless: true });
    const page = await browser.newPage();
    const comments = await scrapeComments(page, url);

    // Written next to this script because /download resolves names against
    // __dirname, not against the process working directory.
    const outputFileName = `${Date.now()}comments.txt`;
    await fs.promises.writeFile(path.join(__dirname, outputFileName), comments.join(''));

    res.json({ path: outputFileName });
  } catch (err) {
    // Express 4 does not catch rejections from async handlers; without this
    // the request would hang on any scraping failure.
    console.error('Failed to scrape comments:', err);
    if (!res.headersSent) {
      res.status(500).json({ error: 'Failed to scrape comments' });
    }
  } finally {
    // Always release the Chromium process, also on failure.
    if (browser) {
      await browser.close().catch((closeErr) => {
        console.error('Failed to close the browser:', closeErr);
      });
    }
  }
});

app.listen(PORT, () => {
  console.log(`App is listening on Port ${PORT}`);
});
Now create a views folder inside the root directory, create a youtubecommentsscraper.ejs file inside it, and paste the following code into it:
views/youtubecommentsscraper.ejs
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 |
<!DOCTYPE html>
<!--
  Form page of the comments scraper. Submitting the form POSTs the video URL
  to /getcomments; the response carries the name of the generated text file,
  which is then fetched from /download.
-->
<html>
<head>
  <title><%=title%></title>
  <link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/css/bootstrap.min.css" />
</head>
<body>
  <div class="container">
    <h1 class="text-center">
      Youtube Comments Scraper
    </h1>
    <form id="form">
      <div class="form-group">
        <!-- "for" must match the input id so the label is associated with it -->
        <label for="url">Youtube URL:</label>
        <input class="form-control" type="text" name="file" id="url" placeholder="Enter Youtube URL" required>
      </div>
      <div class="form-group">
        <button id="button" class="btn btn-block btn-danger">
          Get Comments
        </button>
      </div>
    </form>
    <br><br>
  </div>

  <!-- Scripts belong inside <body>; markup after </body> is invalid HTML -->
  <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.4.1/jquery.min.js"></script>
  <script>
    $("#form").submit(function (e) {
      e.preventDefault();

      var $button = $("#button");
      $button.text("Getting Comments Please Wait").prop("disabled", true);

      $.ajax({
        method: "POST",
        url: "/getcomments",
        data: { url: $("#url").val() }
      })
        .done(function (data) {
          // Navigate to the download URL instead of window.open(): popups
          // opened from an async callback are commonly blocked. The file is
          // served as an attachment, so this page stays in place.
          window.location.href = "/download?path=" + encodeURIComponent(data.path);
          $("#form")[0].reset();
        })
        .fail(function () {
          alert("Could not fetch the comments. Please check the URL and try again.");
        })
        .always(function () {
          // Re-enable the button on success and on failure alike.
          $button.text("Get Comments").prop("disabled", false);
        });
    });
  </script>
</body>
</html>
Screenshot
DOWNLOAD SOURCE CODE