NodeJS 404 Checker Tutorial
Introduction
In this tutorial, we’ll be looking at how you can create a nice and simple 404
broken link checker in NodeJS using the axios
library as well as cheerio
and
is-relative-url
.
We will be taking advantage of the async and await keywords in this tutorial. You can find out more on them here: JavaScript async and await Tutorial
Let’s begin by creating a simple async
function that will retrieve all of the
links from a passed in URL
. We’ll call this getAllLinks(url)
. Within the
function body, we will attempt to retrieve the URL
passed in with a simple
HTTP GET
request.
We’ll use the await
keyword to succinctly set our result
variable as the
response of our GET
request. This saves us from having to do .then()
and
.catch()
when handling the Promise returned by the axios.get()
function
call.
const axios = require("axios");
const cheerio = require("cheerio");
async function getAllLinks(url) {
try {
let result = await axios.get(url);
$ = cheerio.load(result.data);
links = [];
$("a").each((i, link) => {
links.push(link);
});
return links;
} catch (err) {
console.log(err);
return err;
}
}
async function crawlPage() {
let list = await getAllLinks("https://tutorialedge.net");
console.log(list);
}
crawlPage();
We can then extend this further so that once it has retrieved this list of links
from our retrieved webpage, we then subsequently call another GET
HTTP request
to each of those links and print out the results:
async function crawlPage(url) {
let links = await getAllLinks(url);
for (let link of links) {
try {
let resp = {};
if (isRelativeUrl($(link).attr("href"))) {
resp = await axios.get(url + $(link).attr("href"));
} else {
resp = await axios.get($(link).attr("href"));
}
console.log(
"Valid Url: " +
$(link).attr("href") +
" returned status: " +
resp.status
);
} catch (err) {
console.log("Not a valid URL: " + $(link).attr("href"));
}
}
}
Full Source Code:
The full source code for this tutorial can be found below:
const axios = require("axios");
const cheerio = require("cheerio");
const isRelativeUrl = require("is-relative-url");
let brokenLinks = [];
async function getAllLinks(url) {
try {
let result = await axios.get(url);
$ = cheerio.load(result.data);
links = [];
$("a").each((i, link) => {
links.push(link);
});
return links;
} catch (err) {
console.log(err);
}
}
async function crawlPage(url) {
let links = await getAllLinks(url);
for (let link of links) {
try {
let resp = {};
if (isRelativeUrl($(link).attr("href"))) {
resp = await axios.get(url + $(link).attr("href"));
} else {
resp = await axios.get($(link).attr("href"));
}
console.log(
"Valid Url: " +
$(link).attr("href") +
" returned status: " +
resp.status
);
} catch (err) {
console.log("Not a valid URL: " + $(link).attr("href"));
}
}
}
crawlPage("http://localhost:1313/");
Conclusion
Hopefully, you found this tutorial useful, if you did then please feel free to let me know by tweeting me: @Elliot_F.