
Spidering the web with CasperJS

[Screenshot: The CasperJS web spider in action]

For a project I've been working on, I needed a simple spider which would, given a start URL, recursively collect all the URLs it could find.

In the past I've used the excellent PhantomJS headless WebKit browser for automation, but writing complex navigation scenarios can be a bit long-winded. Enter CasperJS. Built on top of PhantomJS, it simplifies the process and provides some nice syntactic sugar to boot.

The spider I wrote grabs the first page, finds all of the links, and pushes each new URL onto a pending list; it then shifts the next URL off the front of that list, so links are followed in the order in which they were found. Doing this recursively is key: casper.open() doesn't block, so the next URL has to be queued from inside the callback of the previous request, which is why spider() calls itself.

The following code is the core spider, which should be easy to adapt for most purposes:

spider.js

// Set the start URL
var startUrl = 'http://spider.test/';

// URL variables
var visitedUrls = [], pendingUrls = [];

// Create instances
var casper = require('casper').create({ /*verbose: true, logLevel: 'debug'*/ });
var utils = require('utils');
var helpers = require('./helpers');

// Spider from the given URL
function spider(url) {

	// Add the URL to the visited stack
	visitedUrls.push(url);

	// Open the URL
	casper.open(url).then(function() {

		// Set the status style based on server status code
		var status = this.status().currentHTTPStatus;
		var statusStyle;
		switch(status) {
			case 200: statusStyle = { fg: 'green', bold: true }; break;
			case 404: statusStyle = { fg: 'red', bold: true }; break;
			 default: statusStyle = { fg: 'magenta', bold: true }; break;
		}

		// Display the spidered URL and status
		this.echo(this.colorizer.format(status, statusStyle) + ' ' + url);

		// Find links present on this page
		var links = this.evaluate(function() {
			var links = [];
			Array.prototype.forEach.call(__utils__.findAll('a'), function(e) {
				links.push(e.getAttribute('href'));
			});
			return links;
		});

		// Add newly found URLs to the stack
		var baseUrl = this.getGlobal('location').origin;
		Array.prototype.forEach.call(links, function(link) {
			var newUrl = helpers.absoluteUri(baseUrl, link);
			if (pendingUrls.indexOf(newUrl) == -1 && visitedUrls.indexOf(newUrl) == -1) {
				//casper.echo(casper.colorizer.format('-> Pushed ' + newUrl + ' onto the stack', { fg: 'magenta' }));
				pendingUrls.push(newUrl);
			}
		});

		// If there are URLs to be processed
		if (pendingUrls.length > 0) {
			var nextUrl = pendingUrls.shift();
			//this.echo(this.colorizer.format('<- Popped ' + nextUrl + ' from the stack', { fg: 'blue' }));
			spider(nextUrl);
		}

	});

}

// Start spidering
casper.start(startUrl, function() {
	spider(startUrl);
});

// Start the run
casper.run();
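
To try it out, save both files in the same directory and run the script with casperjs spider.js (CasperJS sits on top of PhantomJS, so you'll need both installed).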

To make sure that relative URLs are resolved and duplicates are avoided, I created the following helper based on work by Yaffle. Place it in helpers.js and it'll be included by the above script.

helpers.js

// Turn a (possibly) relative URI into a full RFC 3986-compliant URI
// With minor modifications, courtesy: https://gist.github.com/Yaffle/1088850
function absoluteUri(base, href) {

	// Parse a URI and return its constituent parts
	function parseUri(url) {
		var match = String(url).replace(/^\s+|\s+$/g, '').match(/^([^:\/?#]+:)?(\/\/(?:[^:@]*(?::[^:@]*)?@)?(([^:\/?#]*)(?::(\d*))?))?([^?#]*)(\?[^#]*)?(#[\s\S]*)?/);
		return (match ? { href: match[0] || '', protocol: match[1] || '', authority: match[2] || '', host: match[3] || '', hostname: match[4] || '',
		                  port: match[5] || '', pathname: match[6] || '', search: match[7] || '', hash: match[8] || '' } : null);
	}

	// Resolve dots in the path
	function resolvePathDots(input) {
		var output = [];
		input.replace(/^(\.\.?(\/|$))+/, '')
		     .replace(/\/(\.(\/|$))+/g, '/')
		     .replace(/\/\.\.$/, '/../')
		     .replace(/\/?[^\/]*/g, function (part) { part === '/..' ? output.pop() : output.push(part); });
		return output.join('').replace(/^\//, input.charAt(0) === '/' ? '/' : '');
	}

	// Parse base and href 
	href = parseUri(href || '');
	base = parseUri(base || '');

	// Build and return the URI 
	return !href || !base ? null : (href.protocol || base.protocol) +
	       (href.protocol || href.authority ? href.authority : base.authority) +
	       (resolvePathDots(href.protocol || href.authority || href.pathname.charAt(0) === '/' ? href.pathname : (href.pathname ? ((base.authority && !base.pathname ? '/' : '') + base.pathname.slice(0, base.pathname.lastIndexOf('/') + 1) + href.pathname) : base.pathname))) +
	       (href.protocol || href.authority || href.pathname ? href.search : (href.search || base.search)) + href.hash;

}
exports.absoluteUri = absoluteUri;
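
As a quick illustration (the URLs below are made-up examples, not part of the spider), the helper resolves relative links against the page they were found on and passes absolute links through untouched:

// Example usage of the helper
var helpers = require('./helpers');

// A parent-relative link found on http://spider.test/docs/
helpers.absoluteUri('http://spider.test/docs/', '../about.html');
// -> 'http://spider.test/about.html'

// An already-absolute link is returned as-is
helpers.absoluteUri('http://spider.test/', 'http://example.org/page');
// -> 'http://example.org/page'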

That's it for the basic spider. Before using the code, it would be a good idea to add a domain filter so the spider doesn't start spidering the internet ;-) If you want to capture things like page resources, 301 and 302 redirects (like I did), you can add a resource.received listener and get the response code from resource.status.
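
Neither of those is wired into the code above, but a rough sketch might look like this (isInternal is just an illustrative name, and the same-prefix check is only one simple way to express a domain filter):

// Only queue URLs that live under the start URL
function isInternal(url) {
	return url.indexOf(startUrl) === 0;
}
// ...then in spider(), only push newUrl onto pendingUrls if isInternal(newUrl) is true

// Log every resource response, including 301/302 redirects, as it arrives
casper.on('resource.received', function(resource) {
	if (resource.stage === 'end') {
		this.echo(resource.status + ' ' + resource.url);
	}
});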

Be responsible, and have fun!