const fs = require('fs-extra');
const app = require('express')();
const compression = require('compression');
const sapper = require('../index.js');
const serveStatic = require('serve-static');
const Spider = require('node-spider');
const path = require('path');

const { PORT = 3000, OUTPUT_DIR = 'dist' } = process.env;
const { dest: sapperDest } = require('../config.js');
const prefix = `http://localhost:${PORT}`;

/**
 * Returns the full URL of the specified path in the server.
 * @param {string} url The path for which to get the complete URL.
 * @return {string} The full URL.
 */
function getFullUrl(url) {
  if (url.startsWith(prefix)) return url;
  return `${prefix}${url}`;
}

/**
 * Returns the extension on the URL, or '' if there is none.
 * @param {string} url The URL.
 * @return {string} The URL's extension, or the empty string if the URL has no
 *     extension.
 */
function getExtension(url) {
  const splits = url.split('.');
  let extension = splits[splits.length - 1].trim();
  if (!/^[a-zA-Z0-9]+$/.test(extension) || extension.length > 10) {
    // Clear the extension if it is not alphanumeric or is long enough that it
    // may just be a hash value or similar.
    extension = '';
  }
  return extension;
}

/**
 * Returns the relative path for the specified URL, adding index.html if the
 * URL ends in `/` or has no extension. This makes the URL work well in a
 * static site.
 * @param {string} url The URL for which to retrieve the relative path.
 * @return {string} A URL, relative to the server root, that starts with /.
 *     index.html is appended if the URL ends with `/` or has no extension.
 */
function relativePath(url) {
  if (url.startsWith(prefix)) return relativePath(url.substr(prefix.length));
  if (url.endsWith('/')) url += 'index.html';
  if (getExtension(url) === '') url += '/index.html';
  if (url.startsWith('/')) return url;
  throw new Error(`Bad url: ${url}`);
}
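
// A few example mappings, to illustrate how crawled URLs become file paths in
// the static output (assuming the default PORT of 3000; the chunk file name
// below is made up):
//   relativePath('http://localhost:3000/blog/')  -> '/blog/index.html'
//   relativePath('/about')                       -> '/about/index.html'
//   relativePath('/client/main.abc123.js')       -> '/client/main.abc123.js'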

/**
 * Returns the Sapper API route for the specified URL path.
 * @param {string} url The absolute or relative URL.
 * @return {string} The URL with /api prepended to its path.
 */
function apiPath(url) {
  if (url.startsWith(prefix)) {
    return `${prefix}/api${url.substr(prefix.length)}`;
  }
  return `/api${url}`;
}

/**
 * Returns whether the specified URL is on the server (as opposed to an
 * external link).
 * @param {string} url The URL.
 * @return {boolean} True if the URL is on the server.
 */
function filter(url) {
  return url.startsWith('/') || url.startsWith(getFullUrl('/'));
}

/**
 * Retrieves the chunk files that are normally cached for offline use in the
 * service worker.
 * @return {!Array<string>} The chunk file paths, relative to the server root.
 */
function getChunkFiles() {
  const clientInfo =
      fs.readJsonSync(path.join(sapperDest, 'stats.client.json'));
  const chunkFiles = clientInfo.assets.map(chunk => `/client/${chunk.name}`);
  return chunkFiles;
}

/**
 * Exports the Sapper app as a static website by starting at the root and
 * crawling pages that are linked, their /api/ pages, and webpack routes, as
 * well as copying assets.
 * @param {?Array<string>=} includeUrls If non-null, additional URLs to scrape
 *     in the extraction. This should only be set if there are routes that
 *     cannot be reached from the root.
 * @param {?Array<string>=} excludeUrls If non-null, URLs to avoid scraping in
 *     the extraction.
 * @param {string=} extractionDir The directory in which to place the extracted
 *     output.
 */
module.exports = async function(includeUrls = null, excludeUrls = null,
    extractionDir = OUTPUT_DIR) {
  // Set up the server. Overriding global.fetch allows us to do e.g.
  // `fetch('/api/blog')` on the server.
  const fetch = require('node-fetch');
  global.fetch = (url, opts) => {
    if (url[0] === '/') url = `http://localhost:${PORT}${url}`;
    return fetch(url, opts);
  };

  app.use(compression({ threshold: 0 }));
  app.use(serveStatic('assets'));
  app.use(sapper());

  // Clean the output directory and copy assets in.
  fs.removeSync(extractionDir);
  fs.copySync('assets', extractionDir);

  // If exclude URLs are set, normalize them to full URLs.
  if (excludeUrls == null) excludeUrls = [];
  excludeUrls = excludeUrls.map((url) => getFullUrl(url));

  // The crux of the extraction: chain the traditional server with a web
  // scraper. The program automatically exits after all the static pages
  // reachable from the root page (`/`) have been scraped from the server.
  const extractedFiles = [];  // Keep track of extracted files.
  const server = await app.listen(PORT);
  console.log(`listening on port ${PORT} and beginning extraction`);

  return new Promise((resolve, reject) => {
    const spider = new Spider({
      concurrent: 5,
      delay: 0,
      logs: process.stderr,
      allowDuplicates: false,
      catchErrors: true,
      addReferrer: false,
      xhr: false,
      keepAlive: false,
      error: (err, url) => {
        console.error(`ERROR ${err} at ${url}`);
        reject(err);
      },
      // Called when there are no more requests to process.
      done: async () => {
        await server.close();
        console.log('Done!');
        resolve();
      },
      headers: { 'user-agent': 'node-spider' },
      // Use a binary encoding to preserve image files.
      encoding: 'binary'
    });

    // The primary logic to handle a scraped page.
    const handleRequest = (doc) => {
      // Only deal with the page if it is on the server, i.e. it is not an
      // external link.
      if (!filter(doc.url)) return;

      // Skip the URL if it is in the exclude list.
      if (excludeUrls.includes(getFullUrl(doc.url))) return;

      // Grab the page's relative path and write the page contents to a local
      // file.
      const relPath = relativePath(doc.url);
      extractedFiles.push(relPath);
      console.log(`GOT ${relPath}`);  // Static page URL.
      fs.outputFileSync(path.join(extractionDir, relPath), doc.res.body,
          { encoding: 'binary' });

      /**
       * Resolves a URL and checks whether it is local; if so, adds it to the
       * scraping queue.
       * @param {string} url The URL to process.
       */
      const processUrl = (url) => {
        // Remove the trailing hash if relevant.
        url = url.split('#')[0];
        // Resolve the URL relative to the server root.
        url = doc.resolve(url);
        // Crawl further if the URL is on the server.
        if (filter(url)) spider.queue(url, handleRequest);
      };

      const extension = getExtension(relPath);
      if (extension === 'html') {
        // Grab src and href attributes from HTML pages.
        doc.$('[src]').each((i, elem) => {
          processUrl(doc.$(elem).attr('src'));
        });
        doc.$('[href]').each((i, elem) => {
          processUrl(doc.$(elem).attr('href'));
        });
      }

      if (doc.url.endsWith('/service-worker.js')) {
        // Grab additional routes from the service worker's cached chunks.
        const chunkFiles = getChunkFiles();
        chunkFiles.forEach(
            (url) => spider.queue(getFullUrl(url), handleRequest));
      }

      if (relPath.endsWith('/index.html') && !relPath.startsWith('/api/')) {
        // Attempt to grab the /api/ version of a page that seems to be a
        // basic route.
        spider.queue(apiPath(doc.url), handleRequest);
      }
    };

    // Start crawling with the document root and the service worker.
    spider.queue(getFullUrl('/'), handleRequest);
    spider.queue(getFullUrl('/service-worker.js'), handleRequest);
    if (includeUrls !== null) {
      includeUrls.forEach(
          (url) => spider.queue(getFullUrl(url), handleRequest));
    }
  });
};
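
// Example usage (a sketch only; the module path and route names below are
// hypothetical and depend on how this file is wired into the project):
//
//   const exportStatic = require('./export.js');
//   exportStatic(['/unlinked-page'], ['/admin'], 'dist')
//       .then(() => console.log('static export complete'))
//       .catch((err) => {
//         console.error('export failed:', err);
//         process.exitCode = 1;
//       });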