diff --git a/lib/utils/extract.js b/lib/utils/extract.js
index eaa7e93..5d931c0 100644
--- a/lib/utils/extract.js
+++ b/lib/utils/extract.js
@@ -1,236 +1,71 @@
-const fs = require('fs-extra');
+const sander = require('sander');
 const app = require('express')();
-const compression = require('compression');
-const mkdirp = require('mkdirp');
+const cheerio = require('cheerio');
+const fetch = require('node-fetch');
 const sapper = require('../index.js');
-const serve = require('serve-static');
-const Spider = require('node-spider');
-const path = require('path');
 const { PORT = 3000, OUTPUT_DIR = 'dist' } = process.env;
 const { dest } = require('../config.js');
 const prefix = `http://localhost:${PORT}`;
 
-/**
- * Returns the full URL of the specified path in the server.
- * @param {string} url The path for which to get the complete URL.
- * @return {string} The full URL.
- */
-function getFullUrl(url) {
-  if (url.startsWith(prefix)) return url;
-  return `${prefix}${url}`;
-}
+module.exports = function() {
+	// Prep output directory
+	sander.rimrafSync(OUTPUT_DIR);
 
-/**
- * Returns the extension on the URL or '' if there is none.
- * @param {string} url The URL.
- * @return {string} The URL's extension or the empty string if the URL has no
- *     extension.
- */
-function getExtension(url) {
-  const splits = url.split('.');
-  let extension = splits[splits.length - 1].trim();
-  if (!/^[a-zA-Z0-9]+$/.test(extension) || extension.length > 10) {
-    // Clear the extension if it is not alphanumeric or is long enough to
-    // signify it may just be a hash value or something.
-    extension = '';
-  }
-  return extension;
-}
+	sander.copydirSync('assets').to(OUTPUT_DIR);
+	sander.copydirSync(`${dest}/client`).to(`${OUTPUT_DIR}/client`);
+	sander.copyFileSync(`${dest}/service-worker.js`).to(`${OUTPUT_DIR}/service-worker.js`);
 
-/**
- * Returns the relative path for the specified URL, adding index.html if the URL
- * ends in `/`. This makes the URL function well in a static site.
- * @param {string} url The URL for which to retrieve the relative path.
- * @return {string} A URL that starts with / that is relative to the server
- *     root. The URL will add index.html if it ends with `/`.
- */
-function relativePath(url) {
-  if (url.startsWith(prefix)) return relativePath(url.substr(prefix.length));
-  if (url.endsWith('/')) url += 'index.html';
-  if (getExtension(url) == '') url += '/index.html';
-  if (url.startsWith('/')) return url;
-  throw new Error('Bad url');
-}
+	// Intercept server route fetches
+	global.fetch = (url, opts) => {
+		if (url[0] === '/') {
+			const dest = OUTPUT_DIR + url;
+			url = `http://localhost:${PORT}${url}`;
 
-/**
- * Returns the Sapper API route for the specified URL path.
- * @param {string} url The absolute or relative URL.
- * @param {string=} apiPrefix The prefix for Sapper server-side routes.
- * @return {string} The URL with /api/ in front.
- */
-function apiPath(url, apiPrefix = '/api') {
-  if (url.startsWith(prefix)) {
-    return `${prefix}${apiPrefix}${url.substr(prefix.length)}`;
-  }
-  return `${apiPrefix}${url}`;
-}
+			return fetch(url, opts)
+				.then(r => {
+					r.clone().text().then(body => {
+						sander.writeFileSync(dest, body);
+					});
+					return r;
+				});
+		}
 
-/**
- * Returns whether the specified URL is on the server or an external link.
- * @param {string} url The URL.
- * @return {boolean} True if the URL is on the server.
- */
-function filter(url) {
-  return url.startsWith('/') || url.startsWith(getFullUrl('/'));
-}
+		return fetch(url, opts);
+	};
 
-/**
- * Retrieves chunk files that are normally cached for offline use in the service
- * worker.
- * @return {!Array}
- */
-function getChunkFiles() {
-  const clientInfo =
-      fs.readJsonSync(path.join(dest, 'stats.client.json'));
-  const chunkFiles = clientInfo.assets.map(chunk => `/client/${chunk.name}`);
-  return chunkFiles;
-}
+	app.use(sapper());
+	const server = app.listen(PORT);
 
-/**
- * Exports the Sapper app as a static website by starting at the root and
- * crawling pages that are linked, extracting server and client routes, and
- * copying assets.
- * @param {?Array=} includeUrls If non-null, a set of additional URLs to
- *     scrape in the extraction. This should only be set if there are routes
- *     that cannot be reached from the root.
- * @param {?Array=} excludeUrls If non-null, a set of URLs to avoid
- *     scraping in the extraction.
- * @param {string=} apiPrefix The path in which all server-side Sapper routes
- *     are defined. The Sapper template application uses '/api' -- if you
- *     diverge from the template app structure, you will want to change this. If
- *     your server-side Sapper routes span multiple directories, you will have
- *     to specify each file manually with the `includeUrls` param.
- * @param {number=} extractionDir The directory in which to place the extracted
- *     output.
- */
-module.exports = function(includeUrls = null, excludeUrls = null,
-    apiPrefix = '/api', extractionDir = OUTPUT_DIR) {
+	const seen = new Set();
 
-  // Clean the output directory and copy assets in.
-  fs.removeSync(extractionDir);
-  mkdirp.sync(extractionDir);
-  fs.copySync('assets', extractionDir);
+	function handle(pathname) {
+		if (pathname[0] !== '/') return;
 
-  // Set up the server.
+		if (seen.has(pathname)) return;
+		seen.add(pathname);
 
-  // this allows us to do e.g. `fetch('/api/blog')` on the server
-  const fetch = require('node-fetch');
-  global.fetch = (url, opts) => {
-    if (url[0] === '/') url = `http://localhost:${PORT}${url}`;
-    return fetch(url, opts);
-  };
+		return fetch(`${prefix}${pathname}`)
+			.then(r => r.text())
+			.then(body => {
+				const dest = OUTPUT_DIR + pathname + '/index.html';
+				sander.writeFileSync(dest, body);
 
-  app.use(compression({ threshold: 0 }));
+				const $ = cheerio.load(body);
+				const hrefs = [];
 
-  app.use(serve('assets'));
+				$('a[href]').each((i, $a) => {
+					hrefs.push($a.attribs.href);
+				});
 
-  app.use(sapper());
+				return hrefs.reduce((promise, href) => promise.then(() => handle(href)), Promise.resolve());
+			})
+			.catch(err => {
+				console.error(`Error rendering ${pathname}: ${err.message}`);
+			});
+	}
 
-  // If exclude URLs are set, normalize them.
-  if (excludeUrls == null) excludeUrls = [];
-  excludeUrls = excludeUrls.map((url) => getFullUrl(url));
-
-  // The crux of the extraction, chaining the traditional server call with a web
-  // scraper. The program automatically exits after all the static pages have
-  // been scraped from the server that are accessible from the root page (`/`).
-  const extractedFiles = []; // keep track of extracted files.
-
-  return new Promise((resolve, reject) => {
-    const server = app.listen(PORT, () => {
-      console.log(`listening on port ${PORT} and beginning extraction`);
-
-      const spider = new Spider({
-        concurrent: 5,
-        delay: 0,
-        logs: process.stderr,
-        allowDuplicates: false,
-        catchErrors: true,
-        addReferrer: false,
-        xhr: false,
-        keepAlive: false,
-        error: (err, url) => {
-          console.error(`ERROR ${err} at ${url}`);
-          reject();
-        },
-        // Called when there are no more requests
-        done: () => {
-          server.close(() => {
-            console.log('Done!');
-            resolve();
-          });
-        },
-
-        headers: { 'user-agent': 'node-spider' },
-        // Use a binary encoding to preserve image files.
-        encoding: 'binary'
-      });
-
-      // The primary logic to handle a scraped page.
-      const handleRequest = (doc) => {
-        // Only deal with the page if it is on the server, i.e. it is not an
-        // external link.
-        if (!filter(doc.url)) return;
-        // Skip URL if it is in the exclude list.
-        if (excludeUrls.includes(getFullUrl(doc.url))) return;
-
-        // Grab the page's relative path and write the page contents to a local
-        // file.
-        const relPath = relativePath(doc.url);
-        extractedFiles.push(relPath);
-        console.log(`GOT ${relPath}`); // static page url
-        fs.outputFileSync(path.join(extractionDir, relPath), doc.res.body,
-            {encoding: 'binary'});
-
-        /**
-         * Resolves and checks if a given URL is local; if so, adds it to the
-         * scraping queue.
-         * @param {string} url The URL to process.
-         */
-        const process = (url) => {
-          // Remove trailing hash if relevant.
-          url = url.split('#')[0];
-          // Resolve URL relative to server root.
-          url = doc.resolve(url);
-          // Crawl more if the URL is on the server.
-          if (filter(url)) spider.queue(url, handleRequest);
-        };
-
-        const extension = getExtension(relPath);
-        if (extension == 'html') {
-          // Grab src and href attributes from html pages.
-          doc.$('[src]').each((i, elem) => {
-            process(doc.$(elem).attr('src'));
-          });
-          doc.$('[href]').each((i, elem) => {
-            process(doc.$(elem).attr('href'));
-          });
-        }
-
-        if (doc.url.endsWith('/service-worker.js')) {
-          // Grab additional routes.
-          const chunkFiles = getChunkFiles();
-          chunkFiles.forEach(
-              (url) => spider.queue(getFullUrl(url), handleRequest));
-        }
-
-        if (relPath.endsWith('/index.html') &&
-            !relPath.startsWith(`${apiPrefix}/`)) {
-          // Attempt to grab the server-side route corresponding to a page that
-          // seems to be a basic route.
-          spider.queue(apiPath(doc.url, apiPrefix), handleRequest);
-        }
-      };
-
-      // Start crawling with the document root and the service worker.
-      spider.queue(getFullUrl('/'), handleRequest);
-      spider.queue(getFullUrl('/service-worker.js'), handleRequest);
-
-      if (includeUrls !== null) {
-        includeUrls.forEach(
-            (url) => spider.queue(getFullUrl(url), handleRequest));
-      }
-    });
-  });
+	return handle('/') // TODO all static routes
+		.then(() => server.close());
 };
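For context, the rewritten module keeps a promise-based contract: calling the exported function starts a throwaway Express server, crawls from `/`, and resolves after `server.close()` has been called. A minimal driver sketch follows, assuming only what the diff shows; the script name and log messages are illustrative, and note that `PORT` and `OUTPUT_DIR` are destructured from `process.env` when extract.js is first required, so they must be set before the require:

    // export-site.js (hypothetical driver, run from the project root)
    // PORT and OUTPUT_DIR are read at require time, so configure the
    // environment before requiring the module.
    process.env.PORT = process.env.PORT || '3000';
    process.env.OUTPUT_DIR = process.env.OUTPUT_DIR || 'dist';

    const extract = require('./lib/utils/extract.js');

    extract()
      .then(() => console.log(`static site written to ${process.env.OUTPUT_DIR}`))
      .catch(err => {
        console.error(`export failed: ${err.message}`);
        process.exit(1);
      });

One design note worth flagging in review: `handle` chains discovered hrefs through `reduce(..., Promise.resolve())`, so pages are fetched strictly one at a time, trading the old spider's `concurrent: 5` throughput for deterministic ordering and simple deduplication via the `seen` set. The `// TODO all static routes` comment also means that, unlike the removed `includeUrls` parameter, pages not reachable from `/` are not yet exported.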