diff --git a/lib/utils/extract.js b/lib/utils/extract.js
index d77a39e..e41e252 100644
--- a/lib/utils/extract.js
+++ b/lib/utils/extract.js
@@ -98,7 +98,7 @@ function getChunkFiles() {
  * @param {number=} extractionDir The directory in which to place the extracted
  *     output.
  */
-module.exports = async function(includeUrls = null, excludeUrls = null,
+module.exports = function(includeUrls = null, excludeUrls = null,
     extractionDir = OUTPUT_DIR) {
 
   // Set up the server.
@@ -127,97 +127,98 @@ module.exports = async function(includeUrls = null, excludeUrls = null,
   // scraper. The program automatically exits after all the static pages have
   // been scraped from the server that are accessible from the root page (`/`).
   const extractedFiles = []; // keep track of extracted files.
-  const server = await app.listen(PORT);
-  console.log(`listening on port ${PORT} and beginning extraction`);
+  const server = app.listen(PORT, () => {
+    console.log(`listening on port ${PORT} and beginning extraction`);
+    return new Promise((resolve, reject) => {
+      const spider = new Spider({
+        concurrent: 5,
+        delay: 0,
+        logs: process.stderr,
+        allowDuplicates: false,
+        catchErrors: true,
+        addReferrer: false,
+        xhr: false,
+        keepAlive: false,
+        error: (err, url) => {
+          console.error(`ERROR ${err} at ${url}`);
+          reject();
+        },
+        // Called when there are no more requests
+        done: () => {
+          server.close(() => {
+            console.log('Done!');
+            resolve();
+          });
+        },
 
-  return new Promise((resolve, reject) => {
-    const spider = new Spider({
-      concurrent: 5,
-      delay: 0,
-      logs: process.stderr,
-      allowDuplicates: false,
-      catchErrors: true,
-      addReferrer: false,
-      xhr: false,
-      keepAlive: false,
-      error: (err, url) => {
-        console.error(`ERROR ${err} at ${url}`);
-        reject();
-      },
-      // Called when there are no more requests
-      done: async () => {
-        await server.close();
-        console.log('Done!');
-        resolve();
-      },
+        headers: { 'user-agent': 'node-spider' },
+        // Use a binary encoding to preserve image files.
+        encoding: 'binary'
+      });
 
-      headers: { 'user-agent': 'node-spider' },
-      // Use a binary encoding to preserve image files.
-      encoding: 'binary'
-    });
+      // The primary logic to handle a scraped page.
+      const handleRequest = (doc) => {
+        // Only deal with the page if it is on the server, i.e. it is not an
+        // external link.
+        if (!filter(doc.url)) return;
+        // Skip URL if it is in the exclude list.
+        if (excludeUrls.includes(getFullUrl(doc.url))) return;
 
-    // The primary logic to handle a scraped page.
-    const handleRequest = (doc) => {
-      // Only deal with the page if it is on the server, i.e. it is not an
-      // external link.
-      if (!filter(doc.url)) return;
-      // Skip URL if it is in the exclude list.
-      if (excludeUrls.includes(getFullUrl(doc.url))) return;
+        // Grab the page's relative path and write the page contents to a local
+        // file.
+        const relPath = relativePath(doc.url);
+        extractedFiles.push(relPath);
+        console.log(`GOT ${relPath}`); // static page url
+        fs.outputFileSync(path.join(extractionDir, relPath), doc.res.body,
+            {encoding: 'binary'});
 
-      // Grab the page's relative path and write the page contents to a local
-      // file.
-      const relPath = relativePath(doc.url);
-      extractedFiles.push(relPath);
-      console.log(`GOT ${relPath}`); // static page url
-      fs.outputFileSync(path.join(extractionDir, relPath), doc.res.body,
-          {encoding: 'binary'});
+        /**
+         * Resolves and checks if a given URL is local; if so, adds it to the
+         * scraping queue.
+         * @param {string} url The URL to process.
+         */
+        const process = (url) => {
+          // Remove trailing hash if relevant.
+          url = url.split('#')[0];
+          // Resolve URL relative to server root.
+          url = doc.resolve(url);
+          // Crawl more if the URL is on the server.
+          if (filter(url)) spider.queue(url, handleRequest);
+        };
 
-      /**
-       * Resolves and checks if a given URL is local; if so, adds it to the
-       * scraping queue.
-       * @param {string} url The URL to process.
-       */
-      const process = (url) => {
-        // Remove trailing hash if relevant.
-        url = url.split('#')[0];
-        // Resolve URL relative to server root.
-        url = doc.resolve(url);
-        // Crawl more if the URL is on the server.
-        if (filter(url)) spider.queue(url, handleRequest);
+        const extension = getExtension(relPath);
+        if (extension == 'html') {
+          // Grab src and href attributes from html pages.
+          doc.$('[src]').each((i, elem) => {
+            process(doc.$(elem).attr('src'));
+          });
+          doc.$('[href]').each((i, elem) => {
+            process(doc.$(elem).attr('href'));
+          });
+        }
+
+        if (doc.url.endsWith('/service-worker.js')) {
+          // Grab additional routes.
+          const chunkFiles = getChunkFiles();
+          chunkFiles.forEach(
+              (url) => spider.queue(getFullUrl(url), handleRequest));
+        }
+
+        if (relPath.endsWith('/index.html') && !relPath.startsWith('/api/')) {
+          // Attempt to grab the /api/ version of a page that seems to be a
+          // basic route.
+          spider.queue(apiPath(doc.url), handleRequest);
+        }
       };
 
-      const extension = getExtension(relPath);
-      if (extension == 'html') {
-        // Grab src and href attributes from html pages.
-        doc.$('[src]').each((i, elem) => {
-          process(doc.$(elem).attr('src'));
-        });
-        doc.$('[href]').each((i, elem) => {
-          process(doc.$(elem).attr('href'));
-        });
-      }
+      // Start crawling with the document root and the service worker.
+      spider.queue(getFullUrl('/'), handleRequest);
+      spider.queue(getFullUrl('/service-worker.js'), handleRequest);
 
-      if (doc.url.endsWith('/service-worker.js')) {
-        // Grab additional routes.
-        const chunkFiles = getChunkFiles();
-        chunkFiles.forEach(
+      if (includeUrls !== null) {
+        includeUrls.forEach(
             (url) => spider.queue(getFullUrl(url), handleRequest));
       }
-
-      if (relPath.endsWith('/index.html') && !relPath.startsWith('/api/')) {
-        // Attempt to grab the /api/ version of a page that seems to be a
-        // basic route.
-        spider.queue(apiPath(doc.url), handleRequest);
-      }
-    };
-
-    // Start crawling with the document root and the service worker.
-    spider.queue(getFullUrl('/'), handleRequest);
-    spider.queue(getFullUrl('/service-worker.js'), handleRequest);
-
-    if (includeUrls !== null) {
-      includeUrls.forEach(
-          (url) => spider.queue(getFullUrl(url), handleRequest));
-    }
+    });
   });
 }
diff --git a/test/common/test.js b/test/common/test.js
index c3e290d..1c6f332 100644
--- a/test/common/test.js
+++ b/test/common/test.js
@@ -322,6 +322,9 @@ function run(env) {
         'about/index.html',
         'api/about/index.html',
 
+        'slow-preload/index.html',
+        'api/slow-preload/index.html',
+
         'blog/index.html',
         'api/blog/index.html',
 
@@ -351,11 +354,33 @@ function run(env) {
         'svelte-logo-192.png',
         'svelte-logo-512.png',
       ];
+      // Client scripts that should show up in the extraction directory.
+      const expectedClientRegexes = [
+        /client\/_\..*?\.js/,
+        /client\/about\..*?\.js/,
+        /client\/blog_\$slug\$\..*?\.js/,
+        /client\/blog\..*?\.js/,
+        /client\/main\..*?\.js/,
+        /client\/show_url\..*?\.js/,
+        /client\/slow_preload\..*?\.js/,
+      ];
 
       const allPages = walkSync(dest);
       expectedPages.forEach((expectedPage) => {
         assert.ok(allPages.includes(expectedPage));
       });
+      expectedClientRegexes.forEach((expectedRegex) => {
+        // Ensure each client page regular expression matches at least one
+        // generated page.
+        let matched = false;
+        for (const page of allPages) {
+          if (expectedRegex.test(page)) {
+            matched = true;
+            break;
+          }
+        }
+        assert.ok(matched);
+      });
     });
   });
 }