Removes all async/await from the extraction pipeline, and adds unit tests for extracted client pages that match a regular expression

freedmand
2018-01-05 14:56:58 -08:00
parent fc8280adea
commit 7588911108
2 changed files with 109 additions and 83 deletions
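With `async` removed, the exported extraction function no longer returns a promise that resolves when extraction finishes; as the diff below shows, the promise created inside the `listen` callback is never returned to the caller. A minimal caller-side sketch of what that means, where the require path and argument values are illustrative assumptions rather than code from the repository:

// Hypothetical caller; the require path and arguments are assumptions.
const extract = require('./extract');

// Before this commit the export was async and could be awaited:
//   await extract(null, [], 'dist-export');
// After this commit it is a plain call that starts the crawl; completion is
// reported by the spider's `done` handler closing the server and logging
// 'Done!', not by a resolved promise.
extract(null, [], 'dist-export');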

View File

@@ -98,7 +98,7 @@ function getChunkFiles() {
  * @param {number=} extractionDir The directory in which to place the extracted
  *     output.
  */
-module.exports = async function(includeUrls = null, excludeUrls = null,
+module.exports = function(includeUrls = null, excludeUrls = null,
     extractionDir = OUTPUT_DIR) {
   // Set up the server.
@@ -127,97 +127,98 @@ module.exports = async function(includeUrls = null, excludeUrls = null,
   // scraper. The program automatically exits after all the static pages have
   // been scraped from the server that are accessible from the root page (`/`).
   const extractedFiles = []; // keep track of extracted files.
-  const server = await app.listen(PORT);
-  console.log(`listening on port ${PORT} and beginning extraction`);
-  return new Promise((resolve, reject) => {
-    const spider = new Spider({
-      concurrent: 5,
-      delay: 0,
-      logs: process.stderr,
-      allowDuplicates: false,
-      catchErrors: true,
-      addReferrer: false,
-      xhr: false,
-      keepAlive: false,
-      error: (err, url) => {
-        console.error(`ERROR ${err} at ${url}`);
-        reject();
-      },
-      // Called when there are no more requests
-      done: async () => {
-        await server.close();
-        console.log('Done!');
-        resolve();
-      },
-      headers: { 'user-agent': 'node-spider' },
-      // Use a binary encoding to preserve image files.
-      encoding: 'binary'
-    });
-    // The primary logic to handle a scraped page.
-    const handleRequest = (doc) => {
-      // Only deal with the page if it is on the server, i.e. it is not an
-      // external link.
-      if (!filter(doc.url)) return;
-      // Skip URL if it is in the exclude list.
-      if (excludeUrls.includes(getFullUrl(doc.url))) return;
-      // Grab the page's relative path and write the page contents to a local
-      // file.
-      const relPath = relativePath(doc.url);
-      extractedFiles.push(relPath);
-      console.log(`GOT ${relPath}`); // static page url
-      fs.outputFileSync(path.join(extractionDir, relPath), doc.res.body,
-          {encoding: 'binary'});
-      /**
-       * Resolves and checks if a given URL is local; if so, adds it to the
-       * scraping queue.
-       * @param {string} url The URL to process.
-       */
-      const process = (url) => {
-        // Remove trailing hash if relevant.
-        url = url.split('#')[0];
-        // Resolve URL relative to server root.
-        url = doc.resolve(url);
-        // Crawl more if the URL is on the server.
-        if (filter(url)) spider.queue(url, handleRequest);
-      };
-      const extension = getExtension(relPath);
-      if (extension == 'html') {
-        // Grab src and href attributes from html pages.
-        doc.$('[src]').each((i, elem) => {
-          process(doc.$(elem).attr('src'));
-        });
-        doc.$('[href]').each((i, elem) => {
-          process(doc.$(elem).attr('href'));
-        });
-      }
-      if (doc.url.endsWith('/service-worker.js')) {
-        // Grab additional routes.
-        const chunkFiles = getChunkFiles();
-        chunkFiles.forEach(
-            (url) => spider.queue(getFullUrl(url), handleRequest));
-      }
-      if (relPath.endsWith('/index.html') && !relPath.startsWith('/api/')) {
-        // Attempt to grab the /api/ version of a page that seems to be a
-        // basic route.
-        spider.queue(apiPath(doc.url), handleRequest);
-      }
-    };
-    // Start crawling with the document root and the service worker.
-    spider.queue(getFullUrl('/'), handleRequest);
-    spider.queue(getFullUrl('/service-worker.js'), handleRequest);
-    if (includeUrls !== null) {
-      includeUrls.forEach(
-          (url) => spider.queue(getFullUrl(url), handleRequest));
-    }
-  });
+  const server = app.listen(PORT, () => {
+    console.log(`listening on port ${PORT} and beginning extraction`);
+    return new Promise((resolve, reject) => {
+      const spider = new Spider({
+        concurrent: 5,
+        delay: 0,
+        logs: process.stderr,
+        allowDuplicates: false,
+        catchErrors: true,
+        addReferrer: false,
+        xhr: false,
+        keepAlive: false,
+        error: (err, url) => {
+          console.error(`ERROR ${err} at ${url}`);
+          reject();
+        },
+        // Called when there are no more requests
+        done: () => {
+          server.close(() => {
+            console.log('Done!');
+            resolve();
+          });
+        },
+        headers: { 'user-agent': 'node-spider' },
+        // Use a binary encoding to preserve image files.
+        encoding: 'binary'
+      });
+      // The primary logic to handle a scraped page.
+      const handleRequest = (doc) => {
+        // Only deal with the page if it is on the server, i.e. it is not an
+        // external link.
+        if (!filter(doc.url)) return;
+        // Skip URL if it is in the exclude list.
+        if (excludeUrls.includes(getFullUrl(doc.url))) return;
+        // Grab the page's relative path and write the page contents to a local
+        // file.
+        const relPath = relativePath(doc.url);
+        extractedFiles.push(relPath);
+        console.log(`GOT ${relPath}`); // static page url
+        fs.outputFileSync(path.join(extractionDir, relPath), doc.res.body,
+            {encoding: 'binary'});
+        /**
+         * Resolves and checks if a given URL is local; if so, adds it to the
+         * scraping queue.
+         * @param {string} url The URL to process.
+         */
+        const process = (url) => {
+          // Remove trailing hash if relevant.
+          url = url.split('#')[0];
+          // Resolve URL relative to server root.
+          url = doc.resolve(url);
+          // Crawl more if the URL is on the server.
+          if (filter(url)) spider.queue(url, handleRequest);
+        };
+        const extension = getExtension(relPath);
+        if (extension == 'html') {
+          // Grab src and href attributes from html pages.
+          doc.$('[src]').each((i, elem) => {
+            process(doc.$(elem).attr('src'));
+          });
+          doc.$('[href]').each((i, elem) => {
+            process(doc.$(elem).attr('href'));
+          });
+        }
+        if (doc.url.endsWith('/service-worker.js')) {
+          // Grab additional routes.
+          const chunkFiles = getChunkFiles();
+          chunkFiles.forEach(
+              (url) => spider.queue(getFullUrl(url), handleRequest));
+        }
+        if (relPath.endsWith('/index.html') && !relPath.startsWith('/api/')) {
+          // Attempt to grab the /api/ version of a page that seems to be a
+          // basic route.
+          spider.queue(apiPath(doc.url), handleRequest);
+        }
+      };
+      // Start crawling with the document root and the service worker.
+      spider.queue(getFullUrl('/'), handleRequest);
+      spider.queue(getFullUrl('/service-worker.js'), handleRequest);
+      if (includeUrls !== null) {
+        includeUrls.forEach(
+            (url) => spider.queue(getFullUrl(url), handleRequest));
+      }
+    });
+  });
 }
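The reworked `done` handler also matches how Node's `http.Server#close` actually signals completion: it takes a callback and, in Node versions current when this commit landed, does not return a promise, so `await server.close()` never really waited for shutdown. A minimal sketch of the pattern, assuming `app` is an Express-style application whose `listen` returns a Node `http.Server`:

// Sketch only; `app` is assumed to be an Express-style application.
const server = app.listen(PORT, () => {
  // The server is accepting connections once this callback runs.
  console.log(`listening on ${PORT}`);
});

// Later, when crawling has finished:
server.close(() => {
  // Runs once the server has stopped accepting connections; this callback,
  // not the return value of close(), is the reliable completion signal.
  console.log('server closed');
});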

View File

@@ -322,6 +322,9 @@ function run(env) {
        'about/index.html',
        'api/about/index.html',
        'slow-preload/index.html',
        'api/slow-preload/index.html',
        'blog/index.html',
        'api/blog/index.html',
@@ -351,11 +354,33 @@ function run(env) {
         'svelte-logo-192.png',
         'svelte-logo-512.png',
       ];
+      // Client scripts that should show up in the extraction directory.
+      const expectedClientRegexes = [
+        /client\/_\..*?\.js/,
+        /client\/about\..*?\.js/,
+        /client\/blog_\$slug\$\..*?\.js/,
+        /client\/blog\..*?\.js/,
+        /client\/main\..*?\.js/,
+        /client\/show_url\..*?\.js/,
+        /client\/slow_preload\..*?\.js/,
+      ];
       const allPages = walkSync(dest);
       expectedPages.forEach((expectedPage) => {
         assert.ok(allPages.includes(expectedPage));
       });
+      expectedClientRegexes.forEach((expectedRegex) => {
+        // Ensure each client page regular expression matches at least one
+        // generated page.
+        let matched = false;
+        for (const page of allPages) {
+          if (expectedRegex.test(page)) {
+            matched = true;
+            break;
+          }
+        }
+        assert.ok(matched);
+      });
     });
   });
 }
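The new assertions walk `allPages` by hand to check that each expected client-script pattern matches at least one extracted file. The same check can be written more compactly with `Array.prototype.some`; a minimal sketch using the variable names from the test above, offered as an alternative rather than the repository's code:

// Equivalent, more compact form of the regex assertions above.
expectedClientRegexes.forEach((expectedRegex) => {
  // `some` returns true as soon as one extracted page matches the pattern.
  assert.ok(allPages.some((page) => expectedRegex.test(page)),
      `no extracted page matches ${expectedRegex}`);
});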