Mirror of https://github.com/kevin-DL/sapper.git (synced 2026-01-23 15:41:32 +00:00)
Removes all async/await from the extraction pipeline and adds unit tests that check the extracted client pages against expected regular expressions
@@ -98,7 +98,7 @@ function getChunkFiles() {
  * @param {number=} extractionDir The directory in which to place the extracted
  * output.
  */
-module.exports = async function(includeUrls = null, excludeUrls = null,
+module.exports = function(includeUrls = null, excludeUrls = null,
     extractionDir = OUTPUT_DIR) {
   // Set up the server.
 
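For context on what dropping the `async` keyword means for callers, a general JavaScript illustration (not code from this repository): an `async` function always wraps its result in a Promise, while a plain function returns its value directly.

    // General JavaScript behaviour, shown with throwaway names a() and b().
    async function a() { return 1; }  // a() evaluates to a Promise that resolves to 1
    function b() { return 1; }        // b() evaluates to 1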
@@ -127,97 +127,98 @@ module.exports = async function(includeUrls = null, excludeUrls = null,
   // scraper. The program automatically exits after all the static pages have
   // been scraped from the server that are accessible from the root page (`/`).
   const extractedFiles = []; // keep track of extracted files.
-  const server = await app.listen(PORT);
-  console.log(`listening on port ${PORT} and beginning extraction`);
-
-  return new Promise((resolve, reject) => {
-    const spider = new Spider({
-      concurrent: 5,
-      delay: 0,
-      logs: process.stderr,
-      allowDuplicates: false,
-      catchErrors: true,
-      addReferrer: false,
-      xhr: false,
-      keepAlive: false,
-      error: (err, url) => {
-        console.error(`ERROR ${err} at ${url}`);
-        reject();
-      },
-      // Called when there are no more requests
-      done: async () => {
-        await server.close();
-        console.log('Done!');
-        resolve();
-      },
-
-      headers: { 'user-agent': 'node-spider' },
-      // Use a binary encoding to preserve image files.
-      encoding: 'binary'
-    });
-
-    // The primary logic to handle a scraped page.
-    const handleRequest = (doc) => {
-      // Only deal with the page if it is on the server, i.e. it is not an
-      // external link.
-      if (!filter(doc.url)) return;
-      // Skip URL if it is in the exclude list.
-      if (excludeUrls.includes(getFullUrl(doc.url))) return;
-
-      // Grab the page's relative path and write the page contents to a local
-      // file.
-      const relPath = relativePath(doc.url);
-      extractedFiles.push(relPath);
-      console.log(`GOT ${relPath}`); // static page url
-      fs.outputFileSync(path.join(extractionDir, relPath), doc.res.body,
-          {encoding: 'binary'});
-
-      /**
-       * Resolves and checks if a given URL is local; if so, adds it to the
-       * scraping queue.
-       * @param {string} url The URL to process.
-       */
-      const process = (url) => {
-        // Remove trailing hash if relevant.
-        url = url.split('#')[0];
-        // Resolve URL relative to server root.
-        url = doc.resolve(url);
-        // Crawl more if the URL is on the server.
-        if (filter(url)) spider.queue(url, handleRequest);
-      };
-
-      const extension = getExtension(relPath);
-      if (extension == 'html') {
-        // Grab src and href attributes from html pages.
-        doc.$('[src]').each((i, elem) => {
-          process(doc.$(elem).attr('src'));
-        });
-        doc.$('[href]').each((i, elem) => {
-          process(doc.$(elem).attr('href'));
-        });
-      }
-
-      if (doc.url.endsWith('/service-worker.js')) {
-        // Grab additional routes.
-        const chunkFiles = getChunkFiles();
-        chunkFiles.forEach(
-            (url) => spider.queue(getFullUrl(url), handleRequest));
-      }
-
-      if (relPath.endsWith('/index.html') && !relPath.startsWith('/api/')) {
-        // Attempt to grab the /api/ version of a page that seems to be a
-        // basic route.
-        spider.queue(apiPath(doc.url), handleRequest);
-      }
-    };
-
-    // Start crawling with the document root and the service worker.
-    spider.queue(getFullUrl('/'), handleRequest);
-    spider.queue(getFullUrl('/service-worker.js'), handleRequest);
-
-    if (includeUrls !== null) {
-      includeUrls.forEach(
-          (url) => spider.queue(getFullUrl(url), handleRequest));
-    }
+  const server = app.listen(PORT, () => {
+    console.log(`listening on port ${PORT} and beginning extraction`);
+
+    return new Promise((resolve, reject) => {
+      const spider = new Spider({
+        concurrent: 5,
+        delay: 0,
+        logs: process.stderr,
+        allowDuplicates: false,
+        catchErrors: true,
+        addReferrer: false,
+        xhr: false,
+        keepAlive: false,
+        error: (err, url) => {
+          console.error(`ERROR ${err} at ${url}`);
+          reject();
+        },
+        // Called when there are no more requests
+        done: () => {
+          server.close(() => {
+            console.log('Done!');
+            resolve();
+          });
+        },
+
+        headers: { 'user-agent': 'node-spider' },
+        // Use a binary encoding to preserve image files.
+        encoding: 'binary'
+      });
+
+      // The primary logic to handle a scraped page.
+      const handleRequest = (doc) => {
+        // Only deal with the page if it is on the server, i.e. it is not an
+        // external link.
+        if (!filter(doc.url)) return;
+        // Skip URL if it is in the exclude list.
+        if (excludeUrls.includes(getFullUrl(doc.url))) return;
+
+        // Grab the page's relative path and write the page contents to a local
+        // file.
+        const relPath = relativePath(doc.url);
+        extractedFiles.push(relPath);
+        console.log(`GOT ${relPath}`); // static page url
+        fs.outputFileSync(path.join(extractionDir, relPath), doc.res.body,
+            {encoding: 'binary'});
+
+        /**
+         * Resolves and checks if a given URL is local; if so, adds it to the
+         * scraping queue.
+         * @param {string} url The URL to process.
+         */
+        const process = (url) => {
+          // Remove trailing hash if relevant.
+          url = url.split('#')[0];
+          // Resolve URL relative to server root.
+          url = doc.resolve(url);
+          // Crawl more if the URL is on the server.
+          if (filter(url)) spider.queue(url, handleRequest);
+        };
+
+        const extension = getExtension(relPath);
+        if (extension == 'html') {
+          // Grab src and href attributes from html pages.
+          doc.$('[src]').each((i, elem) => {
+            process(doc.$(elem).attr('src'));
+          });
+          doc.$('[href]').each((i, elem) => {
+            process(doc.$(elem).attr('href'));
+          });
+        }
+
+        if (doc.url.endsWith('/service-worker.js')) {
+          // Grab additional routes.
+          const chunkFiles = getChunkFiles();
+          chunkFiles.forEach(
+              (url) => spider.queue(getFullUrl(url), handleRequest));
+        }
+
+        if (relPath.endsWith('/index.html') && !relPath.startsWith('/api/')) {
+          // Attempt to grab the /api/ version of a page that seems to be a
+          // basic route.
+          spider.queue(apiPath(doc.url), handleRequest);
+        }
+      };
+
+      // Start crawling with the document root and the service worker.
+      spider.queue(getFullUrl('/'), handleRequest);
+      spider.queue(getFullUrl('/service-worker.js'), handleRequest);
+
+      if (includeUrls !== null) {
+        includeUrls.forEach(
+            (url) => spider.queue(getFullUrl(url), handleRequest));
+      }
+    });
   });
 }
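The substance of this hunk is the server start-up and the `done` handler: `await app.listen(PORT)` and `await server.close()` are replaced by Node's callback forms, and the crawl is set up inside the `app.listen` callback. A minimal, self-contained sketch of that callback pattern, using a hypothetical Express app and port rather than the ones in this repository:

    const express = require('express');  // stand-in app framework for the sketch

    const app = express();
    const PORT = 3000;  // hypothetical port, not the repository's PORT constant

    // Start the server; the callback runs once the port is bound.
    const server = app.listen(PORT, () => {
      console.log(`listening on port ${PORT}`);

      // ... do work against the running server, then shut down.
      // server.close() takes a completion callback instead of a value to await.
      server.close(() => {
        console.log('Done!');
      });
    });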
@@ -322,6 +322,9 @@ function run(env) {
       'about/index.html',
       'api/about/index.html',
 
+      'slow-preload/index.html',
+      'api/slow-preload/index.html',
+
       'blog/index.html',
       'api/blog/index.html',
 
@@ -351,11 +354,33 @@ function run(env) {
       'svelte-logo-192.png',
       'svelte-logo-512.png',
     ];
+    // Client scripts that should show up in the extraction directory.
+    const expectedClientRegexes = [
+      /client\/_\..*?\.js/,
+      /client\/about\..*?\.js/,
+      /client\/blog_\$slug\$\..*?\.js/,
+      /client\/blog\..*?\.js/,
+      /client\/main\..*?\.js/,
+      /client\/show_url\..*?\.js/,
+      /client\/slow_preload\..*?\.js/,
+    ];
     const allPages = walkSync(dest);
 
     expectedPages.forEach((expectedPage) => {
       assert.ok(allPages.includes(expectedPage));
     });
+    expectedClientRegexes.forEach((expectedRegex) => {
+      // Ensure each client page regular expression matches at least one
+      // generated page.
+      let matched = false;
+      for (const page of allPages) {
+        if (expectedRegex.test(page)) {
+          matched = true;
+          break;
+        }
+      }
+      assert.ok(matched);
+    });
   });
 });
 }
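The added test sets a `matched` flag in a manual loop; the same at-least-one-match check can be written with `Array.prototype.some`. A standalone sketch with made-up file names (the regexes mirror two of the ones added above, the page list is hypothetical, not a real extraction run):

    const assert = require('assert');

    // Hypothetical extraction output for the sketch.
    const allPages = [
      'client/main.1a2b3c.js',
      'client/about.4d5e6f.js',
      'index.html',
    ];

    const expectedClientRegexes = [
      /client\/main\..*?\.js/,
      /client\/about\..*?\.js/,
    ];

    // Each regex must match at least one generated page.
    expectedClientRegexes.forEach((expectedRegex) => {
      assert.ok(allPages.some((page) => expectedRegex.test(page)));
    });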