Removes all async/await from the extraction pipeline, and adds unit tests for extracted client pages that match a regular expression

freedmand
2018-01-05 14:56:58 -08:00
parent fc8280adea
commit 7588911108
2 changed files with 109 additions and 83 deletions
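With `async` removed, the exported extraction function no longer returns a promise that resolves when extraction finishes; as the diff below shows, the promise created inside the `listen` callback is never returned to the caller. A minimal caller-side sketch of what that means, where the require path and argument values are illustrative assumptions rather than code from the repository:

// Hypothetical caller; the require path and arguments are assumptions.
const extract = require('./extract');

// Before this commit the export was async and could be awaited:
//   await extract(null, [], 'dist-export');
// After this commit it is a plain call that starts the crawl; completion is
// reported by the spider's `done` handler closing the server and logging
// 'Done!', not by a resolved promise.
extract(null, [], 'dist-export');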

View File

@@ -98,7 +98,7 @@ function getChunkFiles() {
  * @param {number=} extractionDir The directory in which to place the extracted
  *     output.
  */
-module.exports = async function(includeUrls = null, excludeUrls = null,
+module.exports = function(includeUrls = null, excludeUrls = null,
     extractionDir = OUTPUT_DIR) {
   // Set up the server.
@@ -127,97 +127,98 @@ module.exports = async function(includeUrls = null, excludeUrls = null,
   // scraper. The program automatically exits after all the static pages have
   // been scraped from the server that are accessible from the root page (`/`).
   const extractedFiles = []; // keep track of extracted files.
-  const server = await app.listen(PORT);
-  console.log(`listening on port ${PORT} and beginning extraction`);
-  return new Promise((resolve, reject) => {
-    const spider = new Spider({
-      concurrent: 5,
-      delay: 0,
-      logs: process.stderr,
-      allowDuplicates: false,
-      catchErrors: true,
-      addReferrer: false,
-      xhr: false,
-      keepAlive: false,
-      error: (err, url) => {
-        console.error(`ERROR ${err} at ${url}`);
-        reject();
-      },
-      // Called when there are no more requests
-      done: async () => {
-        await server.close();
-        console.log('Done!');
-        resolve();
-      },
-      headers: { 'user-agent': 'node-spider' },
-      // Use a binary encoding to preserve image files.
-      encoding: 'binary'
-    });
-    // The primary logic to handle a scraped page.
-    const handleRequest = (doc) => {
-      // Only deal with the page if it is on the server, i.e. it is not an
-      // external link.
-      if (!filter(doc.url)) return;
-      // Skip URL if it is in the exclude list.
-      if (excludeUrls.includes(getFullUrl(doc.url))) return;
-      // Grab the page's relative path and write the page contents to a local
-      // file.
-      const relPath = relativePath(doc.url);
-      extractedFiles.push(relPath);
-      console.log(`GOT ${relPath}`); // static page url
-      fs.outputFileSync(path.join(extractionDir, relPath), doc.res.body,
-          {encoding: 'binary'});
-      /**
-       * Resolves and checks if a given URL is local; if so, adds it to the
-       * scraping queue.
-       * @param {string} url The URL to process.
-       */
-      const process = (url) => {
-        // Remove trailing hash if relevant.
-        url = url.split('#')[0];
-        // Resolve URL relative to server root.
-        url = doc.resolve(url);
-        // Crawl more if the URL is on the server.
-        if (filter(url)) spider.queue(url, handleRequest);
-      };
-      const extension = getExtension(relPath);
-      if (extension == 'html') {
-        // Grab src and href attributes from html pages.
-        doc.$('[src]').each((i, elem) => {
-          process(doc.$(elem).attr('src'));
-        });
-        doc.$('[href]').each((i, elem) => {
-          process(doc.$(elem).attr('href'));
-        });
-      }
-      if (doc.url.endsWith('/service-worker.js')) {
-        // Grab additional routes.
-        const chunkFiles = getChunkFiles();
-        chunkFiles.forEach(
-            (url) => spider.queue(getFullUrl(url), handleRequest));
-      }
-      if (relPath.endsWith('/index.html') && !relPath.startsWith('/api/')) {
-        // Attempt to grab the /api/ version of a page that seems to be a
-        // basic route.
-        spider.queue(apiPath(doc.url), handleRequest);
-      }
-    };
-    // Start crawling with the document root and the service worker.
-    spider.queue(getFullUrl('/'), handleRequest);
-    spider.queue(getFullUrl('/service-worker.js'), handleRequest);
-    if (includeUrls !== null) {
-      includeUrls.forEach(
-          (url) => spider.queue(getFullUrl(url), handleRequest));
-    }
-  });
+  const server = app.listen(PORT, () => {
+    console.log(`listening on port ${PORT} and beginning extraction`);
+    return new Promise((resolve, reject) => {
+      const spider = new Spider({
+        concurrent: 5,
+        delay: 0,
+        logs: process.stderr,
+        allowDuplicates: false,
+        catchErrors: true,
+        addReferrer: false,
+        xhr: false,
+        keepAlive: false,
+        error: (err, url) => {
+          console.error(`ERROR ${err} at ${url}`);
+          reject();
+        },
+        // Called when there are no more requests
+        done: () => {
+          server.close(() => {
+            console.log('Done!');
+            resolve();
+          });
+        },
+        headers: { 'user-agent': 'node-spider' },
+        // Use a binary encoding to preserve image files.
+        encoding: 'binary'
+      });
+      // The primary logic to handle a scraped page.
+      const handleRequest = (doc) => {
+        // Only deal with the page if it is on the server, i.e. it is not an
+        // external link.
+        if (!filter(doc.url)) return;
+        // Skip URL if it is in the exclude list.
+        if (excludeUrls.includes(getFullUrl(doc.url))) return;
+        // Grab the page's relative path and write the page contents to a local
+        // file.
+        const relPath = relativePath(doc.url);
+        extractedFiles.push(relPath);
+        console.log(`GOT ${relPath}`); // static page url
+        fs.outputFileSync(path.join(extractionDir, relPath), doc.res.body,
+            {encoding: 'binary'});
+        /**
+         * Resolves and checks if a given URL is local; if so, adds it to the
+         * scraping queue.
+         * @param {string} url The URL to process.
+         */
+        const process = (url) => {
+          // Remove trailing hash if relevant.
+          url = url.split('#')[0];
+          // Resolve URL relative to server root.
+          url = doc.resolve(url);
+          // Crawl more if the URL is on the server.
+          if (filter(url)) spider.queue(url, handleRequest);
+        };
+        const extension = getExtension(relPath);
+        if (extension == 'html') {
+          // Grab src and href attributes from html pages.
+          doc.$('[src]').each((i, elem) => {
+            process(doc.$(elem).attr('src'));
+          });
+          doc.$('[href]').each((i, elem) => {
+            process(doc.$(elem).attr('href'));
+          });
+        }
+        if (doc.url.endsWith('/service-worker.js')) {
+          // Grab additional routes.
+          const chunkFiles = getChunkFiles();
+          chunkFiles.forEach(
+              (url) => spider.queue(getFullUrl(url), handleRequest));
+        }
+        if (relPath.endsWith('/index.html') && !relPath.startsWith('/api/')) {
+          // Attempt to grab the /api/ version of a page that seems to be a
+          // basic route.
+          spider.queue(apiPath(doc.url), handleRequest);
+        }
+      };
+      // Start crawling with the document root and the service worker.
+      spider.queue(getFullUrl('/'), handleRequest);
+      spider.queue(getFullUrl('/service-worker.js'), handleRequest);
+      if (includeUrls !== null) {
+        includeUrls.forEach(
+            (url) => spider.queue(getFullUrl(url), handleRequest));
+      }
+    });
+  });
 }
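The reworked `done` handler also matches how Node's `http.Server#close` actually signals completion: it takes a callback and, in Node versions current when this commit landed, does not return a promise, so `await server.close()` never really waited for shutdown. A minimal sketch of the pattern, assuming `app` is an Express-style application whose `listen` returns a Node `http.Server`:

// Sketch only; `app` is assumed to be an Express-style application.
const server = app.listen(PORT, () => {
  // The server is accepting connections once this callback runs.
  console.log(`listening on ${PORT}`);
});

// Later, when crawling has finished:
server.close(() => {
  // Runs once the server has stopped accepting connections; this callback,
  // not the return value of close(), is the reliable completion signal.
  console.log('server closed');
});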

View File

@@ -322,6 +322,9 @@ function run(env) {
        'about/index.html',
        'api/about/index.html',
        'slow-preload/index.html',
        'api/slow-preload/index.html',
        'blog/index.html',
        'api/blog/index.html',
@@ -351,11 +354,33 @@ function run(env) {
         'svelte-logo-192.png',
         'svelte-logo-512.png',
       ];
+      // Client scripts that should show up in the extraction directory.
+      const expectedClientRegexes = [
+        /client\/_\..*?\.js/,
+        /client\/about\..*?\.js/,
+        /client\/blog_\$slug\$\..*?\.js/,
+        /client\/blog\..*?\.js/,
+        /client\/main\..*?\.js/,
+        /client\/show_url\..*?\.js/,
+        /client\/slow_preload\..*?\.js/,
+      ];
       const allPages = walkSync(dest);
       expectedPages.forEach((expectedPage) => {
         assert.ok(allPages.includes(expectedPage));
       });
+      expectedClientRegexes.forEach((expectedRegex) => {
+        // Ensure each client page regular expression matches at least one
+        // generated page.
+        let matched = false;
+        for (const page of allPages) {
+          if (expectedRegex.test(page)) {
+            matched = true;
+            break;
+          }
+        }
+        assert.ok(matched);
+      });
     });
   });
 }
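The new assertions walk `allPages` by hand to check that each expected client-script pattern matches at least one extracted file. The same check can be written more compactly with `Array.prototype.some`; a minimal sketch using the variable names from the test above, offered as an alternative rather than the repository's code:

// Equivalent, more compact form of the regex assertions above.
expectedClientRegexes.forEach((expectedRegex) => {
  // `some` returns true as soon as one extracted page matches the pattern.
  assert.ok(allPages.some((page) => expectedRegex.test(page)),
      `no extracted page matches ${expectedRegex}`);
});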