Mirror of https://github.com/kevin-DL/sapper.git (synced 2026-01-23 15:41:32 +00:00)
Removes all async/await from the extraction pipeline and adds unit tests that check the extracted client pages against expected regular expressions
@@ -98,7 +98,7 @@ function getChunkFiles() {
  * @param {number=} extractionDir The directory in which to place the extracted
  * output.
  */
-module.exports = async function(includeUrls = null, excludeUrls = null,
+module.exports = function(includeUrls = null, excludeUrls = null,
     extractionDir = OUTPUT_DIR) {
   // Set up the server.
 
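For context on what dropping the `async` keyword means for callers, a general JavaScript illustration (not code from this repository): an `async` function always wraps its result in a Promise, while a plain function returns its value directly.

    // General JavaScript behaviour, shown with throwaway names a() and b().
    async function a() { return 1; }  // a() evaluates to a Promise that resolves to 1
    function b() { return 1; }        // b() evaluates to 1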
@@ -127,97 +127,98 @@ module.exports = async function(includeUrls = null, excludeUrls = null,
   // scraper. The program automatically exits after all the static pages have
   // been scraped from the server that are accessible from the root page (`/`).
   const extractedFiles = []; // keep track of extracted files.
-  const server = await app.listen(PORT);
-  console.log(`listening on port ${PORT} and beginning extraction`);
-
-  return new Promise((resolve, reject) => {
-    const spider = new Spider({
-      concurrent: 5,
-      delay: 0,
-      logs: process.stderr,
-      allowDuplicates: false,
-      catchErrors: true,
-      addReferrer: false,
-      xhr: false,
-      keepAlive: false,
-      error: (err, url) => {
-        console.error(`ERROR ${err} at ${url}`);
-        reject();
-      },
-      // Called when there are no more requests
-      done: async () => {
-        await server.close();
-        console.log('Done!');
-        resolve();
-      },
-
-      headers: { 'user-agent': 'node-spider' },
-      // Use a binary encoding to preserve image files.
-      encoding: 'binary'
-    });
-
-    // The primary logic to handle a scraped page.
-    const handleRequest = (doc) => {
-      // Only deal with the page if it is on the server, i.e. it is not an
-      // external link.
-      if (!filter(doc.url)) return;
-      // Skip URL if it is in the exclude list.
-      if (excludeUrls.includes(getFullUrl(doc.url))) return;
-
-      // Grab the page's relative path and write the page contents to a local
-      // file.
-      const relPath = relativePath(doc.url);
-      extractedFiles.push(relPath);
-      console.log(`GOT ${relPath}`); // static page url
-      fs.outputFileSync(path.join(extractionDir, relPath), doc.res.body,
-          {encoding: 'binary'});
-
-      /**
-       * Resolves and checks if a given URL is local; if so, adds it to the
-       * scraping queue.
-       * @param {string} url The URL to process.
-       */
-      const process = (url) => {
-        // Remove trailing hash if relevant.
-        url = url.split('#')[0];
-        // Resolve URL relative to server root.
-        url = doc.resolve(url);
-        // Crawl more if the URL is on the server.
-        if (filter(url)) spider.queue(url, handleRequest);
-      };
-
-      const extension = getExtension(relPath);
-      if (extension == 'html') {
-        // Grab src and href attributes from html pages.
-        doc.$('[src]').each((i, elem) => {
-          process(doc.$(elem).attr('src'));
-        });
-        doc.$('[href]').each((i, elem) => {
-          process(doc.$(elem).attr('href'));
-        });
-      }
-
-      if (doc.url.endsWith('/service-worker.js')) {
-        // Grab additional routes.
-        const chunkFiles = getChunkFiles();
-        chunkFiles.forEach(
-            (url) => spider.queue(getFullUrl(url), handleRequest));
-      }
-
-      if (relPath.endsWith('/index.html') && !relPath.startsWith('/api/')) {
-        // Attempt to grab the /api/ version of a page that seems to be a
-        // basic route.
-        spider.queue(apiPath(doc.url), handleRequest);
-      }
-    };
-
-    // Start crawling with the document root and the service worker.
-    spider.queue(getFullUrl('/'), handleRequest);
-    spider.queue(getFullUrl('/service-worker.js'), handleRequest);
-
-    if (includeUrls !== null) {
-      includeUrls.forEach(
-          (url) => spider.queue(getFullUrl(url), handleRequest));
-    }
+  const server = app.listen(PORT, () => {
+    console.log(`listening on port ${PORT} and beginning extraction`);
+
+    return new Promise((resolve, reject) => {
+      const spider = new Spider({
+        concurrent: 5,
+        delay: 0,
+        logs: process.stderr,
+        allowDuplicates: false,
+        catchErrors: true,
+        addReferrer: false,
+        xhr: false,
+        keepAlive: false,
+        error: (err, url) => {
+          console.error(`ERROR ${err} at ${url}`);
+          reject();
+        },
+        // Called when there are no more requests
+        done: () => {
+          server.close(() => {
+            console.log('Done!');
+            resolve();
+          });
+        },
+
+        headers: { 'user-agent': 'node-spider' },
+        // Use a binary encoding to preserve image files.
+        encoding: 'binary'
+      });
+
+      // The primary logic to handle a scraped page.
+      const handleRequest = (doc) => {
+        // Only deal with the page if it is on the server, i.e. it is not an
+        // external link.
+        if (!filter(doc.url)) return;
+        // Skip URL if it is in the exclude list.
+        if (excludeUrls.includes(getFullUrl(doc.url))) return;
+
+        // Grab the page's relative path and write the page contents to a local
+        // file.
+        const relPath = relativePath(doc.url);
+        extractedFiles.push(relPath);
+        console.log(`GOT ${relPath}`); // static page url
+        fs.outputFileSync(path.join(extractionDir, relPath), doc.res.body,
+            {encoding: 'binary'});
+
+        /**
+         * Resolves and checks if a given URL is local; if so, adds it to the
+         * scraping queue.
+         * @param {string} url The URL to process.
+         */
+        const process = (url) => {
+          // Remove trailing hash if relevant.
+          url = url.split('#')[0];
+          // Resolve URL relative to server root.
+          url = doc.resolve(url);
+          // Crawl more if the URL is on the server.
+          if (filter(url)) spider.queue(url, handleRequest);
+        };
+
+        const extension = getExtension(relPath);
+        if (extension == 'html') {
+          // Grab src and href attributes from html pages.
+          doc.$('[src]').each((i, elem) => {
+            process(doc.$(elem).attr('src'));
+          });
+          doc.$('[href]').each((i, elem) => {
+            process(doc.$(elem).attr('href'));
+          });
+        }
+
+        if (doc.url.endsWith('/service-worker.js')) {
+          // Grab additional routes.
+          const chunkFiles = getChunkFiles();
+          chunkFiles.forEach(
+              (url) => spider.queue(getFullUrl(url), handleRequest));
+        }
+
+        if (relPath.endsWith('/index.html') && !relPath.startsWith('/api/')) {
+          // Attempt to grab the /api/ version of a page that seems to be a
+          // basic route.
+          spider.queue(apiPath(doc.url), handleRequest);
+        }
+      };
+
+      // Start crawling with the document root and the service worker.
+      spider.queue(getFullUrl('/'), handleRequest);
+      spider.queue(getFullUrl('/service-worker.js'), handleRequest);
+
+      if (includeUrls !== null) {
+        includeUrls.forEach(
+            (url) => spider.queue(getFullUrl(url), handleRequest));
+      }
+    });
   });
 }
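The substance of this hunk is the server start-up and the `done` handler: `await app.listen(PORT)` and `await server.close()` are replaced by Node's callback forms, and the crawl is set up inside the `app.listen` callback. A minimal, self-contained sketch of that callback pattern, using a hypothetical Express app and port rather than the ones in this repository:

    const express = require('express');  // stand-in app framework for the sketch

    const app = express();
    const PORT = 3000;  // hypothetical port, not the repository's PORT constant

    // Start the server; the callback runs once the port is bound.
    const server = app.listen(PORT, () => {
      console.log(`listening on port ${PORT}`);

      // ... do work against the running server, then shut down.
      // server.close() takes a completion callback instead of a value to await.
      server.close(() => {
        console.log('Done!');
      });
    });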
@@ -322,6 +322,9 @@ function run(env) {
       'about/index.html',
       'api/about/index.html',
 
+      'slow-preload/index.html',
+      'api/slow-preload/index.html',
+
       'blog/index.html',
       'api/blog/index.html',
 
@@ -351,11 +354,33 @@ function run(env) {
       'svelte-logo-192.png',
       'svelte-logo-512.png',
     ];
+    // Client scripts that should show up in the extraction directory.
+    const expectedClientRegexes = [
+      /client\/_\..*?\.js/,
+      /client\/about\..*?\.js/,
+      /client\/blog_\$slug\$\..*?\.js/,
+      /client\/blog\..*?\.js/,
+      /client\/main\..*?\.js/,
+      /client\/show_url\..*?\.js/,
+      /client\/slow_preload\..*?\.js/,
+    ];
     const allPages = walkSync(dest);
 
     expectedPages.forEach((expectedPage) => {
       assert.ok(allPages.includes(expectedPage));
     });
+    expectedClientRegexes.forEach((expectedRegex) => {
+      // Ensure each client page regular expression matches at least one
+      // generated page.
+      let matched = false;
+      for (const page of allPages) {
+        if (expectedRegex.test(page)) {
+          matched = true;
+          break;
+        }
+      }
+      assert.ok(matched);
+    });
   });
 });
 }
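The added test sets a `matched` flag in a manual loop; the same at-least-one-match check can be written with `Array.prototype.some`. A standalone sketch with made-up file names (the regexes mirror two of the ones added above, the page list is hypothetical, not a real extraction run):

    const assert = require('assert');

    // Hypothetical extraction output for the sketch.
    const allPages = [
      'client/main.1a2b3c.js',
      'client/about.4d5e6f.js',
      'index.html',
    ];

    const expectedClientRegexes = [
      /client\/main\..*?\.js/,
      /client\/about\..*?\.js/,
    ];

    // Each regex must match at least one generated page.
    expectedClientRegexes.forEach((expectedRegex) => {
      assert.ok(allPages.some((page) => expectedRegex.test(page)));
    });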