Parallel processing [note]
Hello, just sharing a note. We use html-validate in CI, and it is the single biggest consumer of time in our CI.
Our html-validate setup also does external-link checking with caching, using cURL. You can see that here: https://github.com/fulldecent/github-pages-template
So we want to run it in parallel, with one worker per processor. Here is how that works:
Main file:

```js
// html-validate.mjs
import { globSync } from "glob";
import { Worker } from "worker_threads";
import path from "path";
import { fileURLToPath } from "url";
import fs from "fs";

const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

// --- Configuration ---
// In the future, the CLI may improve and this script may be unnecessary.
// SEE: https://gitlab.com/html-validate/html-validate/-/issues/271
const MAX_WORKERS = parseInt(process.env.HTML_VALIDATE_WORKERS, 10) || 4;
const WORKER_SCRIPT_PATH = path.join(__dirname, "html-validate-worker.mjs");

/**
 * Gathers target HTML files from command-line arguments or a default directory.
 * @returns {string[]} A sorted and deduplicated array of HTML file paths.
 */
function getTargetFiles() {
  const args = process.argv.slice(2);

  // If no arguments are provided, use the default glob pattern.
  if (args.length === 0) {
    console.log("ℹ️ No paths provided. Searching for HTML files in `build/` directory...");
    return globSync("build/**/*.html").sort();
  }

  // If arguments are provided, process them into a list of glob patterns.
  const patterns = args.map((arg) => {
    try {
      // Check if the argument is a directory.
      if (fs.statSync(arg).isDirectory()) {
        // If it is, create a glob pattern to find all HTML files within it.
        return path.join(arg, "**", "*.html");
      }
    } catch (error) {
      // If fs.statSync fails, the path might not exist or isn't a directory.
      // In that case, we assume it's a file path or a glob pattern and use it directly.
    }
    // Return the argument as-is for glob to process.
    return arg;
  });

  console.log(`ℹ️ Searching for files matching: ${patterns.join(", ")}`);

  // Use glob to find all files matching the generated patterns.
  const files = globSync(patterns, { nodir: true });

  // Return a deduplicated and sorted list of files.
  return [...new Set(files)].sort();
}

const targets = getTargetFiles();

if (targets.length === 0) {
  console.log("⚠️ No HTML files found in build directory");
  console.log(" Make sure to build the site first");
  process.exit(0);
}

console.log(`🧪 Validating ${targets.length} files with up to ${MAX_WORKERS} parallel workers...`);

await validateParallel();

async function validateParallel() {
  let completedTasks = 0;
  let allTestsPassed = true;
  const results = [];
  const workers = [];
  const taskQueue = [...targets];
  let isDone = false;

  function completeParallelProcessing() {
    if (isDone) return;
    isDone = true;
    workers.forEach((worker) => worker.terminate());

    const failedResults = results.filter((r) => !r.isValid);
    const passedCount = results.length - failedResults.length;

    console.log("\n📊 Results summary:");
    console.log(`✅ ${passedCount} files passed validation`);
    // Fail the build if any file failed validation or any worker crashed.
    if (failedResults.length > 0 || !allTestsPassed) {
      console.log(`❌ ${failedResults.length} files failed validation`);
      process.exit(1);
    } else {
      console.log("✨ All tests passed!\n");
    }
  }

  function createWorker(workerId) {
    const worker = new Worker(WORKER_SCRIPT_PATH);

    worker.on("message", (result) => {
      completedTasks++;
      results.push(result);

      const relativeFilePath = path.relative(process.cwd(), result.filePath);
      if (!result.isValid) {
        allTestsPassed = false;
        console.log(`❌ (${completedTasks} of ${targets.length}) ${relativeFilePath}`);
        // Print error messages with indentation
        const errorLines = result.message.trim().split("\n");
        errorLines.forEach((line) => {
          console.log(`  - ${line}`);
        });
      } else {
        console.log(`✅ (${completedTasks} of ${targets.length}) ${relativeFilePath}`);
      }

      // Hand this worker its next file, if any remain.
      if (taskQueue.length > 0) {
        const nextTask = taskQueue.shift();
        worker.postMessage({ filePath: nextTask, workerId });
      }

      if (completedTasks === targets.length) {
        completeParallelProcessing();
      }
    });

    worker.on("error", (error) => {
      console.error(`Worker ${workerId} error:`, error);
      allTestsPassed = false;
      completeParallelProcessing();
    });

    return worker;
  }

  function startParallelProcessing() {
    const workerCount = Math.min(MAX_WORKERS, taskQueue.length);
    for (let i = 0; i < workerCount; i++) {
      const worker = createWorker(i);
      workers.push(worker);
    }

    // Seed each worker with its first file; the rest are handed out as results come back.
    workers.forEach((worker, i) => {
      if (taskQueue.length > 0) {
        const task = taskQueue.shift();
        worker.postMessage({ filePath: task, workerId: i });
      }
    });
  }

  return new Promise((resolve) => {
    // Wrap the completion handler so the promise resolves when processing finishes.
    const originalComplete = completeParallelProcessing;
    completeParallelProcessing = () => {
      originalComplete();
      resolve();
    };

    if (targets.length > 0) {
      startParallelProcessing();
    } else {
      completeParallelProcessing();
    }
  });
}
```
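For reference, this runs as a plain Node script: `node html-validate.mjs` checks everything under `build/`, passing directories, files, or globs narrows it down, and `HTML_VALIDATE_WORKERS` overrides the pool size. Since the goal is one worker per processor, the hard-coded default of 4 could also be derived from the machine itself; a minimal sketch of that alternative (assuming Node 18.14+ for `os.availableParallelism`, falling back to `os.cpus()` on older versions):

```js
// Hypothetical alternative to the fixed default of 4 workers:
// size the pool from the runner's CPU count unless overridden.
import os from "os";

const MAX_WORKERS =
  parseInt(process.env.HTML_VALIDATE_WORKERS, 10) ||
  (typeof os.availableParallelism === "function"
    ? os.availableParallelism()
    : os.cpus().length);
```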
Worker file:

```js
// html-validate-worker.mjs
import { parentPort } from "worker_threads";
import { HtmlValidate, FileSystemConfigLoader, formatterFactory, esmResolver } from "html-validate";

// Initialize the HtmlValidate instance once per worker
const resolver = esmResolver();
const loader = new FileSystemConfigLoader([resolver]);
const htmlValidate = new HtmlValidate(loader);
const formatter = formatterFactory("text");

// Listen for messages from parent thread
parentPort.on("message", async (data) => {
  const { filePath, workerId } = data;
  try {
    const report = await htmlValidate.validateFile(filePath);
    const result = {
      workerId,
      filePath,
      success: report.valid,
      message: report.valid ? `✅ ${filePath}` : formatter(report.results),
      isValid: report.valid,
      report: report,
    };
    parentPort.postMessage(result);
  } catch (error) {
    const result = {
      workerId,
      filePath,
      success: false,
      message: `❌ Error validating ${filePath}: ${error.message}`,
      isValid: false,
    };
    parentPort.postMessage(result);
  }
});
```
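If you want to exercise the worker by itself (for example while debugging a configuration problem), a small throwaway driver along these lines works; `build/index.html` below is just a placeholder path:

```js
// try-worker.mjs — hypothetical one-off driver for running the worker in isolation
import { Worker } from "worker_threads";

const worker = new Worker(new URL("./html-validate-worker.mjs", import.meta.url));

worker.on("message", (result) => {
  // The worker replies with { workerId, filePath, success, message, isValid, report }
  console.log(result.isValid ? "valid:" : "invalid:", result.filePath);
  if (!result.isValid) {
    console.log(result.message);
  }
  worker.terminate();
});

// Same message shape the main script sends
worker.postMessage({ filePath: "build/index.html", workerId: 0 });
```

Each worker builds its HtmlValidate instance once at startup and reuses it for every file it is handed, which keeps per-file setup cost down.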
Anybody is welcome to steal this for their own use.
I'm not sure whether a future version of html-validate will do some of this itself through its main bin/ entry point, but I'm sharing what we have now in case it helps anyone.