Investigate: merge different paths in Algolia's crawler
The Algolia crawler parses the whole site based on a configuration file that is defined at https://crawler.algolia.com/admin/crawlers/d46abdc0-bb41-4d50-95b7-a3e1fe6469a4/configuration/edit.
Our config has four different paths based on the four different products:
Click to expand
actions: [
{
indexName: "gitlab",
pathsToMatch: ["https://docs.gitlab.com/ee/**"],
recordExtractor: ({ $, helpers }) => {
// Stop if one of those text is found in the DOM.
const body = $.text();
const toCheck = [
"This document was moved to",
"This section is now merged into",
"404 Not Found",
];
const shouldStop = toCheck.some((text) => body.includes(text));
if (shouldStop) {
return [];
} // Removing DOM elements we don't want to crawl
const toRemove = "#markdown-toc";
$(toRemove).remove();
return helpers.docsearch({
recordProps: {
lvl1: ".article-content h1",
content:
".article-content p, .article-content li, .article-content td:last-child, .article-content pre.highlight code",
lvl0: {
selectors: ".article-content h1",
defaultValue: "Documentation",
},
lvl2: ".article-content h2",
lvl3: ".article-content h3",
lvl4: ".article-content h4",
lvl5: ".article-content h5, .article-content td:first-child",
product: {
defaultValue: "GitLab",
},
tags: {
defaultValue: ["gitlab"],
},
},
indexHeadings: true,
});
},
},
{
indexName: "gitlab",
pathsToMatch: ["https://docs.gitlab.com/omnibus/**"],
recordExtractor: ({ $, helpers }) => {
// Stop if one of those text is found in the DOM.
const body = $.text();
const toCheck = [
"This document was moved to",
"This section is now merged into",
"404 Not Found",
];
const shouldStop = toCheck.some((text) => body.includes(text));
if (shouldStop) {
return [];
} // Removing DOM elements we don't want to crawl
const toRemove = "#markdown-toc";
$(toRemove).remove();
return helpers.docsearch({
recordProps: {
lvl1: ".article-content h1",
content:
".article-content p, .article-content li, .article-content td:last-child, .article-content pre.highlight code",
lvl0: {
selectors: ".article-content h1",
defaultValue: "Documentation",
},
lvl2: ".article-content h2",
lvl3: ".article-content h3",
lvl4: ".article-content h4",
lvl5: ".article-content h5, .article-content td:first-child",
product: {
defaultValue: "Omnibus GitLab",
},
tags: {
defaultValue: ["omnibus"],
},
},
indexHeadings: true,
});
},
},
{
indexName: "gitlab",
pathsToMatch: ["https://docs.gitlab.com/runner/**"],
recordExtractor: ({ $, helpers }) => {
// Stop if one of those text is found in the DOM.
const body = $.text();
const toCheck = [
"This document was moved to",
"This section is now merged into",
"404 Not Found",
];
const shouldStop = toCheck.some((text) => body.includes(text));
if (shouldStop) {
return [];
} // Removing DOM elements we don't want to crawl
const toRemove = "#markdown-toc";
$(toRemove).remove();
return helpers.docsearch({
recordProps: {
lvl1: ".article-content h1",
content:
".article-content p, .article-content li, .article-content td:last-child, .article-content pre.highlight code",
lvl0: {
selectors: ".article-content h1",
defaultValue: "Documentation",
},
lvl2: ".article-content h2",
lvl3: ".article-content h3",
lvl4: ".article-content h4",
lvl5: ".article-content h5, .article-content td:first-child",
product: {
defaultValue: "GitLab Runner",
},
tags: {
defaultValue: ["runner"],
},
},
indexHeadings: true,
});
},
},
{
indexName: "gitlab",
pathsToMatch: ["https://docs.gitlab.com/charts/**"],
recordExtractor: ({ $, helpers }) => {
// Stop if one of those text is found in the DOM.
const body = $.text();
const toCheck = [
"This document was moved to",
"This section is now merged into",
"404 Not Found",
];
const shouldStop = toCheck.some((text) => body.includes(text));
if (shouldStop) {
return [];
} // Removing DOM elements we don't want to crawl
const toRemove = "#markdown-toc";
$(toRemove).remove();
return helpers.docsearch({
recordProps: {
lvl1: ".article-content h1",
content:
".article-content p, .article-content li, .article-content td:last-child, .article-content pre.highlight code",
lvl0: {
selectors: ".article-content h1",
defaultValue: "Documentation",
},
lvl2: ".article-content h2",
lvl3: ".article-content h3",
lvl4: ".article-content h4",
lvl5: ".article-content h5, .article-content td:first-child",
product: {
defaultValue: "GitLab Helm Charts",
},
tags: {
defaultValue: ["charts"],
},
},
indexHeadings: true,
});
},
},
],
With this functionality, we can filter the results based on the page's path and, in effect, by product. Algolia calls these tags. This is currently shown only under https://docs.gitlab.com/search/, and you can see it in action at, for example, https://docs.gitlab.com/search/?query=install, where you can filter among the four products.
The question is: do we still need that functionality? We're moving more and more docs under gitlab-org/gitlab, and we won't be able to take advantage of tags filtering.
Proposal
Get rid of the tags and have only one path to crawl.
Relevant issues/MRs
Some relevant MRs that added this functionality: