Skip to content

Refactor language aggregation to use terms instead of composite

Summary

The language aggregations used for blob search currently use a composite aggregation. During testing I found the following issues with the aggregation buckets it returns:

  • buckets are returned in the natural order of their keys (not by document count descending)
  • composite aggregations are expensive to run, so using one here has a performance cost

Steps to reproduce

Run the query below against staging with a common search term (such as '*' or 'test') and compare the buckets returned by the "lang" (terms) aggregation with those returned by the "language" (composite) aggregation.

blob Elasticsearch query
{
  "query": {
    "bool": {
      "must": {
        "simple_query_string": {
          "_name": "blob:match:search_terms",
          "fields": [
            "blob.content",
            "blob.file_name",
            "blob.path"
          ],
          "query": "*",
          "default_operator": "and"
        }
      },
      "must_not": [],
      "should": [],
      "filter": [
        {
          "has_parent": {
            "_name": "blob:authorized:project",
            "parent_type": "project",
            "query": {
              "bool": {
                "should": [
                  {
                    "bool": {
                      "filter": [
                        {
                          "term": {
                            "visibility_level": {
                              "_name": "blob:authorized:project:any",
                              "value": 0
                            }
                          }
                        },
                        {
                          "terms": {
                            "_name": "blob:authorized:project:repository:enabled_or_private",
                            "repository_access_level": [
                              20,
                              10
                            ]
                          }
                        }
                      ]
                    }
                  },
                  {
                    "bool": {
                      "_name": "blob:authorized:project:visibility:10:repository:access_level",
                      "filter": [
                        {
                          "term": {
                            "visibility_level": {
                              "_name": "blob:authorized:project:visibility:10",
                              "value": 10
                            }
                          }
                        },
                        {
                          "terms": {
                            "_name": "blob:authorized:project:visibility:10:repository:access_level:enabled_or_private",
                            "repository_access_level": [
                              20,
                              10
                            ]
                          }
                        }
                      ]
                    }
                  },
                  {
                    "bool": {
                      "_name": "blob:authorized:project:visibility:20:repository:access_level",
                      "filter": [
                        {
                          "term": {
                            "visibility_level": {
                              "_name": "blob:authorized:project:visibility:20",
                              "value": 20
                            }
                          }
                        },
                        {
                          "terms": {
                            "_name": "blob:authorized:project:visibility:20:repository:access_level:enabled_or_private",
                            "repository_access_level": [
                              20,
                              10
                            ]
                          }
                        }
                      ]
                    }
                  }
                ]
              }
            }
          }
        },
        {
          "term": {
            "type": {
              "_name": "doc:is_a:blob",
              "value": "blob"
            }
          }
        }
      ]
    }
  },
  "size": 20,
  "from": 0,
  "sort": [
    "_score"
  ],
  "highlight": {
    "pre_tags": [
      "gitlabelasticsearch→"
    ],
    "post_tags": [
      "←gitlabelasticsearch"
    ],
    "number_of_fragments": 0,
    "fields": {
      "blob.content": {},
      "blob.file_name": {}
    }
  },
  "aggs": {
    "lang": {
      "terms": {
        "field": "blob.language",
        "size": 100
      }
    },
    "language": {
      "composite": {
        "sources": [
          {
            "language": {
              "terms": {
                "field": "blob.language"
              }
            }
          }
        ]
      }
    }
  }
}

composite aggregation (current)

drawing

terms aggregation (proposed)

drawing

Possible fixes

I recommend we switch to a terms aggregation because it performs better and automatically sorts buckets by count descending. I've tested this in staging and it works.

Edited by Terri Chu