Skip to content
Snippets Groups Projects
Commit 6f70fba2 authored by Terri Chu's avatar Terri Chu :nail_care: Committed by Dmitry Gruzd
Browse files

Asynchronously recreate Tanuki Bot records

Changelog: added
EE: true
parent 95b21c4c
No related branches found
No related tags found
1 merge request: !117800 "Add Tanuki Bot indexer"
Showing
with 389 additions and 2 deletions
......@@ -789,6 +789,12 @@
Settings.cron_jobs['sync_seat_link_worker'] ||= {}
Settings.cron_jobs['sync_seat_link_worker']['cron'] ||= "#{rand(60)} #{rand(3..4)} * * * UTC"
Settings.cron_jobs['sync_seat_link_worker']['job_class'] = 'SyncSeatLinkWorker'
# Recreate Tanuki Bot embedding records every weekday at 05:00 UTC.
Settings.cron_jobs['tanuki_bot_recreate_records_worker'] ||= {}
Settings.cron_jobs['tanuki_bot_recreate_records_worker']['cron'] ||= '0 5 * * 1,2,3,4,5'
# Fix: 'job_class' is code, not user-overridable config — every surrounding
# entry assigns it with plain `=`; use the same form for consistency.
Settings.cron_jobs['tanuki_bot_recreate_records_worker']['job_class'] = 'Llm::TanukiBot::RecreateRecordsWorker'
# Purge embedding records of superseded versions daily at midnight UTC.
Settings.cron_jobs['tanuki_bot_remove_previous_records_worker'] ||= {}
Settings.cron_jobs['tanuki_bot_remove_previous_records_worker']['cron'] ||= '0 0 * * *'
Settings.cron_jobs['tanuki_bot_remove_previous_records_worker']['job_class'] = 'Llm::TanukiBot::RemovePreviousRecordsWorker'
Settings.cron_jobs['users_create_statistics_worker'] ||= {}
Settings.cron_jobs['users_create_statistics_worker']['cron'] ||= '2 15 * * *'
Settings.cron_jobs['users_create_statistics_worker']['job_class'] = 'Users::CreateStatisticsWorker'
......
......@@ -311,6 +311,8 @@
- 2
- - llm_completion
- 1
- - llm_tanuki_bot_update
- 1
- - mail_scheduler
- 2
- - mailers
......
......@@ -7,11 +7,31 @@ class TanukiBotMvc < Embedding::ApplicationRecord
# pgvector nearest-neighbour support (neighbor gem) on the embedding column.
has_neighbors :embedding

# Records belonging to the embedding version currently served to users.
scope :current, -> { where(version: get_current_version) }
# Records from superseded versions; purged by RemovePreviousRecordsWorker.
scope :previous, -> { where("version < ?", get_current_version) }
# Records of the given version still awaiting an embedding from UpdateWorker.
scope :nil_embeddings_for_version, ->(version) { where(version: version, embedding: nil) }
# Nearest-neighbour search over the embedding column using inner-product
# distance, keeping only matches at or above +minimum_distance+.
#
# NOTE(review): the block form of `select` enumerates the relation, so this
# "scope" returns an Array, not a chainable relation. The scope body runs
# inside Rails scoping, so the explicit class reference presumably still
# honours an outer scope such as `.current` (see the `.current.neighbor_for`
# caller) — TODO confirm against ActiveRecord::Scoping behaviour.
scope :neighbor_for, ->(embedding, limit:, minimum_distance:) do
  ::Embedding::TanukiBotMvc
    .nearest_neighbors(:embedding, embedding, distance: 'inner_product')
    .limit(limit)
    .select { |n| n.neighbor_distance >= minimum_distance }
end
# Redis key under which the currently-served embedding version is stored.
def self.current_version_cache_key
  'tanuki_bot_mvc:version:current'
end
# Reads the currently-served embedding version from Redis.
# Returns 0 when the key has never been set (`nil.to_i`).
def self.get_current_version
  Gitlab::Redis::SharedState.with { |conn| conn.get(current_version_cache_key) }.to_i
end
# Promotes +version+ to be the currently-served embedding version.
# The trailing `.to_i` coerces the Redis reply (e.g. "OK" => 0), matching the
# original behaviour; callers ignore the return value.
def self.set_current_version!(version)
  Gitlab::Redis::SharedState.with { |conn| conn.set(current_version_cache_key, version.to_i) }.to_i
end
end
end
......@@ -444,6 +444,24 @@
:weight: 1
:idempotent: false
:tags: []
- :name: cronjob:llm_tanuki_bot_recreate_records
:worker_name: Llm::TanukiBot::RecreateRecordsWorker
:feature_category: :global_search
:has_external_dependencies: false
:urgency: :throttled
:resource_boundary: :unknown
:weight: 1
:idempotent: true
:tags: []
- :name: cronjob:llm_tanuki_bot_remove_previous_records
:worker_name: Llm::TanukiBot::RemovePreviousRecordsWorker
:feature_category: :global_search
:has_external_dependencies: false
:urgency: :throttled
:resource_boundary: :unknown
:weight: 1
:idempotent: true
:tags: []
- :name: cronjob:namespaces_free_user_cap_backfill_notification_clearing_jobs
:worker_name: Namespaces::FreeUserCap::BackfillNotificationClearingJobsWorker
:feature_category: :user_management
......@@ -1443,6 +1461,15 @@
:weight: 1
:idempotent: true
:tags: []
- :name: llm_tanuki_bot_update
:worker_name: Llm::TanukiBot::UpdateWorker
:feature_category: :global_search
:has_external_dependencies: false
:urgency: :throttled
:resource_boundary: :unknown
:weight: 1
:idempotent: true
:tags: []
- :name: merge_request_reset_approvals
:worker_name: MergeRequestResetApprovalsWorker
:feature_category: :source_code_management
......
# frozen_string_literal: true

module Llm
  module TanukiBot
    # Cron worker that re-reads every markdown file under doc/, creates a fresh
    # set of Embedding::TanukiBotMvc records for the next version, and fans out
    # one UpdateWorker per record to compute its embedding.
    class RecreateRecordsWorker
      include ApplicationWorker
      include CronjobQueue # rubocop:disable Scalability/CronWorkerContext
      include Gitlab::ExclusiveLeaseHelpers

      idempotent!
      data_consistency :always # rubocop: disable SidekiqLoadBalancing/WorkerDataConsistency
      feature_category :global_search
      urgency :throttled
      sidekiq_options retry: 3

      DOC_DIRECTORY = 'doc'
      # Target throughput for the fan-out: UpdateWorker jobs are jittered so
      # roughly this many files are embedded per minute.
      FILES_PER_MINUTE = 20

      def perform
        return unless Feature.enabled?(:openai_experimentation)
        return unless Feature.enabled?(:tanuki_bot)
        return unless Feature.enabled?(:tanuki_bot_indexing)
        return unless ::License.feature_available?(:ai_tanuki_bot)

        # Exclusive lease prevents two concurrent runs from both creating
        # records for the same target version.
        in_lock("#{self.class.name.underscore}/version/#{version}", ttl: 10.minutes, sleep_sec: 1) do
          files.each do |filename|
            content = File.read(filename)
            # Store a path relative to Rails.root. Fix: use non-mutating `sub`
            # instead of `gsub!` so the Dir entry string is left untouched
            # (the root prefix can only occur once anyway).
            relative_path = filename.sub(Rails.root.to_s, '')

            items = ::Gitlab::Llm::ContentParser.parse_and_split(content, relative_path, DOC_DIRECTORY)
            items.each do |item|
              record = create_record(item)
              Llm::TanukiBot::UpdateWorker.perform_in(rand(delay_in_seconds).seconds, record.id, version)
            end
          end
        end
      end

      private

      # All markdown documentation files shipped with the application.
      def files
        Dir[Rails.root.join("#{DOC_DIRECTORY}/**/*.md")]
      end

      # Persists one parsed chunk under the new version; the embedding column
      # stays NULL until UpdateWorker fills it in.
      def create_record(item)
        ::Embedding::TanukiBotMvc.create!(
          metadata: item[:metadata],
          content: item[:content],
          url: item[:url],
          version: version
        )
      end

      # The version being built: one past whatever is currently served.
      def version
        @version ||= ::Embedding::TanukiBotMvc.get_current_version + 1
      end

      # Upper bound (in seconds) of the random scheduling jitter, sized so the
      # whole doc set processes at ~FILES_PER_MINUTE. Fix: actually enforce the
      # 3-second minimum the original comment promised but never applied, so a
      # tiny doc set still spreads its jobs instead of `rand(0)`.
      def delay_in_seconds
        @delay_in_seconds ||= [files.count.to_f / FILES_PER_MINUTE * 60, 3].max
      end
    end
  end
end
# frozen_string_literal: true

module Llm
  module TanukiBot
    # Deletes embedding records belonging to superseded versions in batches,
    # re-enqueueing itself every 10 seconds until none remain.
    class RemovePreviousRecordsWorker
      include ApplicationWorker
      include CronjobQueue # rubocop:disable Scalability/CronWorkerContext

      idempotent!
      data_consistency :always # rubocop: disable SidekiqLoadBalancing/WorkerDataConsistency
      feature_category :global_search
      urgency :throttled

      BATCH_SIZE = 1000
      TIME_LIMIT = 3.minutes

      def perform
        return unless Feature.enabled?(:openai_experimentation)
        return unless Feature.enabled?(:tanuki_bot)
        return unless Feature.enabled?(:tanuki_bot_indexing)
        return unless ::License.feature_available?(:ai_tanuki_bot)

        stale_records = ::Embedding::TanukiBotMvc.previous
        stale_records.limit(BATCH_SIZE).delete_all

        # More stale rows left? Come back shortly for the next batch.
        self.class.perform_in(10.seconds) if stale_records.exists?
      end
    end
  end
end
# frozen_string_literal: true

module Llm
  module TanukiBot
    # Computes the OpenAI embedding for a single Embedding::TanukiBotMvc record.
    # The job that fills in the last missing embedding of a version promotes
    # that version to be the one served to users.
    class UpdateWorker
      include ApplicationWorker
      include Gitlab::ExclusiveLeaseHelpers

      idempotent!
      data_consistency :delayed
      feature_category :global_search
      urgency :throttled
      sidekiq_options retry: 1

      # @param id [Integer] primary key of the embedding record to update
      # @param version [Integer] embedding version the record belongs to
      def perform(id, version)
        return unless Feature.enabled?(:openai_experimentation)
        return unless Feature.enabled?(:tanuki_bot)
        return unless Feature.enabled?(:tanuki_bot_indexing)
        return unless ::License.feature_available?(:ai_tanuki_bot)

        # Record may have been deleted (e.g. version cleanup) since enqueueing.
        record = ::Embedding::TanukiBotMvc.find_by_id(id)
        return unless record

        client = ::Gitlab::Llm::OpenAi::Client.new(nil)
        result = client.embeddings(input: record.content, moderated: false)

        # Raise so Sidekiq retries (retry: 1) on API failure or missing payload.
        unless result.success? && result.has_key?('data')
          raise StandardError, result.dig('error', 'message') || "Could not generate embedding: '#{result}'"
        end

        embedding = result['data'].first['embedding']
        record.update!(embedding: embedding)

        # If any sibling record of this version still lacks an embedding, the
        # version is not complete yet — bail out without promoting.
        return if ::Embedding::TanukiBotMvc.nil_embeddings_for_version(version).exists?

        # Lease guards against two "last" jobs promoting concurrently.
        # NOTE(review): a job slipping through between the exists? check and
        # this lock appears harmless — both would set the same version.
        in_lock("#{self.class.name.underscore}/version/#{version}", sleep_sec: 1) do
          ::Embedding::TanukiBotMvc.set_current_version!(version)

          logger.info(
            structured_payload(
              message: 'Updated current version',
              version: version
            )
          )
        end
      end
    end
  end
end
---
name: tanuki_bot_indexing
introduced_by_url: https://gitlab.com/gitlab-org/gitlab/-/merge_requests/117800
rollout_issue_url: https://gitlab.com/gitlab-org/gitlab/-/issues/410682
milestone: '16.1'
type: development
group: group::global search
default_enabled: false
# frozen_string_literal: true

# Adds an integer `version` column (default 0, NOT NULL) to tanuki_bot_mvc so
# a new set of embedding records can be built while the old set is still served.
class AddVersionToTanukiBotMvc < Gitlab::Database::Migration[2.1]
  enable_lock_retries!

  def up
    add_column :tanuki_bot_mvc, :version, :integer, default: 0, null: false
  end

  def down
    remove_column :tanuki_bot_mvc, :version
  end
end
# frozen_string_literal: true

# Allows tanuki_bot_mvc.embedding to be NULL: records are now created first and
# their embeddings filled in asynchronously by UpdateWorker.
class RemoveEmbeddingNotNullFromTanukiBotMvc < Gitlab::Database::Migration[2.1]
  def up
    change_column_null :tanuki_bot_mvc, :embedding, true
  end

  def down
    # no-op : can't go back to `NULL` without first dropping the `NOT NULL` constraint
  end
end
# frozen_string_literal: true

# Index supporting the `current` / `previous` scopes, which filter and compare
# on tanuki_bot_mvc.version.
class AddIndexOnVersionToTanukiBotMvc < Gitlab::Database::Migration[2.1]
  INDEX_NAME = 'index_tanuki_bot_mvc_on_version'

  # Concurrent index creation cannot run inside a transaction.
  disable_ddl_transaction!

  def up
    add_concurrent_index :tanuki_bot_mvc, :version, name: INDEX_NAME
  end

  def down
    remove_concurrent_index_by_name :tanuki_bot_mvc, INDEX_NAME
  end
end
# frozen_string_literal: true

# Partial index supporting the `nil_embeddings_for_version` scope (version
# lookup restricted to rows whose embedding has not been computed yet).
class AddIndexOnVersionWhereEmbeddingIsNullToTanukiBotMvc < Gitlab::Database::Migration[2.1]
  INDEX_NAME = 'index_tanuki_bot_mvc_on_version_where_embedding_is_null'

  # Concurrent index creation cannot run inside a transaction.
  disable_ddl_transaction!

  def up
    add_concurrent_index :tanuki_bot_mvc, :version, where: 'embedding IS NULL', name: INDEX_NAME
  end

  def down
    remove_concurrent_index_by_name :tanuki_bot_mvc, INDEX_NAME
  end
end
367b4fc9fce29edfa5af2fb05ac9068686451ac5911b67891307011c33f75449
\ No newline at end of file
b41b536487c231c72c15d33541970bdb12daec37bd28e56f19ccd8a363044ad6
\ No newline at end of file
e36674b18b88a3ecd43e43314f9cd254bd9021955101a576120a36b3dda729f7
\ No newline at end of file
645ef40a92c9ef6a5af8d931eecbff7eb7bed208dd61aa9a5d4705c8aa9a7c9e
\ No newline at end of file
......@@ -15,11 +15,12 @@ CREATE TABLE tanuki_bot_mvc (
id bigint NOT NULL,
created_at timestamp with time zone NOT NULL,
updated_at timestamp with time zone NOT NULL,
embedding vector(1536) NOT NULL,
embedding vector(1536),
url text NOT NULL,
content text NOT NULL,
metadata jsonb NOT NULL,
chroma_id text,
version integer DEFAULT 0 NOT NULL,
CONSTRAINT check_5df597f0fb CHECK ((char_length(url) <= 2048)),
CONSTRAINT check_67053ce605 CHECK ((char_length(content) <= 32768)),
CONSTRAINT check_e130e042d4 CHECK ((char_length(chroma_id) <= 512))
......@@ -46,3 +47,7 @@ ALTER TABLE ONLY tanuki_bot_mvc
ADD CONSTRAINT tanuki_bot_mvc_pkey PRIMARY KEY (id);
CREATE UNIQUE INDEX index_tanuki_bot_mvc_on_chroma_id ON tanuki_bot_mvc USING btree (chroma_id);
CREATE INDEX index_tanuki_bot_mvc_on_version ON tanuki_bot_mvc USING btree (version);
CREATE INDEX index_tanuki_bot_mvc_on_version_where_embedding_is_null ON tanuki_bot_mvc USING btree (version) WHERE (embedding IS NULL);
# frozen_string_literal: true

module Gitlab
  module Llm
    # Splits a documentation page into embedding-sized chunks and extracts its
    # YAML front matter, title and public help-page URL.
    class ContentParser
      include ::Gitlab::Loggable

      MAX_CHARS_PER_EMBEDDING = 1500
      MIN_CHARS_PER_EMBEDDING = 100

      class << self
        # Returns an array of { content:, metadata:, url: } hashes, one per
        # chunk. Chunks that are nil (too-short remainder) or that contain no
        # word characters are discarded.
        def parse_and_split(content, source_name, source_type)
          items = []
          content, metadata, url = parse_content_and_metadata(content, source_name, source_type)

          split_by_newline_positions(content) do |text|
            next if text.nil?
            next unless text.match?(/\w/)

            items << {
              content: text,
              metadata: metadata,
              url: url
            }
          end
          items
        end

        # Strips leading YAML front matter (--- ... ---) from +content+ and
        # merges it with the page title, source name and source type.
        # Returns [content-without-front-matter, metadata, url].
        def parse_content_and_metadata(content, source_name, source_type)
          metadata = if content.match?(metadata_regex)
                       metadata = YAML.safe_load(content.match(metadata_regex)[:metadata])
                       content = content.gsub(metadata_regex, '').strip
                       metadata
                     else
                       {}
                     end

          metadata['title'] = title(content)
          metadata['source'] = source_name
          metadata['source_type'] = source_type
          url = url(source_name, source_type)

          [content, metadata, url]
        end

        # Yields consecutive chunks of +content+, preferring newline boundaries
        # so each chunk lands within
        # [MIN_CHARS_PER_EMBEDDING, MAX_CHARS_PER_EMBEDDING).
        # Text with no usable newline inside the window falls through to the
        # second loop, which hard-slices at MAX_CHARS_PER_EMBEDDING; a final
        # slice shorter than the minimum is yielded as nil (dropped upstream).
        def split_by_newline_positions(content)
          # Short enough for a single chunk (but long enough to embed).
          if content.length < MAX_CHARS_PER_EMBEDDING && content.length >= MIN_CHARS_PER_EMBEDDING
            yield content
            return
          end

          # Character offsets of every newline in the content.
          positions = content.enum_for(:scan, /\n/).map { Regexp.last_match.begin(0) }
          cursor = 0

          # Greedily cut at the furthest newline within the size window.
          while position = positions.select { |s| s > cursor && s - cursor <= MAX_CHARS_PER_EMBEDDING }.max
            # Window is under the minimum: skip past this newline and retry.
            if content[cursor...position].length < MIN_CHARS_PER_EMBEDDING
              cursor = position + 1
              next
            end

            yield content[cursor...position]
            cursor = position + 1
          end

          # Remainder without suitable newlines: hard-split by maximum size.
          while cursor < content.length
            content[cursor...].chars.each_slice(MAX_CHARS_PER_EMBEDDING) do |slice|
              # Only the final slice can be under the minimum; signal it with
              # nil and jump the cursor to the end to terminate the loop.
              if slice.length < MIN_CHARS_PER_EMBEDDING
                yield nil
                cursor = content.length
                next
              end

              yield slice.join("")
              cursor += slice.length
            end
          end
        end

        # Public help-page URL for doc sources; nil for any other source type.
        def url(source_name, source_type)
          return unless source_name
          return unless source_type == 'doc'

          page = source_name.gsub('/doc/', '').gsub('.md', '')
          ::Gitlab::Routing.url_helpers.help_page_url(page)
        end

        # First markdown heading with any trailing "**(TIER BADGE)**" removed.
        # NOTE(review): the regex requires a newline after the heading, so a
        # heading on the very last line without one is not matched.
        def title(content)
          return unless content

          match = content.match(/#+\s+(?<title>.+)\n/)
          return unless match && match[:title]

          match[:title].gsub(/\*\*\(.+\)\*\*$/, '').strip
        end

        private

        # Matches YAML front matter delimited by `---` lines at start of file.
        def metadata_regex
          /\A---$\n(?<metadata>(?<anything>[^\n]|\n)+)---$/
        end
      end
    end
  end
end
......@@ -141,7 +141,7 @@ def query_search_documents
embeddings_result = client.embeddings(input: question, moderated: true)
question_embedding = embeddings_result['data'].first['embedding']
::Embedding::TanukiBotMvc.neighbor_for(
::Embedding::TanukiBotMvc.current.neighbor_for(
question_embedding,
limit: RECORD_LIMIT,
minimum_distance: MINIMUM_DISTANCE
......
---
type: reference
group: Unknown
info: Test Information
---
# Heading 1 **(PREMIUM SELF)**
Instructions:
- bullet 1
- bullet 2
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment