From 8f1b9544dc5ac29629d6a1d23a88e6b80c5df739 Mon Sep 17 00:00:00 2001 From: Jacob Schatz Date: Tue, 1 May 2018 11:44:55 -0400 Subject: [PATCH 01/63] Adds method to move tables to CSV with redacted data. --- lib/pseudonymity/table.rb | 55 +++++++++++++++++++++++++++++++++++++++ lib/tasks/gitlab/db.rake | 54 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 109 insertions(+) create mode 100644 lib/pseudonymity/table.rb diff --git a/lib/pseudonymity/table.rb b/lib/pseudonymity/table.rb new file mode 100644 index 000000000000..ebca445ea3bb --- /dev/null +++ b/lib/pseudonymity/table.rb @@ -0,0 +1,55 @@ +require 'digest' +require 'csv' + +module Pseudonymity + class Anon + def initialize(fields) + @anon_fields = fields + end + + def anonymize(results) + + results.collect! do | r | + new_hash = r.each_with_object({}) do | (k, v), h | + if @anon_fields.include? k + h[k] = Digest::SHA2.new(256).hexdigest v + else + h[k] = v + end + end + new_hash + end + end + end + + class Table + class << self + def table_to_csv(table, whitelist_columns, pseudonymity_columns) + sql = "SELECT #{whitelist_columns.join(",")} from #{table}" + results = ActiveRecord::Base.connection.exec_query(sql) + anon = Anon.new(pseudonymity_columns) + results = anon.anonymize results + write_to_csv_file table, results + end + + def write_to_csv_file(title, contents) + file_path = "/tmp/#{title}" + if contents.empty? + File.open(file_path, "w") {} + return file_path + end + column_names = contents.first.keys + contents = CSV.generate do | csv | + csv << column_names + contents.each do |x| + csv << x.values + end + end + File.open(file_path, 'w') { |file| file.write(contents) } + return file_path + end + + private :write_to_csv_file + end + end +end \ No newline at end of file diff --git a/lib/tasks/gitlab/db.rake b/lib/tasks/gitlab/db.rake index 139ab70e1259..2e490ae5b106 100644 --- a/lib/tasks/gitlab/db.rake +++ b/lib/tasks/gitlab/db.rake @@ -69,5 +69,59 @@ namespace :gitlab do Gitlab::DowntimeCheck.new.check_and_print(migrations) end + + desc 'Output pseudonymity dump of selected table' + task :pseudonymity_dump => :environment do + # issue* tables + # label* tables + # licenses + # merge_request* tables + # milestones + # namespace_statistics + # namespaces + # notes + # notification_settings + # project* tables + # subscriptions + # users + + # REMOVE PRODUCTION INFRA SCRIPT AS PART OF MR> + puts Pseudonymity::Table.table_to_csv("approvals", + ["id","merge_request_id","user_id","created_at","updated_at"], + ["id", "merge_request_id", "user_id"]) + puts Pseudonymity::Table.table_to_csv("approver_groups", + ["id","target_type","group_id","created_at","updated_at"], + ["id","group_id"]) + puts Pseudonymity::Table.table_to_csv("board_assignees", + ["id","board_id","assignee_id"], + ["id","board_id","assignee_id"]) + puts Pseudonymity::Table.table_to_csv("board_labels", + ["id","board_id","label_id"], + ["id","board_id","label_id"]) + puts Pseudonymity::Table.table_to_csv("boards", + ["id","project_id","created_at","updated_at","milestone_id","group_id","weight"], + ["id","project_id","milestone_id","group_id"]) + puts Pseudonymity::Table.table_to_csv("epic_issues", + ["id","epic_id","issue_id","relative_position"], + ["id","epic_id","issue_id"]) + puts Pseudonymity::Table.table_to_csv("epic_metrics", + ["id","epic_id","created_at","updated_at"], + ["id"]) + puts Pseudonymity::Table.table_to_csv("epics", + ["id", "milestone_id", "group_id", "author_id", "assignee_id", "iid", "cached_markdown_version", "updated_by_id", "last_edited_by_id", "lock_version", "start_date", "end_date", "last_edited_at", "created_at", "updated_at", "title", "description"], + ["id", "milestone_id", "group_id", "author_id", "assignee_id", "iid", "cached_markdown_version", "updated_by_id", "last_edited_by_id", "lock_version", "start_date", "end_date", "last_edited_at", "created_at", "updated_at"]) + puts Pseudonymity::Table.table_to_csv("issue_assignees", + ["user_id","issue_id"], + ["user_id","issue_id"]) + puts Pseudonymity::Table.table_to_csv("issue_links", + ["id", "source_id", "target_id", "created_at", "updated_at"], + ["id", "source_id", "target_id"]) + puts Pseudonymity::Table.table_to_csv("issue_metrics", + [], + []) + puts Pseudonymity::Table.table_to_csv("issues", + [], + []) + end end end -- GitLab From 89c97fc8564e401801c06f9deefe86acc9fdce12 Mon Sep 17 00:00:00 2001 From: Jacob Schatz Date: Wed, 2 May 2018 08:35:58 -0400 Subject: [PATCH 02/63] Adds all db tables to export to csv. --- lib/pseudonymity/table.rb | 4 +- lib/tasks/gitlab/db.rake | 111 ++++++++++++++++++++++++++++++++------ 2 files changed, 97 insertions(+), 18 deletions(-) diff --git a/lib/pseudonymity/table.rb b/lib/pseudonymity/table.rb index ebca445ea3bb..9e0faf464de6 100644 --- a/lib/pseudonymity/table.rb +++ b/lib/pseudonymity/table.rb @@ -11,7 +11,7 @@ def anonymize(results) results.collect! do | r | new_hash = r.each_with_object({}) do | (k, v), h | - if @anon_fields.include? k + if @anon_fields.include? k and !v.nil? h[k] = Digest::SHA2.new(256).hexdigest v else h[k] = v @@ -33,7 +33,7 @@ def table_to_csv(table, whitelist_columns, pseudonymity_columns) end def write_to_csv_file(title, contents) - file_path = "/tmp/#{title}" + file_path = "/tmp/#{title}.csv" if contents.empty? File.open(file_path, "w") {} return file_path diff --git a/lib/tasks/gitlab/db.rake b/lib/tasks/gitlab/db.rake index 2e490ae5b106..b48ebdedb6c2 100644 --- a/lib/tasks/gitlab/db.rake +++ b/lib/tasks/gitlab/db.rake @@ -72,18 +72,6 @@ namespace :gitlab do desc 'Output pseudonymity dump of selected table' task :pseudonymity_dump => :environment do - # issue* tables - # label* tables - # licenses - # merge_request* tables - # milestones - # namespace_statistics - # namespaces - # notes - # notification_settings - # project* tables - # subscriptions - # users # REMOVE PRODUCTION INFRA SCRIPT AS PART OF MR> puts Pseudonymity::Table.table_to_csv("approvals", @@ -117,11 +105,102 @@ namespace :gitlab do ["id", "source_id", "target_id", "created_at", "updated_at"], ["id", "source_id", "target_id"]) puts Pseudonymity::Table.table_to_csv("issue_metrics", - [], - []) + ["id","issue_id","first_mentioned_in_commit_at","first_associated_with_milestone_at","first_added_to_board_at","created_at","updated_at"], + ["id","issue_id"]) puts Pseudonymity::Table.table_to_csv("issues", - [], - []) + ["id","title","author_id","project_id","created_at","updated_at","description","milestone_id","state","updated_by_id","weight","due_date","moved_to_id","lock_version","time_estimate","last_edited_at","last_edited_by_id","discussion_locked","closed_at","closed_by_id"], + ["id","title","author_id","project_id","description","milestone_id","state","updated_by_id","moved_to_id","discussion_locked","closed_at"]) + puts Pseudonymity::Table.table_to_csv("label_links", + ["id","label_id","target_id","target_type","created_at","updated_at"], + ["id","label_id","target_id"]) + puts Pseudonymity::Table.table_to_csv("label_priorities", + ["id","project_id","label_id","priority","created_at","updated_at"], + ["id","project_id","label_id"]) + puts Pseudonymity::Table.table_to_csv("labels", + ["id","title","color","project_id","created_at","updated_at","template","type","group_id"], + ["id","title","color","project_id","created_at","updated_at","template","type","group_id"]) + puts Pseudonymity::Table.table_to_csv("licenses", + ["id","created_at","updated_at"], + ["id"]) + puts Pseudonymity::Table.table_to_csv("licenses", + ["id","created_at","updated_at"], + ["id"]) + puts Pseudonymity::Table.table_to_csv("merge_request_diff_commits", + ["authored_date","committed_date","merge_request_diff_id","relative_order","author_name","author_email","committer_name","committer_email"], + ["merge_request_diff_id","author_name","author_email","committer_name","committer_email"]) + puts Pseudonymity::Table.table_to_csv("merge_request_diff_files", + ["merge_request_diff_id","relative_order","new_file","renamed_file","deleted_file","too_large","a_mode","b_mode"], + ["merge_request_diff_id"]) + puts Pseudonymity::Table.table_to_csv("merge_request_diffs", + ["id","state","merge_request_id","created_at","updated_at","base_commit_sha","real_size","head_commit_sha","start_commit_sha","commits_count"], + ["id","merge_request_id","base_commit_sha","head_commit_sha","start_commit_sha"]) + puts Pseudonymity::Table.table_to_csv("merge_request_metrics", + ["id","merge_request_id","latest_build_started_at","latest_build_finished_at","first_deployed_to_production_at","merged_at","created_at","updated_at","pipeline_id","merged_by_id","latest_closed_by_id","latest_closed_at"], + ["id","merge_request_id","pipeline_id","merged_by_id","latest_closed_by_id"]) + puts Pseudonymity::Table.table_to_csv("merge_requests", + ["id","target_branch","source_branch","source_project_id","author_id","assignee_id","created_at","updated_at","milestone_id","state","merge_status","target_project_id","updated_by_id","merge_error","merge_params","merge_when_pipeline_succeeds","merge_user_id","approvals_before_merge","lock_version","time_estimate","squash","last_edited_at","last_edited_by_id","head_pipeline_id","discussion_locked","latest_merge_request_diff_id","allow_maintainer_to_push"], + ["id","target_branch","source_branch","source_project_id","author_id","assignee_id","milestone_id","target_project_id","updated_by_id","merge_user_id","last_edited_by_id","head_pipeline_id","latest_merge_request_diff_id"]) + puts Pseudonymity::Table.table_to_csv("merge_requests_closing_issues", + ["id","merge_request_id","issue_id","created_at","updated_at"], + ["id","merge_request_id","issue_id"]) + puts Pseudonymity::Table.table_to_csv("milestones", + ["id","project_id","due_date","created_at","updated_at","state","start_date","group_id"], + ["id","project_id","group_id"]) + + puts Pseudonymity::Table.table_to_csv("namespace_statistics", + ["id","namespace_id" ,"shared_runners_seconds","shared_runners_seconds_last_reset"], + ["id","namespace_id" ,"shared_runners_seconds","shared_runners_seconds_last_reset"]) + puts Pseudonymity::Table.table_to_csv("namespaces", + ["id","name","path","owner_id","created_at","updated_at","type","description","avatar","membership_lock","share_with_group_lock","visibility_level","request_access_enabled","ldap_sync_status","ldap_sync_error","ldap_sync_last_update_at","ldap_sync_last_successful_update_at","ldap_sync_last_sync_at","description_html","lfs_enabled","parent_id","shared_runners_minutes_limit","repository_size_limit","require_two_factor_authentication","two_factor_grace_period","cached_markdown_version","plan_id","project_creation_level"], + ["id","name","path","owner_id","created_at","updated_at","type","description","avatar","membership_lock","share_with_group_lock","visibility_level","request_access_enabled","ldap_sync_status","ldap_sync_error","ldap_sync_last_update_at","ldap_sync_last_successful_update_at","ldap_sync_last_sync_at","description_html","lfs_enabled","parent_id","shared_runners_minutes_limit","repository_size_limit","require_two_factor_authentication","two_factor_grace_period","cached_markdown_version","plan_id","project_creation_level"]) + puts Pseudonymity::Table.table_to_csv("notes", + ["id","note","noteable_type","author_id","created_at","updated_at","project_id","attachment","line_code","commit_id","noteable_id","system","st_diff","updated_by_id","type","position","original_position","resolved_at","resolved_by_id","discussion_id","note_html","cached_markdown_version","change_position","resolved_by_push"], + ["id","note","noteable_type","author_id","created_at","updated_at","project_id","attachment","line_code","commit_id","noteable_id","system","st_diff","updated_by_id","type","position","original_position","resolved_at","resolved_by_id","discussion_id","note_html","cached_markdown_version","change_position","resolved_by_push"]) + puts Pseudonymity::Table.table_to_csv("notification_settings", + ["id","user_id","source_id","source_type","level","created_at","updated_at","new_note","new_issue","reopen_issue","close_issue","reassign_issue","new_merge_request","reopen_merge_request","close_merge_request","reassign_merge_request","merge_merge_request","failed_pipeline","success_pipeline","push_to_merge_request","issue_due"], + ["id","user_id","source_id","source_type","level","created_at","updated_at","new_note","new_issue","reopen_issue","close_issue","reassign_issue","new_merge_request","reopen_merge_request","close_merge_request","reassign_merge_request","merge_merge_request","failed_pipeline","success_pipeline","push_to_merge_request","issue_due"]) + puts Pseudonymity::Table.table_to_csv("project_authorizations", + ["user_id","project_id","access_level"], + ["user_id","project_id","access_level"]) + puts Pseudonymity::Table.table_to_csv("project_auto_devops", + ["id","project_id","created_at","updated_at","enabled","domain"], + ["id","project_id","created_at","updated_at","enabled","domain"]) + puts Pseudonymity::Table.table_to_csv("project_ci_cd_settings", + ["id","project_id","group_runners_enabled"], + ["id","project_id","group_runners_enabled"]) + puts Pseudonymity::Table.table_to_csv("project_custom_attributes", + ["id","created_at","updated_at","project_id","key","value"], + ["id","created_at","updated_at","project_id","key","value"]) + puts Pseudonymity::Table.table_to_csv("project_deploy_tokens", + ["id","project_id","deploy_token_id","created_at"], + ["id","project_id","deploy_token_id","created_at"]) + puts Pseudonymity::Table.table_to_csv("project_features", + ["id","project_id","merge_requests_access_level","issues_access_level","wiki_access_level","snippets_access_level","builds_access_level","created_at","updated_at","repository_access_level"], + ["id","project_id","merge_requests_access_level","issues_access_level","wiki_access_level","snippets_access_level","builds_access_level","created_at","updated_at","repository_access_level"]) + puts Pseudonymity::Table.table_to_csv("project_group_links", + ["id","project_id","group_id","created_at","updated_at","group_access","expires_at"], + ["id","project_id","group_id","created_at","updated_at","group_access","expires_at"]) + puts Pseudonymity::Table.table_to_csv("project_import_data", + ["id","project_id","data","encrypted_credentials","encrypted_credentials_iv","encrypted_credentials_salt"], + ["id","project_id","data","encrypted_credentials","encrypted_credentials_iv","encrypted_credentials_salt"]) + puts Pseudonymity::Table.table_to_csv("project_mirror_data", + ["id","project_id","retry_count","last_update_started_at","last_update_scheduled_at","next_execution_timestamp","created_at","updated_at"], + ["id","project_id","retry_count","last_update_started_at","last_update_scheduled_at","next_execution_timestamp","created_at","updated_at"]) + puts Pseudonymity::Table.table_to_csv("project_repository_states", + ["id","project_id","repository_verification_checksum","wiki_verification_checksum","last_repository_verification_failure","last_wiki_verification_failure"], + ["id","project_id","repository_verification_checksum","wiki_verification_checksum","last_repository_verification_failure","last_wiki_verification_failure"]) + puts Pseudonymity::Table.table_to_csv("project_statistics", + ["id","project_id","namespace_id","commit_count","storage_size","repository_size","lfs_objects_size","build_artifacts_size","shared_runners_seconds","shared_runners_seconds_last_reset"], + ["id","project_id","namespace_id","commit_count","storage_size","repository_size","lfs_objects_size","build_artifacts_size","shared_runners_seconds","shared_runners_seconds_last_reset"]) + puts Pseudonymity::Table.table_to_csv("projects", + ["id","name","path","description","created_at","updated_at","creator_id","namespace_id","last_activity_at","import_url","visibility_level","archived","avatar","import_status","merge_requests_template","star_count","merge_requests_rebase_enabled","import_type","import_source","approvals_before_merge","reset_approvals_on_push","merge_requests_ff_only_enabled","issues_template","mirror","mirror_last_update_at","mirror_last_successful_update_at","mirror_user_id","import_error","ci_id","shared_runners_enabled","runners_token","build_coverage_regex","build_allow_git_fetch","build_timeout","mirror_trigger_builds","pending_delete","public_builds","last_repository_check_failed","last_repository_check_at","container_registry_enabled","only_allow_merge_if_pipeline_succeeds","has_external_issue_tracker","repository_storage","repository_read_only","request_access_enabled","has_external_wiki","ci_config_path","lfs_enabled","description_html","only_allow_merge_if_all_discussions_are_resolved","repository_size_limit","printing_merge_request_link_enabled","auto_cancel_pending_pipelines","service_desk_enabled","import_jid","cached_markdown_version","delete_error","last_repository_updated_at","disable_overriding_approvers_per_merge_request","storage_version","resolve_outdated_diff_discussions","remote_mirror_available_overridden","only_mirror_protected_branches","pull_mirror_available_overridden","jobs_cache_index","mirror_overwrites_diverged_branches","external_authorization_classification_label","external_webhook_token","pages_https_only"], + ["id","name","path","description","created_at","updated_at","creator_id","namespace_id","last_activity_at","import_url","visibility_level","archived","avatar","import_status","merge_requests_template","star_count","merge_requests_rebase_enabled","import_type","import_source","approvals_before_merge","reset_approvals_on_push","merge_requests_ff_only_enabled","issues_template","mirror","mirror_last_update_at","mirror_last_successful_update_at","mirror_user_id","import_error","ci_id","shared_runners_enabled","runners_token","build_coverage_regex","build_allow_git_fetch","build_timeout","mirror_trigger_builds","pending_delete","public_builds","last_repository_check_failed","last_repository_check_at","container_registry_enabled","only_allow_merge_if_pipeline_succeeds","has_external_issue_tracker","repository_storage","repository_read_only","request_access_enabled","has_external_wiki","ci_config_path","lfs_enabled","description_html","only_allow_merge_if_all_discussions_are_resolved","repository_size_limit","printing_merge_request_link_enabled","auto_cancel_pending_pipelines","service_desk_enabled","import_jid","cached_markdown_version","delete_error","last_repository_updated_at","disable_overriding_approvers_per_merge_request","storage_version","resolve_outdated_diff_discussions","remote_mirror_available_overridden","only_mirror_protected_branches","pull_mirror_available_overridden","jobs_cache_index","mirror_overwrites_diverged_branches","external_authorization_classification_label","external_webhook_token","pages_https_only"]) + puts Pseudonymity::Table.table_to_csv("subscriptions", + ["id","user_id","subscribable_id","subscribable_type","subscribed","created_at","updated_at","project_id"], + ["id","user_id","subscribable_id","subscribable_type","subscribed","created_at","updated_at","project_id"]) + puts Pseudonymity::Table.table_to_csv("users", + ["id","email","encrypted_password","reset_password_token","reset_password_sent_at","remember_created_at","sign_in_count","current_sign_in_at","last_sign_in_at","current_sign_in_ip","last_sign_in_ip","created_at","updated_at","name","admin","projects_limit","skype","linkedin","twitter","bio","failed_attempts","locked_at","username","can_create_group","can_create_team","state","color_scheme_id","password_expires_at","created_by_id","last_credential_check_at","avatar","confirmation_token","confirmed_at","confirmation_sent_at","unconfirmed_email","hide_no_ssh_key","website_url","admin_email_unsubscribed_at","notification_email","hide_no_password","password_automatically_set","location","encrypted_otp_secret","encrypted_otp_secret_iv","encrypted_otp_secret_salt","otp_required_for_login","otp_backup_codes","public_email","dashboard","project_view","consumed_timestep","layout","hide_project_limit","note","unlock_token","otp_grace_period_started_at","external","incoming_email_token","organization","auditor","require_two_factor_authentication_from_group","two_factor_grace_period","ghost","last_activity_on","notified_of_own_activity","support_bot","preferred_language","rss_token","email_opted_in","email_opted_in_ip","email_opted_in_source_id","email_opted_in_at","theme_id"], + ["id","email","encrypted_password","reset_password_token","reset_password_sent_at","remember_created_at","sign_in_count","current_sign_in_at","last_sign_in_at","current_sign_in_ip","last_sign_in_ip","created_at","updated_at","name","admin","projects_limit","skype","linkedin","twitter","bio","failed_attempts","locked_at","username","can_create_group","can_create_team","state","color_scheme_id","password_expires_at","created_by_id","last_credential_check_at","avatar","confirmation_token","confirmed_at","confirmation_sent_at","unconfirmed_email","hide_no_ssh_key","website_url","admin_email_unsubscribed_at","notification_email","hide_no_password","password_automatically_set","location","encrypted_otp_secret","encrypted_otp_secret_iv","encrypted_otp_secret_salt","otp_required_for_login","otp_backup_codes","public_email","dashboard","project_view","consumed_timestep","layout","hide_project_limit","note","unlock_token","otp_grace_period_started_at","external","incoming_email_token","organization","auditor","require_two_factor_authentication_from_group","two_factor_grace_period","ghost","last_activity_on","notified_of_own_activity","support_bot","preferred_language","rss_token","email_opted_in","email_opted_in_ip","email_opted_in_source_id","email_opted_in_at","theme_id"]) end end end -- GitLab From 0316ea13cae6f9eac63ca884e1683dd5a7def4fa Mon Sep 17 00:00:00 2001 From: Jacob Schatz Date: Wed, 2 May 2018 16:27:37 -0400 Subject: [PATCH 03/63] Add yml file instead of config. --- lib/assets/pseudonymity_dump.yml | 937 +++++++++++++++++++++++++++++++ lib/pseudonymity/table.rb | 72 ++- lib/tasks/gitlab/db.rake | 130 +---- 3 files changed, 986 insertions(+), 153 deletions(-) create mode 100644 lib/assets/pseudonymity_dump.yml diff --git a/lib/assets/pseudonymity_dump.yml b/lib/assets/pseudonymity_dump.yml new file mode 100644 index 000000000000..6f542920aa64 --- /dev/null +++ b/lib/assets/pseudonymity_dump.yml @@ -0,0 +1,937 @@ +tables: + approvals: + whitelist: + - id + - merge_request_id + - user_id + - created_at + - updated_at + anon: + - id + - merge_request_id + - user_id + approver_groups: + whitelist: + - id + - target_type + - group_id + - created_at + - updated_at + anon: + - id + - group_id + board_assignees: + whitelist: + - id + - board_id + - assignee_id + anon: + - id + - board_id + - assignee_id + board_labels: + whitelist: + - id + - board_id + - label_id + anon: + - id + - board_id + - label_id + boards: + whitelist: + - id + - project_id + - created_at + - updated_at + - milestone_id + - group_id + - weight + anon: + - id + - project_id + - milestone_id + - group_id + epic_issues: + whitelist: + - id + - epic_id + - issue_id + - relative_position + anon: + - id + - epic_id + - issue_id + epic_metrics: + whitelist: + - id + - epic_id + - created_at + - updated_at + anon: + - id + epics: + whitelist: + - id + - milestone_id + - group_id + - author_id + - assignee_id + - iid + - cached_markdown_version + - updated_by_id + - last_edited_by_id + - lock_version + - start_date + - end_date + - last_edited_at + - created_at + - updated_at + - title + - description + anon: + - id + - milestone_id + - group_id + - author_id + - assignee_id + - iid + - cached_markdown_version + - updated_by_id + - last_edited_by_id + - lock_version + - start_date + - end_date + - last_edited_at + - created_at + - updated_at + issue_assignees: + whitelist: + - user_id + - issue_id + anon: + - user_id + - issue_id + issue_links: + whitelist: + - id + - source_id + - target_id + - created_at + - updated_at + anon: + - id + - source_id + - target_id + issue_metrics: + whitelist: + - id + - issue_id + - first_mentioned_in_commit_at + - first_associated_with_milestone_at + - first_added_to_board_at + - created_at + - updated_at + anon: + - id + - issue_id + issues: + whitelist: + - id + - title + - author_id + - project_id + - created_at + - updated_at + - description + - milestone_id + - state + - updated_by_id + - weight + - due_date + - moved_to_id + - lock_version + - time_estimate + - last_edited_at + - last_edited_by_id + - discussion_locked + - closed_at + - closed_by_id + anon: + - id + - title + - author_id + - project_id + - description + - milestone_id + - state + - updated_by_id + - moved_to_id + - discussion_locked + - closed_at + label_links: + whitelist: + - id + - label_id + - target_id + - target_type + - created_at + - updated_at + anon: + - id + - label_id + - target_id + label_priorities: + whitelist: + - id + - project_id + - label_id + - priority + - created_at + - updated_at + anon: + - id + - project_id + - label_id + labels: + whitelist: + - id + - title + - color + - project_id + - created_at + - updated_at + - template + - type + - group_id + anon: + - id + - title + - color + - project_id + - created_at + - updated_at + - template + - type + - group_id + licenses: + whitelist: + - id + - created_at + - updated_at + anon: + - id + merge_request_diff_commits: + whitelist: + - authored_date + - committed_date + - merge_request_diff_id + - relative_order + - author_name + - author_email + - committer_name + - committer_email + anon: + - merge_request_diff_id + - author_name + - author_email + - committer_name + - committer_email + merge_request_diff_files: + whitelist: + - merge_request_diff_id + - relative_order + - new_file + - renamed_file + - deleted_file + - too_large + - a_mode + - b_mode + anon: + - merge_request_diff_id + merge_request_diffs: + whitelist: + - id + - state + - merge_request_id + - created_at + - updated_at + - base_commit_sha + - real_size + - head_commit_sha + - start_commit_sha + - commits_count + anon: + - id + - merge_request_id + - base_commit_sha + - head_commit_sha + - start_commit_sha + merge_request_metrics: + whitelist: + - id + - merge_request_id + - latest_build_started_at + - latest_build_finished_at + - first_deployed_to_production_at + - merged_at + - created_at + - updated_at + - pipeline_id + - merged_by_id + - latest_closed_by_id + - latest_closed_at + anon: + - id + - merge_request_id + - pipeline_id + - merged_by_id + - latest_closed_by_id + merge_requests: + whitelist: + - id + - target_branch + - source_branch + - source_project_id + - author_id + - assignee_id + - created_at + - updated_at + - milestone_id + - state + - merge_status + - target_project_id + - updated_by_id + - merge_error + - merge_params + - merge_when_pipeline_succeeds + - merge_user_id + - approvals_before_merge + - lock_version + - time_estimate + - squash + - last_edited_at + - last_edited_by_id + - head_pipeline_id + - discussion_locked + - latest_merge_request_diff_id + - allow_maintainer_to_push + anon: + - id + - target_branch + - source_branch + - source_project_id + - author_id + - assignee_id + - milestone_id + - target_project_id + - updated_by_id + - merge_user_id + - last_edited_by_id + - head_pipeline_id + - latest_merge_request_diff_id + merge_requests_closing_issues: + whitelist: + - id + - merge_request_id + - issue_id + - created_at + - updated_at + anon: + - id + - merge_request_id + - issue_id + milestones: + whitelist: + - id + - project_id + - due_date + - created_at + - updated_at + - state + - start_date + - group_id + anon: + - id + - project_id + - group_id + namespace_statistics: + whitelist: + - id + - namespace_id + - shared_runners_seconds + - shared_runners_seconds_last_reset + anon: + - id + - namespace_id + namespaces: + whitelist: + - id + - name + - path + - owner_id + - created_at + - updated_at + - type + - avatar + - membership_lock + - share_with_group_lock + - visibility_level + - request_access_enabled + - ldap_sync_status + - ldap_sync_error + - ldap_sync_last_update_at + - ldap_sync_last_successful_update_at + - ldap_sync_last_sync_at + - lfs_enabled + - parent_id + - shared_runners_minutes_limit + - repository_size_limit + - require_two_factor_authentication + - two_factor_grace_period + - plan_id + - project_creation_level + anon: + - id + - name + - path + - owner_id + - type + - avatar + - membership_lock + - share_with_group_lock + - visibility_level + - request_access_enabled + - ldap_sync_status + - ldap_sync_error + - ldap_sync_last_update_at + - ldap_sync_last_successful_update_at + - ldap_sync_last_sync_at + - lfs_enabled + - parent_id + - shared_runners_minutes_limit + - repository_size_limit + - require_two_factor_authentication + - two_factor_grace_period + - plan_id + - project_creation_level + notes: + whitelist: + - id + - note + - noteable_type + - author_id + - created_at + - updated_at + - project_id + - attachment + - line_code + - commit_id + - noteable_id + - system + - st_diff + - updated_by_id + - type + - position + - original_position + - resolved_at + - resolved_by_id + - discussion_id + - note_html + - cached_markdown_version + - change_position + - resolved_by_push + anon: + - id + - note + - noteable_type + - author_id + - created_at + - updated_at + - project_id + - attachment + - line_code + - commit_id + - noteable_id + - system + - st_diff + - updated_by_id + - type + - position + - original_position + - resolved_at + - resolved_by_id + - discussion_id + - note_html + - cached_markdown_version + - change_position + - resolved_by_push + notification_settings: + whitelist: + - id + - user_id + - source_id + - source_type + - level + - created_at + - updated_at + - new_note + - new_issue + - reopen_issue + - close_issue + - reassign_issue + - new_merge_request + - reopen_merge_request + - close_merge_request + - reassign_merge_request + - merge_merge_request + - failed_pipeline + - success_pipeline + - push_to_merge_request + - issue_due + anon: + - id + - user_id + - source_id + - source_type + - level + - created_at + - updated_at + - new_note + - new_issue + - reopen_issue + - close_issue + - reassign_issue + - new_merge_request + - reopen_merge_request + - close_merge_request + - reassign_merge_request + - merge_merge_request + - failed_pipeline + - success_pipeline + - push_to_merge_request + - issue_due + project_authorizations: + whitelist: + - user_id + - project_id + - access_level + anon: + - user_id + - project_id + - access_level + project_auto_devops: + whitelist: + - id + - project_id + - created_at + - updated_at + - enabled + - domain + anon: + - id + - project_id + - created_at + - updated_at + - enabled + - domain + project_ci_cd_settings: + whitelist: + - id + - project_id + - group_runners_enabled + anon: + - id + - project_id + - group_runners_enabled + project_custom_attributes: + whitelist: + - id + - created_at + - updated_at + - project_id + - key + - value + anon: + - id + - created_at + - updated_at + - project_id + - key + - value + project_deploy_tokens: + whitelist: + - id + - project_id + - deploy_token_id + - created_at + anon: + - id + - project_id + - deploy_token_id + - created_at + project_features: + whitelist: + - id + - project_id + - merge_requests_access_level + - issues_access_level + - wiki_access_level + - snippets_access_level + - builds_access_level + - created_at + - updated_at + - repository_access_level + anon: + - id + - project_id + - merge_requests_access_level + - issues_access_level + - wiki_access_level + - snippets_access_level + - builds_access_level + - created_at + - updated_at + - repository_access_level + project_group_links: + whitelist: + - id + - project_id + - group_id + - created_at + - updated_at + - group_access + - expires_at + anon: + - id + - project_id + - group_id + - created_at + - updated_at + - group_access + - expires_at + project_import_data: + whitelist: + - id + - project_id + - data + - encrypted_credentials + - encrypted_credentials_iv + - encrypted_credentials_salt + anon: + - id + - project_id + - data + - encrypted_credentials + - encrypted_credentials_iv + - encrypted_credentials_salt + project_mirror_data: + whitelist: + - id + - project_id + - retry_count + - last_update_started_at + - last_update_scheduled_at + - next_execution_timestamp + - created_at + - updated_at + anon: + - id + - project_id + - retry_count + - last_update_started_at + - last_update_scheduled_at + - next_execution_timestamp + - created_at + - updated_at + project_repository_states: + whitelist: + - id + - project_id + - repository_verification_checksum + - wiki_verification_checksum + - last_repository_verification_failure + - last_wiki_verification_failure + anon: + - id + - project_id + - repository_verification_checksum + - wiki_verification_checksum + - last_repository_verification_failure + - last_wiki_verification_failure + project_statistics: + whitelist: + - id + - project_id + - namespace_id + - commit_count + - storage_size + - repository_size + - lfs_objects_size + - build_artifacts_size + - shared_runners_seconds + - shared_runners_seconds_last_reset + anon: + - id + - project_id + - namespace_id + - commit_count + - storage_size + - repository_size + - lfs_objects_size + - build_artifacts_size + - shared_runners_seconds + - shared_runners_seconds_last_reset + projects: + whitelist: + - id + - name + - path + - description + - created_at + - updated_at + - creator_id + - namespace_id + - last_activity_at + - import_url + - visibility_level + - archived + - avatar + - import_status + - merge_requests_template + - star_count + - merge_requests_rebase_enabled + - import_type + - import_source + - approvals_before_merge + - reset_approvals_on_push + - merge_requests_ff_only_enabled + - issues_template + - mirror + - mirror_last_update_at + - mirror_last_successful_update_at + - mirror_user_id + - import_error + - ci_id + - shared_runners_enabled + - runners_token + - build_coverage_regex + - build_allow_git_fetch + - build_timeout + - mirror_trigger_builds + - pending_delete + - public_builds + - last_repository_check_failed + - last_repository_check_at + - container_registry_enabled + - only_allow_merge_if_pipeline_succeeds + - has_external_issue_tracker + - repository_storage + - repository_read_only + - request_access_enabled + - has_external_wiki + - ci_config_path + - lfs_enabled + - description_html + - only_allow_merge_if_all_discussions_are_resolved + - repository_size_limit + - printing_merge_request_link_enabled + - auto_cancel_pending_pipelines + - service_desk_enabled + - import_jid + - cached_markdown_version + - delete_error + - last_repository_updated_at + - disable_overriding_approvers_per_merge_request + - storage_version + - resolve_outdated_diff_discussions + - remote_mirror_available_overridden + - only_mirror_protected_branches + - pull_mirror_available_overridden + - jobs_cache_index + - mirror_overwrites_diverged_branches + - external_authorization_classification_label + - external_webhook_token + - pages_https_only + anon: + - id + - name + - path + - description + - created_at + - updated_at + - creator_id + - namespace_id + - last_activity_at + - import_url + - visibility_level + - archived + - avatar + - import_status + - merge_requests_template + - star_count + - merge_requests_rebase_enabled + - import_type + - import_source + - approvals_before_merge + - reset_approvals_on_push + - merge_requests_ff_only_enabled + - issues_template + - mirror + - mirror_last_update_at + - mirror_last_successful_update_at + - mirror_user_id + - import_error + - ci_id + - shared_runners_enabled + - runners_token + - build_coverage_regex + - build_allow_git_fetch + - build_timeout + - mirror_trigger_builds + - pending_delete + - public_builds + - last_repository_check_failed + - last_repository_check_at + - container_registry_enabled + - only_allow_merge_if_pipeline_succeeds + - has_external_issue_tracker + - repository_storage + - repository_read_only + - request_access_enabled + - has_external_wiki + - ci_config_path + - lfs_enabled + - description_html + - only_allow_merge_if_all_discussions_are_resolved + - repository_size_limit + - printing_merge_request_link_enabled + - auto_cancel_pending_pipelines + - service_desk_enabled + - import_jid + - cached_markdown_version + - delete_error + - last_repository_updated_at + - disable_overriding_approvers_per_merge_request + - storage_version + - resolve_outdated_diff_discussions + - remote_mirror_available_overridden + - only_mirror_protected_branches + - pull_mirror_available_overridden + - jobs_cache_index + - mirror_overwrites_diverged_branches + - external_authorization_classification_label + - external_webhook_token + - pages_https_only + subscriptions: + whitelist: + - id + - user_id + - subscribable_id + - subscribable_type + - subscribed + - created_at + - updated_at + - project_id + anon: + - id + - user_id + - subscribable_id + - project_id + users: + whitelist: + - id + - email + - remember_created_at + - sign_in_count + - current_sign_in_at + - last_sign_in_at + - current_sign_in_ip + - last_sign_in_ip + - created_at + - updated_at + - name + - admin + - projects_limit + - skype + - linkedin + - twitter + - bio + - failed_attempts + - locked_at + - username + - can_create_group + - can_create_team + - state + - color_scheme_id + - password_expires_at + - created_by_id + - last_credential_check_at + - avatar + - confirmation_token + - confirmed_at + - confirmation_sent_at + - unconfirmed_email + - hide_no_ssh_key + - website_url + - admin_email_unsubscribed_at + - notification_email + - hide_no_password + - password_automatically_set + - location + - public_email + - dashboard + - project_view + - consumed_timestep + - layout + - hide_project_limit + - note + - unlock_token + - otp_grace_period_started_at + - external + - incoming_email_token + - organization + - auditor + - require_two_factor_authentication_from_group + - two_factor_grace_period + - ghost + - last_activity_on + - notified_of_own_activity + - support_bot + - preferred_language + - rss_token + - theme_id + anon: + - id + - email + - remember_created_at + - current_sign_in_ip + - last_sign_in_ip + - name + - admin + - skype + - linkedin + - twitter + - username + - created_by_id + - avatar + - confirmation_token + - unconfirmed_email + - hide_no_ssh_key + - website_url + - notification_email + - location + - public_email + - consumed_timestep + - hide_project_limit + - note + - unlock_token + - otp_grace_period_started_at + - external + - incoming_email_token + - organization + - auditor + - two_factor_grace_period + - ghost + - rss_token + - theme_id \ No newline at end of file diff --git a/lib/pseudonymity/table.rb b/lib/pseudonymity/table.rb index 9e0faf464de6..d5e9d8274aba 100644 --- a/lib/pseudonymity/table.rb +++ b/lib/pseudonymity/table.rb @@ -1,5 +1,6 @@ require 'digest' require 'csv' +require 'yaml' module Pseudonymity class Anon @@ -8,8 +9,12 @@ def initialize(fields) end def anonymize(results) - - results.collect! do | r | + count = 0 + results.each do | r | + break if count > 0 + count += 1 + puts r.inspect + puts @anon_fields new_hash = r.each_with_object({}) do | (k, v), h | if @anon_fields.include? k and !v.nil? h[k] = Digest::SHA2.new(256).hexdigest v @@ -23,33 +28,50 @@ def anonymize(results) end class Table - class << self - def table_to_csv(table, whitelist_columns, pseudonymity_columns) - sql = "SELECT #{whitelist_columns.join(",")} from #{table}" - results = ActiveRecord::Base.connection.exec_query(sql) - anon = Anon.new(pseudonymity_columns) - results = anon.anonymize results - write_to_csv_file table, results + + config = {} + + def initialize + parse_config + end + + def tables_to_csv + tables = @config["tables"] + + tables.map do | k, v | + table_to_csv(k, v["whitelist"], v["anon"]) end + end - def write_to_csv_file(title, contents) - file_path = "/tmp/#{title}.csv" - if contents.empty? - File.open(file_path, "w") {} - return file_path - end - column_names = contents.first.keys - contents = CSV.generate do | csv | - csv << column_names - contents.each do |x| - csv << x.values - end - end - File.open(file_path, 'w') { |file| file.write(contents) } + def table_to_csv(table, whitelist_columns, pseudonymity_columns) + sql = "SELECT #{whitelist_columns.join(",")} from #{table}" + results = ActiveRecord::Base.connection.exec_query(sql) + anon = Anon.new(pseudonymity_columns) + results = anon.anonymize results + write_to_csv_file table, results + end + + def parse_config + @config = YAML.load_file('./lib/assets/pseudonymity_dump.yml') + end + + def write_to_csv_file(title, contents) + file_path = "/tmp/#{title}.csv" + if contents.empty? + File.open(file_path, "w") {} return file_path end - - private :write_to_csv_file + column_names = contents.first.keys + contents = CSV.generate do | csv | + csv << column_names + contents.each do |x| + csv << x.values + end + end + File.open(file_path, 'w') { |file| file.write(contents) } + return file_path end + + private :write_to_csv_file end end \ No newline at end of file diff --git a/lib/tasks/gitlab/db.rake b/lib/tasks/gitlab/db.rake index b48ebdedb6c2..a8468065e3e8 100644 --- a/lib/tasks/gitlab/db.rake +++ b/lib/tasks/gitlab/db.rake @@ -72,135 +72,9 @@ namespace :gitlab do desc 'Output pseudonymity dump of selected table' task :pseudonymity_dump => :environment do - + table = Pseudonymity::Table.new # REMOVE PRODUCTION INFRA SCRIPT AS PART OF MR> - puts Pseudonymity::Table.table_to_csv("approvals", - ["id","merge_request_id","user_id","created_at","updated_at"], - ["id", "merge_request_id", "user_id"]) - puts Pseudonymity::Table.table_to_csv("approver_groups", - ["id","target_type","group_id","created_at","updated_at"], - ["id","group_id"]) - puts Pseudonymity::Table.table_to_csv("board_assignees", - ["id","board_id","assignee_id"], - ["id","board_id","assignee_id"]) - puts Pseudonymity::Table.table_to_csv("board_labels", - ["id","board_id","label_id"], - ["id","board_id","label_id"]) - puts Pseudonymity::Table.table_to_csv("boards", - ["id","project_id","created_at","updated_at","milestone_id","group_id","weight"], - ["id","project_id","milestone_id","group_id"]) - puts Pseudonymity::Table.table_to_csv("epic_issues", - ["id","epic_id","issue_id","relative_position"], - ["id","epic_id","issue_id"]) - puts Pseudonymity::Table.table_to_csv("epic_metrics", - ["id","epic_id","created_at","updated_at"], - ["id"]) - puts Pseudonymity::Table.table_to_csv("epics", - ["id", "milestone_id", "group_id", "author_id", "assignee_id", "iid", "cached_markdown_version", "updated_by_id", "last_edited_by_id", "lock_version", "start_date", "end_date", "last_edited_at", "created_at", "updated_at", "title", "description"], - ["id", "milestone_id", "group_id", "author_id", "assignee_id", "iid", "cached_markdown_version", "updated_by_id", "last_edited_by_id", "lock_version", "start_date", "end_date", "last_edited_at", "created_at", "updated_at"]) - puts Pseudonymity::Table.table_to_csv("issue_assignees", - ["user_id","issue_id"], - ["user_id","issue_id"]) - puts Pseudonymity::Table.table_to_csv("issue_links", - ["id", "source_id", "target_id", "created_at", "updated_at"], - ["id", "source_id", "target_id"]) - puts Pseudonymity::Table.table_to_csv("issue_metrics", - ["id","issue_id","first_mentioned_in_commit_at","first_associated_with_milestone_at","first_added_to_board_at","created_at","updated_at"], - ["id","issue_id"]) - puts Pseudonymity::Table.table_to_csv("issues", - ["id","title","author_id","project_id","created_at","updated_at","description","milestone_id","state","updated_by_id","weight","due_date","moved_to_id","lock_version","time_estimate","last_edited_at","last_edited_by_id","discussion_locked","closed_at","closed_by_id"], - ["id","title","author_id","project_id","description","milestone_id","state","updated_by_id","moved_to_id","discussion_locked","closed_at"]) - puts Pseudonymity::Table.table_to_csv("label_links", - ["id","label_id","target_id","target_type","created_at","updated_at"], - ["id","label_id","target_id"]) - puts Pseudonymity::Table.table_to_csv("label_priorities", - ["id","project_id","label_id","priority","created_at","updated_at"], - ["id","project_id","label_id"]) - puts Pseudonymity::Table.table_to_csv("labels", - ["id","title","color","project_id","created_at","updated_at","template","type","group_id"], - ["id","title","color","project_id","created_at","updated_at","template","type","group_id"]) - puts Pseudonymity::Table.table_to_csv("licenses", - ["id","created_at","updated_at"], - ["id"]) - puts Pseudonymity::Table.table_to_csv("licenses", - ["id","created_at","updated_at"], - ["id"]) - puts Pseudonymity::Table.table_to_csv("merge_request_diff_commits", - ["authored_date","committed_date","merge_request_diff_id","relative_order","author_name","author_email","committer_name","committer_email"], - ["merge_request_diff_id","author_name","author_email","committer_name","committer_email"]) - puts Pseudonymity::Table.table_to_csv("merge_request_diff_files", - ["merge_request_diff_id","relative_order","new_file","renamed_file","deleted_file","too_large","a_mode","b_mode"], - ["merge_request_diff_id"]) - puts Pseudonymity::Table.table_to_csv("merge_request_diffs", - ["id","state","merge_request_id","created_at","updated_at","base_commit_sha","real_size","head_commit_sha","start_commit_sha","commits_count"], - ["id","merge_request_id","base_commit_sha","head_commit_sha","start_commit_sha"]) - puts Pseudonymity::Table.table_to_csv("merge_request_metrics", - ["id","merge_request_id","latest_build_started_at","latest_build_finished_at","first_deployed_to_production_at","merged_at","created_at","updated_at","pipeline_id","merged_by_id","latest_closed_by_id","latest_closed_at"], - ["id","merge_request_id","pipeline_id","merged_by_id","latest_closed_by_id"]) - puts Pseudonymity::Table.table_to_csv("merge_requests", - ["id","target_branch","source_branch","source_project_id","author_id","assignee_id","created_at","updated_at","milestone_id","state","merge_status","target_project_id","updated_by_id","merge_error","merge_params","merge_when_pipeline_succeeds","merge_user_id","approvals_before_merge","lock_version","time_estimate","squash","last_edited_at","last_edited_by_id","head_pipeline_id","discussion_locked","latest_merge_request_diff_id","allow_maintainer_to_push"], - ["id","target_branch","source_branch","source_project_id","author_id","assignee_id","milestone_id","target_project_id","updated_by_id","merge_user_id","last_edited_by_id","head_pipeline_id","latest_merge_request_diff_id"]) - puts Pseudonymity::Table.table_to_csv("merge_requests_closing_issues", - ["id","merge_request_id","issue_id","created_at","updated_at"], - ["id","merge_request_id","issue_id"]) - puts Pseudonymity::Table.table_to_csv("milestones", - ["id","project_id","due_date","created_at","updated_at","state","start_date","group_id"], - ["id","project_id","group_id"]) - - puts Pseudonymity::Table.table_to_csv("namespace_statistics", - ["id","namespace_id" ,"shared_runners_seconds","shared_runners_seconds_last_reset"], - ["id","namespace_id" ,"shared_runners_seconds","shared_runners_seconds_last_reset"]) - puts Pseudonymity::Table.table_to_csv("namespaces", - ["id","name","path","owner_id","created_at","updated_at","type","description","avatar","membership_lock","share_with_group_lock","visibility_level","request_access_enabled","ldap_sync_status","ldap_sync_error","ldap_sync_last_update_at","ldap_sync_last_successful_update_at","ldap_sync_last_sync_at","description_html","lfs_enabled","parent_id","shared_runners_minutes_limit","repository_size_limit","require_two_factor_authentication","two_factor_grace_period","cached_markdown_version","plan_id","project_creation_level"], - ["id","name","path","owner_id","created_at","updated_at","type","description","avatar","membership_lock","share_with_group_lock","visibility_level","request_access_enabled","ldap_sync_status","ldap_sync_error","ldap_sync_last_update_at","ldap_sync_last_successful_update_at","ldap_sync_last_sync_at","description_html","lfs_enabled","parent_id","shared_runners_minutes_limit","repository_size_limit","require_two_factor_authentication","two_factor_grace_period","cached_markdown_version","plan_id","project_creation_level"]) - puts Pseudonymity::Table.table_to_csv("notes", - ["id","note","noteable_type","author_id","created_at","updated_at","project_id","attachment","line_code","commit_id","noteable_id","system","st_diff","updated_by_id","type","position","original_position","resolved_at","resolved_by_id","discussion_id","note_html","cached_markdown_version","change_position","resolved_by_push"], - ["id","note","noteable_type","author_id","created_at","updated_at","project_id","attachment","line_code","commit_id","noteable_id","system","st_diff","updated_by_id","type","position","original_position","resolved_at","resolved_by_id","discussion_id","note_html","cached_markdown_version","change_position","resolved_by_push"]) - puts Pseudonymity::Table.table_to_csv("notification_settings", - ["id","user_id","source_id","source_type","level","created_at","updated_at","new_note","new_issue","reopen_issue","close_issue","reassign_issue","new_merge_request","reopen_merge_request","close_merge_request","reassign_merge_request","merge_merge_request","failed_pipeline","success_pipeline","push_to_merge_request","issue_due"], - ["id","user_id","source_id","source_type","level","created_at","updated_at","new_note","new_issue","reopen_issue","close_issue","reassign_issue","new_merge_request","reopen_merge_request","close_merge_request","reassign_merge_request","merge_merge_request","failed_pipeline","success_pipeline","push_to_merge_request","issue_due"]) - puts Pseudonymity::Table.table_to_csv("project_authorizations", - ["user_id","project_id","access_level"], - ["user_id","project_id","access_level"]) - puts Pseudonymity::Table.table_to_csv("project_auto_devops", - ["id","project_id","created_at","updated_at","enabled","domain"], - ["id","project_id","created_at","updated_at","enabled","domain"]) - puts Pseudonymity::Table.table_to_csv("project_ci_cd_settings", - ["id","project_id","group_runners_enabled"], - ["id","project_id","group_runners_enabled"]) - puts Pseudonymity::Table.table_to_csv("project_custom_attributes", - ["id","created_at","updated_at","project_id","key","value"], - ["id","created_at","updated_at","project_id","key","value"]) - puts Pseudonymity::Table.table_to_csv("project_deploy_tokens", - ["id","project_id","deploy_token_id","created_at"], - ["id","project_id","deploy_token_id","created_at"]) - puts Pseudonymity::Table.table_to_csv("project_features", - ["id","project_id","merge_requests_access_level","issues_access_level","wiki_access_level","snippets_access_level","builds_access_level","created_at","updated_at","repository_access_level"], - ["id","project_id","merge_requests_access_level","issues_access_level","wiki_access_level","snippets_access_level","builds_access_level","created_at","updated_at","repository_access_level"]) - puts Pseudonymity::Table.table_to_csv("project_group_links", - ["id","project_id","group_id","created_at","updated_at","group_access","expires_at"], - ["id","project_id","group_id","created_at","updated_at","group_access","expires_at"]) - puts Pseudonymity::Table.table_to_csv("project_import_data", - ["id","project_id","data","encrypted_credentials","encrypted_credentials_iv","encrypted_credentials_salt"], - ["id","project_id","data","encrypted_credentials","encrypted_credentials_iv","encrypted_credentials_salt"]) - puts Pseudonymity::Table.table_to_csv("project_mirror_data", - ["id","project_id","retry_count","last_update_started_at","last_update_scheduled_at","next_execution_timestamp","created_at","updated_at"], - ["id","project_id","retry_count","last_update_started_at","last_update_scheduled_at","next_execution_timestamp","created_at","updated_at"]) - puts Pseudonymity::Table.table_to_csv("project_repository_states", - ["id","project_id","repository_verification_checksum","wiki_verification_checksum","last_repository_verification_failure","last_wiki_verification_failure"], - ["id","project_id","repository_verification_checksum","wiki_verification_checksum","last_repository_verification_failure","last_wiki_verification_failure"]) - puts Pseudonymity::Table.table_to_csv("project_statistics", - ["id","project_id","namespace_id","commit_count","storage_size","repository_size","lfs_objects_size","build_artifacts_size","shared_runners_seconds","shared_runners_seconds_last_reset"], - ["id","project_id","namespace_id","commit_count","storage_size","repository_size","lfs_objects_size","build_artifacts_size","shared_runners_seconds","shared_runners_seconds_last_reset"]) - puts Pseudonymity::Table.table_to_csv("projects", - ["id","name","path","description","created_at","updated_at","creator_id","namespace_id","last_activity_at","import_url","visibility_level","archived","avatar","import_status","merge_requests_template","star_count","merge_requests_rebase_enabled","import_type","import_source","approvals_before_merge","reset_approvals_on_push","merge_requests_ff_only_enabled","issues_template","mirror","mirror_last_update_at","mirror_last_successful_update_at","mirror_user_id","import_error","ci_id","shared_runners_enabled","runners_token","build_coverage_regex","build_allow_git_fetch","build_timeout","mirror_trigger_builds","pending_delete","public_builds","last_repository_check_failed","last_repository_check_at","container_registry_enabled","only_allow_merge_if_pipeline_succeeds","has_external_issue_tracker","repository_storage","repository_read_only","request_access_enabled","has_external_wiki","ci_config_path","lfs_enabled","description_html","only_allow_merge_if_all_discussions_are_resolved","repository_size_limit","printing_merge_request_link_enabled","auto_cancel_pending_pipelines","service_desk_enabled","import_jid","cached_markdown_version","delete_error","last_repository_updated_at","disable_overriding_approvers_per_merge_request","storage_version","resolve_outdated_diff_discussions","remote_mirror_available_overridden","only_mirror_protected_branches","pull_mirror_available_overridden","jobs_cache_index","mirror_overwrites_diverged_branches","external_authorization_classification_label","external_webhook_token","pages_https_only"], - ["id","name","path","description","created_at","updated_at","creator_id","namespace_id","last_activity_at","import_url","visibility_level","archived","avatar","import_status","merge_requests_template","star_count","merge_requests_rebase_enabled","import_type","import_source","approvals_before_merge","reset_approvals_on_push","merge_requests_ff_only_enabled","issues_template","mirror","mirror_last_update_at","mirror_last_successful_update_at","mirror_user_id","import_error","ci_id","shared_runners_enabled","runners_token","build_coverage_regex","build_allow_git_fetch","build_timeout","mirror_trigger_builds","pending_delete","public_builds","last_repository_check_failed","last_repository_check_at","container_registry_enabled","only_allow_merge_if_pipeline_succeeds","has_external_issue_tracker","repository_storage","repository_read_only","request_access_enabled","has_external_wiki","ci_config_path","lfs_enabled","description_html","only_allow_merge_if_all_discussions_are_resolved","repository_size_limit","printing_merge_request_link_enabled","auto_cancel_pending_pipelines","service_desk_enabled","import_jid","cached_markdown_version","delete_error","last_repository_updated_at","disable_overriding_approvers_per_merge_request","storage_version","resolve_outdated_diff_discussions","remote_mirror_available_overridden","only_mirror_protected_branches","pull_mirror_available_overridden","jobs_cache_index","mirror_overwrites_diverged_branches","external_authorization_classification_label","external_webhook_token","pages_https_only"]) - puts Pseudonymity::Table.table_to_csv("subscriptions", - ["id","user_id","subscribable_id","subscribable_type","subscribed","created_at","updated_at","project_id"], - ["id","user_id","subscribable_id","subscribable_type","subscribed","created_at","updated_at","project_id"]) - puts Pseudonymity::Table.table_to_csv("users", - ["id","email","encrypted_password","reset_password_token","reset_password_sent_at","remember_created_at","sign_in_count","current_sign_in_at","last_sign_in_at","current_sign_in_ip","last_sign_in_ip","created_at","updated_at","name","admin","projects_limit","skype","linkedin","twitter","bio","failed_attempts","locked_at","username","can_create_group","can_create_team","state","color_scheme_id","password_expires_at","created_by_id","last_credential_check_at","avatar","confirmation_token","confirmed_at","confirmation_sent_at","unconfirmed_email","hide_no_ssh_key","website_url","admin_email_unsubscribed_at","notification_email","hide_no_password","password_automatically_set","location","encrypted_otp_secret","encrypted_otp_secret_iv","encrypted_otp_secret_salt","otp_required_for_login","otp_backup_codes","public_email","dashboard","project_view","consumed_timestep","layout","hide_project_limit","note","unlock_token","otp_grace_period_started_at","external","incoming_email_token","organization","auditor","require_two_factor_authentication_from_group","two_factor_grace_period","ghost","last_activity_on","notified_of_own_activity","support_bot","preferred_language","rss_token","email_opted_in","email_opted_in_ip","email_opted_in_source_id","email_opted_in_at","theme_id"], - ["id","email","encrypted_password","reset_password_token","reset_password_sent_at","remember_created_at","sign_in_count","current_sign_in_at","last_sign_in_at","current_sign_in_ip","last_sign_in_ip","created_at","updated_at","name","admin","projects_limit","skype","linkedin","twitter","bio","failed_attempts","locked_at","username","can_create_group","can_create_team","state","color_scheme_id","password_expires_at","created_by_id","last_credential_check_at","avatar","confirmation_token","confirmed_at","confirmation_sent_at","unconfirmed_email","hide_no_ssh_key","website_url","admin_email_unsubscribed_at","notification_email","hide_no_password","password_automatically_set","location","encrypted_otp_secret","encrypted_otp_secret_iv","encrypted_otp_secret_salt","otp_required_for_login","otp_backup_codes","public_email","dashboard","project_view","consumed_timestep","layout","hide_project_limit","note","unlock_token","otp_grace_period_started_at","external","incoming_email_token","organization","auditor","require_two_factor_authentication_from_group","two_factor_grace_period","ghost","last_activity_on","notified_of_own_activity","support_bot","preferred_language","rss_token","email_opted_in","email_opted_in_ip","email_opted_in_source_id","email_opted_in_at","theme_id"]) + puts table.tables_to_csv end end end -- GitLab From c88f66e51e2a5799df95815fb40ace3e805ccade Mon Sep 17 00:00:00 2001 From: Jacob Schatz Date: Wed, 2 May 2018 17:03:08 -0400 Subject: [PATCH 04/63] Update with more efficent looping. --- lib/pseudonymity/table.rb | 21 ++++++++------------- lib/tasks/gitlab/db.rake | 2 -- 2 files changed, 8 insertions(+), 15 deletions(-) diff --git a/lib/pseudonymity/table.rb b/lib/pseudonymity/table.rb index d5e9d8274aba..a9f3e1c4f26a 100644 --- a/lib/pseudonymity/table.rb +++ b/lib/pseudonymity/table.rb @@ -9,21 +9,16 @@ def initialize(fields) end def anonymize(results) - count = 0 - results.each do | r | - break if count > 0 - count += 1 - puts r.inspect - puts @anon_fields - new_hash = r.each_with_object({}) do | (k, v), h | - if @anon_fields.include? k and !v.nil? - h[k] = Digest::SHA2.new(256).hexdigest v - else - h[k] = v - end + columns = results.columns # Assume they all have the same table + to_filter = @anon_fields & columns + + results.each do |result| + to_filter.each do |field| + result[field] = Digest::SHA2.new(256).hexdigest(result[field]) unless result[field].nil? end - new_hash end + + results end end diff --git a/lib/tasks/gitlab/db.rake b/lib/tasks/gitlab/db.rake index a8468065e3e8..1da20a254a02 100644 --- a/lib/tasks/gitlab/db.rake +++ b/lib/tasks/gitlab/db.rake @@ -73,8 +73,6 @@ namespace :gitlab do desc 'Output pseudonymity dump of selected table' task :pseudonymity_dump => :environment do table = Pseudonymity::Table.new - # REMOVE PRODUCTION INFRA SCRIPT AS PART OF MR> - puts table.tables_to_csv end end end -- GitLab From 1ac6a03fcc2fffd6e43caf23d1a34947cab0b19e Mon Sep 17 00:00:00 2001 From: Jacob Schatz Date: Thu, 3 May 2018 13:33:20 -0400 Subject: [PATCH 05/63] Use enumerators to save memory instead of just plain old `each` --- lib/assets/pseudonymity_dump.yml | 82 ++++++++++++++++---------------- lib/pseudonymity/table.rb | 22 ++++----- lib/tasks/gitlab/db.rake | 1 + 3 files changed, 52 insertions(+), 53 deletions(-) diff --git a/lib/assets/pseudonymity_dump.yml b/lib/assets/pseudonymity_dump.yml index 6f542920aa64..b015d014da39 100644 --- a/lib/assets/pseudonymity_dump.yml +++ b/lib/assets/pseudonymity_dump.yml @@ -6,7 +6,7 @@ tables: - user_id - created_at - updated_at - anon: + pseudo: - id - merge_request_id - user_id @@ -17,7 +17,7 @@ tables: - group_id - created_at - updated_at - anon: + pseudo: - id - group_id board_assignees: @@ -25,7 +25,7 @@ tables: - id - board_id - assignee_id - anon: + pseudo: - id - board_id - assignee_id @@ -34,7 +34,7 @@ tables: - id - board_id - label_id - anon: + pseudo: - id - board_id - label_id @@ -47,7 +47,7 @@ tables: - milestone_id - group_id - weight - anon: + pseudo: - id - project_id - milestone_id @@ -58,7 +58,7 @@ tables: - epic_id - issue_id - relative_position - anon: + pseudo: - id - epic_id - issue_id @@ -68,7 +68,7 @@ tables: - epic_id - created_at - updated_at - anon: + pseudo: - id epics: whitelist: @@ -89,7 +89,7 @@ tables: - updated_at - title - description - anon: + pseudo: - id - milestone_id - group_id @@ -109,7 +109,7 @@ tables: whitelist: - user_id - issue_id - anon: + pseudo: - user_id - issue_id issue_links: @@ -119,7 +119,7 @@ tables: - target_id - created_at - updated_at - anon: + pseudo: - id - source_id - target_id @@ -132,7 +132,7 @@ tables: - first_added_to_board_at - created_at - updated_at - anon: + pseudo: - id - issue_id issues: @@ -157,7 +157,7 @@ tables: - discussion_locked - closed_at - closed_by_id - anon: + pseudo: - id - title - author_id @@ -177,7 +177,7 @@ tables: - target_type - created_at - updated_at - anon: + pseudo: - id - label_id - target_id @@ -189,7 +189,7 @@ tables: - priority - created_at - updated_at - anon: + pseudo: - id - project_id - label_id @@ -204,7 +204,7 @@ tables: - template - type - group_id - anon: + pseudo: - id - title - color @@ -219,7 +219,7 @@ tables: - id - created_at - updated_at - anon: + pseudo: - id merge_request_diff_commits: whitelist: @@ -231,7 +231,7 @@ tables: - author_email - committer_name - committer_email - anon: + pseudo: - merge_request_diff_id - author_name - author_email @@ -247,7 +247,7 @@ tables: - too_large - a_mode - b_mode - anon: + pseudo: - merge_request_diff_id merge_request_diffs: whitelist: @@ -261,7 +261,7 @@ tables: - head_commit_sha - start_commit_sha - commits_count - anon: + pseudo: - id - merge_request_id - base_commit_sha @@ -281,7 +281,7 @@ tables: - merged_by_id - latest_closed_by_id - latest_closed_at - anon: + pseudo: - id - merge_request_id - pipeline_id @@ -316,7 +316,7 @@ tables: - discussion_locked - latest_merge_request_diff_id - allow_maintainer_to_push - anon: + pseudo: - id - target_branch - source_branch @@ -337,7 +337,7 @@ tables: - issue_id - created_at - updated_at - anon: + pseudo: - id - merge_request_id - issue_id @@ -351,7 +351,7 @@ tables: - state - start_date - group_id - anon: + pseudo: - id - project_id - group_id @@ -361,7 +361,7 @@ tables: - namespace_id - shared_runners_seconds - shared_runners_seconds_last_reset - anon: + pseudo: - id - namespace_id namespaces: @@ -391,7 +391,7 @@ tables: - two_factor_grace_period - plan_id - project_creation_level - anon: + pseudo: - id - name - path @@ -441,7 +441,7 @@ tables: - cached_markdown_version - change_position - resolved_by_push - anon: + pseudo: - id - note - noteable_type @@ -489,7 +489,7 @@ tables: - success_pipeline - push_to_merge_request - issue_due - anon: + pseudo: - id - user_id - source_id @@ -516,7 +516,7 @@ tables: - user_id - project_id - access_level - anon: + pseudo: - user_id - project_id - access_level @@ -528,7 +528,7 @@ tables: - updated_at - enabled - domain - anon: + pseudo: - id - project_id - created_at @@ -540,7 +540,7 @@ tables: - id - project_id - group_runners_enabled - anon: + pseudo: - id - project_id - group_runners_enabled @@ -552,7 +552,7 @@ tables: - project_id - key - value - anon: + pseudo: - id - created_at - updated_at @@ -565,7 +565,7 @@ tables: - project_id - deploy_token_id - created_at - anon: + pseudo: - id - project_id - deploy_token_id @@ -582,7 +582,7 @@ tables: - created_at - updated_at - repository_access_level - anon: + pseudo: - id - project_id - merge_requests_access_level @@ -602,7 +602,7 @@ tables: - updated_at - group_access - expires_at - anon: + pseudo: - id - project_id - group_id @@ -618,7 +618,7 @@ tables: - encrypted_credentials - encrypted_credentials_iv - encrypted_credentials_salt - anon: + pseudo: - id - project_id - data @@ -635,7 +635,7 @@ tables: - next_execution_timestamp - created_at - updated_at - anon: + pseudo: - id - project_id - retry_count @@ -652,7 +652,7 @@ tables: - wiki_verification_checksum - last_repository_verification_failure - last_wiki_verification_failure - anon: + pseudo: - id - project_id - repository_verification_checksum @@ -671,7 +671,7 @@ tables: - build_artifacts_size - shared_runners_seconds - shared_runners_seconds_last_reset - anon: + pseudo: - id - project_id - namespace_id @@ -753,7 +753,7 @@ tables: - external_authorization_classification_label - external_webhook_token - pages_https_only - anon: + pseudo: - id - name - path @@ -833,7 +833,7 @@ tables: - created_at - updated_at - project_id - anon: + pseudo: - id - user_id - subscribable_id @@ -901,7 +901,7 @@ tables: - preferred_language - rss_token - theme_id - anon: + pseudo: - id - email - remember_created_at diff --git a/lib/pseudonymity/table.rb b/lib/pseudonymity/table.rb index a9f3e1c4f26a..8857ae5161fb 100644 --- a/lib/pseudonymity/table.rb +++ b/lib/pseudonymity/table.rb @@ -12,13 +12,14 @@ def anonymize(results) columns = results.columns # Assume they all have the same table to_filter = @anon_fields & columns - results.each do |result| - to_filter.each do |field| - result[field] = Digest::SHA2.new(256).hexdigest(result[field]) unless result[field].nil? + Enumerator.new do | yielder | + results.each do |result| + to_filter.each do |field| + result[field] = Digest::SHA2.new(256).hexdigest(result[field]) unless result[field].nil? + end + yielder << result end end - - results end end @@ -34,16 +35,17 @@ def tables_to_csv tables = @config["tables"] tables.map do | k, v | - table_to_csv(k, v["whitelist"], v["anon"]) + table_to_csv(k, v["whitelist"], v["pseudo"]) end end def table_to_csv(table, whitelist_columns, pseudonymity_columns) sql = "SELECT #{whitelist_columns.join(",")} from #{table}" results = ActiveRecord::Base.connection.exec_query(sql) + return if results.empty? + anon = Anon.new(pseudonymity_columns) - results = anon.anonymize results - write_to_csv_file table, results + write_to_csv_file(table, anon.anonymize(results)) end def parse_config @@ -52,10 +54,6 @@ def parse_config def write_to_csv_file(title, contents) file_path = "/tmp/#{title}.csv" - if contents.empty? - File.open(file_path, "w") {} - return file_path - end column_names = contents.first.keys contents = CSV.generate do | csv | csv << column_names diff --git a/lib/tasks/gitlab/db.rake b/lib/tasks/gitlab/db.rake index 1da20a254a02..e81ba775febe 100644 --- a/lib/tasks/gitlab/db.rake +++ b/lib/tasks/gitlab/db.rake @@ -73,6 +73,7 @@ namespace :gitlab do desc 'Output pseudonymity dump of selected table' task :pseudonymity_dump => :environment do table = Pseudonymity::Table.new + table.tables_to_csv end end end -- GitLab From e9c28fd01f9f69870912a40feab5f81a5e0f45aa Mon Sep 17 00:00:00 2001 From: Jacob Schatz Date: Fri, 4 May 2018 11:47:56 -0400 Subject: [PATCH 06/63] Updated config to take destination for files. --- lib/assets/pseudonymity_dump.yml | 2 ++ lib/pseudonymity/table.rb | 15 +++++++++------ 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/lib/assets/pseudonymity_dump.yml b/lib/assets/pseudonymity_dump.yml index b015d014da39..2f38f7829b59 100644 --- a/lib/assets/pseudonymity_dump.yml +++ b/lib/assets/pseudonymity_dump.yml @@ -1,3 +1,5 @@ +output: + csv: '/tmp/' tables: approvals: whitelist: diff --git a/lib/pseudonymity/table.rb b/lib/pseudonymity/table.rb index 8857ae5161fb..a3d52a50f3df 100644 --- a/lib/pseudonymity/table.rb +++ b/lib/pseudonymity/table.rb @@ -24,16 +24,19 @@ def anonymize(results) end class Table - - config = {} - def initialize + @config = {} + @csv_output = "" parse_config end def tables_to_csv tables = @config["tables"] - + @csv_output = @config["output"]["csv"] + if not File.directory?(@csv_output) + puts "No such directory #{@csv_output}" + return + end tables.map do | k, v | table_to_csv(k, v["whitelist"], v["pseudo"]) end @@ -43,7 +46,7 @@ def table_to_csv(table, whitelist_columns, pseudonymity_columns) sql = "SELECT #{whitelist_columns.join(",")} from #{table}" results = ActiveRecord::Base.connection.exec_query(sql) return if results.empty? - + anon = Anon.new(pseudonymity_columns) write_to_csv_file(table, anon.anonymize(results)) end @@ -53,7 +56,7 @@ def parse_config end def write_to_csv_file(title, contents) - file_path = "/tmp/#{title}.csv" + file_path = "#{@csv_output}/#{title}_#{Time.now.to_i}.csv" column_names = contents.first.keys contents = CSV.generate do | csv | csv << column_names -- GitLab From 833480daa66ecfb9daa137df7fdc24241572b69d Mon Sep 17 00:00:00 2001 From: Jacob Schatz Date: Mon, 7 May 2018 12:31:13 -0400 Subject: [PATCH 07/63] Add tests for pseudo anonymous --- ee/spec/lib/gitlab/pseudo_spec.rb | 62 +++++++++++++++++++++++++++++++ lib/pseudonymity/table.rb | 6 ++- lib/tasks/gitlab/db.rake | 2 +- 3 files changed, 67 insertions(+), 3 deletions(-) create mode 100644 ee/spec/lib/gitlab/pseudo_spec.rb diff --git a/ee/spec/lib/gitlab/pseudo_spec.rb b/ee/spec/lib/gitlab/pseudo_spec.rb new file mode 100644 index 000000000000..afd418e462f2 --- /dev/null +++ b/ee/spec/lib/gitlab/pseudo_spec.rb @@ -0,0 +1,62 @@ +require 'spec_helper' + +describe Gitlab::Pseudonymity do + + let!(:project) { create(:project) } + let(:base_dir) { Dir.mktmpdir } + subject(:pseudo) { Pseudonymity::Table.new } + + after do + FileUtils.rm_rf(base_dir) + end + + # create temp directory in before block + describe 'Pseudo tables' do + it 'outputs project tables to csv' do + pseudo.config["output"]["csv"] = base_dir + pseudo.config["tables"] = { + "projects" => { + "whitelist" => [ + "id", + "name", + "path", + "description" + ], + "pseudo" => [ + "id" + ] + } + } + + expect(pseudo.config["output"]["csv"]).to eq(base_dir) + + # grab the first table it outputs. There would only be 1. + project_table_file = pseudo.tables_to_csv[0] + + # Ignore the `.` and `..` in the directory. + entry = Dir.entries(base_dir)[2] + expect(project_table_file.include? "projects_").to be true + expect(project_table_file.include? ".csv").to be true + expect(project_table_file.include? entry).to be true + columns = [] + project_data = [] + File.foreach(project_table_file).with_index do |line, line_num| + if line_num == 0 + columns = line.split(",") + end + if line_num == 1 + project_data = line.split(",") + break + end + end + + # check if CSV columns are correct + expect(columns.to_set).to eq(["id", "name", "path", "description\n"].to_set) + + # is it pseudonymous + expect(project_data[0]).not_to eq(1) + # sha 256 is 64 chars in length + expect(project_data[0].length).to eq(64) + end + end +end diff --git a/lib/pseudonymity/table.rb b/lib/pseudonymity/table.rb index a3d52a50f3df..5074132ca39a 100644 --- a/lib/pseudonymity/table.rb +++ b/lib/pseudonymity/table.rb @@ -24,6 +24,8 @@ def anonymize(results) end class Table + attr_accessor :config + def initialize @config = {} @csv_output = "" @@ -31,8 +33,8 @@ def initialize end def tables_to_csv - tables = @config["tables"] - @csv_output = @config["output"]["csv"] + tables = config["tables"] + @csv_output = config["output"]["csv"] if not File.directory?(@csv_output) puts "No such directory #{@csv_output}" return diff --git a/lib/tasks/gitlab/db.rake b/lib/tasks/gitlab/db.rake index e81ba775febe..bb8a1417ca3e 100644 --- a/lib/tasks/gitlab/db.rake +++ b/lib/tasks/gitlab/db.rake @@ -70,7 +70,7 @@ namespace :gitlab do Gitlab::DowntimeCheck.new.check_and_print(migrations) end - desc 'Output pseudonymity dump of selected table' + desc 'Output pseudonymity dump of selected tables' task :pseudonymity_dump => :environment do table = Pseudonymity::Table.new table.tables_to_csv -- GitLab From 17ddbcbba503c5b372c585fc3c2d55600177026a Mon Sep 17 00:00:00 2001 From: Jacob Schatz Date: Tue, 8 May 2018 13:38:56 -0400 Subject: [PATCH 08/63] Adds schema file to the output files --- lib/pseudonymity/table.rb | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/lib/pseudonymity/table.rb b/lib/pseudonymity/table.rb index 5074132ca39a..b0507b2877e6 100644 --- a/lib/pseudonymity/table.rb +++ b/lib/pseudonymity/table.rb @@ -30,6 +30,7 @@ def initialize @config = {} @csv_output = "" parse_config + @schema = {} end def tables_to_csv @@ -40,19 +41,35 @@ def tables_to_csv return end tables.map do | k, v | + @schema[k] = {} table_to_csv(k, v["whitelist"], v["pseudo"]) end + schema_to_yml + end + + def schema_to_yml + file_path = "#{@csv_output}/schema_#{Time.now.to_i}.yml" + File.open(file_path, 'w') { |file| file.write(@schema.to_yaml) } end def table_to_csv(table, whitelist_columns, pseudonymity_columns) - sql = "SELECT #{whitelist_columns.join(",")} from #{table}" + sql = "SELECT #{whitelist_columns.join(",")} FROM #{table};" + type_sql = "SELECT column_name, data_type FROM information_schema.columns WHERE table_name = '#{table}';" results = ActiveRecord::Base.connection.exec_query(sql) + type_results = ActiveRecord::Base.connection.exec_query(type_sql) + set_schema_column_types(table, type_results) return if results.empty? anon = Anon.new(pseudonymity_columns) write_to_csv_file(table, anon.anonymize(results)) end + def set_schema_column_types(table, type_results) + type_results.each do | type_result | + @schema[table][type_result["column_name"]] = type_result["data_type"] + end + end + def parse_config @config = YAML.load_file('./lib/assets/pseudonymity_dump.yml') end -- GitLab From ec5338166501dad5c87a5446115fb4a6f7fc5e79 Mon Sep 17 00:00:00 2001 From: Jacob Schatz Date: Tue, 8 May 2018 14:03:57 -0400 Subject: [PATCH 09/63] Added mapping key --- lib/pseudonymity/table.rb | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/pseudonymity/table.rb b/lib/pseudonymity/table.rb index b0507b2877e6..3107478d4627 100644 --- a/lib/pseudonymity/table.rb +++ b/lib/pseudonymity/table.rb @@ -68,6 +68,8 @@ def set_schema_column_types(table, type_results) type_results.each do | type_result | @schema[table][type_result["column_name"]] = type_result["data_type"] end + # hard coded because all mapping keys in GL are id + @schema[table]["gl_mapping_key"] = "id" end def parse_config -- GitLab From f6774011c22aeda3c6824319985476cbe39e6d49 Mon Sep 17 00:00:00 2001 From: Jacob Schatz Date: Tue, 8 May 2018 14:48:28 -0400 Subject: [PATCH 10/63] Create list of files that are output. --- lib/pseudonymity/table.rb | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/lib/pseudonymity/table.rb b/lib/pseudonymity/table.rb index 3107478d4627..21a472abbda7 100644 --- a/lib/pseudonymity/table.rb +++ b/lib/pseudonymity/table.rb @@ -31,11 +31,12 @@ def initialize @csv_output = "" parse_config @schema = {} + @output_files = [] end def tables_to_csv tables = config["tables"] - @csv_output = config["output"]["csv"] + @csv_output = config["output"]["csv"].chomp("\/") if not File.directory?(@csv_output) puts "No such directory #{@csv_output}" return @@ -45,13 +46,20 @@ def tables_to_csv table_to_csv(k, v["whitelist"], v["pseudo"]) end schema_to_yml + file_list_to_json end def schema_to_yml file_path = "#{@csv_output}/schema_#{Time.now.to_i}.yml" + @output_files << file_path File.open(file_path, 'w') { |file| file.write(@schema.to_yaml) } end + def file_list_to_json + file_path = "#{@csv_output}/file_list.json" + File.open(file_path, 'w') { |file| file.write(@output_files.to_json) } + end + def table_to_csv(table, whitelist_columns, pseudonymity_columns) sql = "SELECT #{whitelist_columns.join(",")} FROM #{table};" type_sql = "SELECT column_name, data_type FROM information_schema.columns WHERE table_name = '#{table}';" @@ -86,6 +94,7 @@ def write_to_csv_file(title, contents) end end File.open(file_path, 'w') { |file| file.write(contents) } + @output_files << file_path return file_path end -- GitLab From 07190d8de945c549a6c486718b43b472652f8f30 Mon Sep 17 00:00:00 2001 From: Jacob Schatz Date: Tue, 8 May 2018 15:22:24 -0400 Subject: [PATCH 11/63] Add file list and move file naming to a method --- lib/pseudonymity/table.rb | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/lib/pseudonymity/table.rb b/lib/pseudonymity/table.rb index 21a472abbda7..3902afb5f26c 100644 --- a/lib/pseudonymity/table.rb +++ b/lib/pseudonymity/table.rb @@ -36,7 +36,7 @@ def initialize def tables_to_csv tables = config["tables"] - @csv_output = config["output"]["csv"].chomp("\/") + @csv_output = config["output"]["csv"].chomp("\g/") if not File.directory?(@csv_output) puts "No such directory #{@csv_output}" return @@ -49,14 +49,20 @@ def tables_to_csv file_list_to_json end + def get_and_log_file_name(ext, prefix=nil, filename=nil) + file_timestamp = filename || "#{prefix}_#{Time.now.to_i}" + file_timestamp = "#{file_timestamp}.#{ext}" + @output_files << file_timestamp + "#{@csv_output}/#{file_timestamp}" + end + def schema_to_yml - file_path = "#{@csv_output}/schema_#{Time.now.to_i}.yml" - @output_files << file_path + file_path = get_and_log_file_name("yml", "schema") File.open(file_path, 'w') { |file| file.write(@schema.to_yaml) } end def file_list_to_json - file_path = "#{@csv_output}/file_list.json" + file_path = get_and_log_file_name("json", nil, "file_list") File.open(file_path, 'w') { |file| file.write(@output_files.to_json) } end @@ -85,7 +91,7 @@ def parse_config end def write_to_csv_file(title, contents) - file_path = "#{@csv_output}/#{title}_#{Time.now.to_i}.csv" + file_path = get_and_log_file_name("csv", title) column_names = contents.first.keys contents = CSV.generate do | csv | csv << column_names @@ -94,7 +100,6 @@ def write_to_csv_file(title, contents) end end File.open(file_path, 'w') { |file| file.write(contents) } - @output_files << file_path return file_path end -- GitLab From 29761e09af9459f1c684a409223fe11ac0aeabdd Mon Sep 17 00:00:00 2001 From: Jacob Schatz Date: Tue, 15 May 2018 15:30:01 -0400 Subject: [PATCH 12/63] Add in column change for character varying --- lib/pseudonymity/table.rb | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/lib/pseudonymity/table.rb b/lib/pseudonymity/table.rb index 3902afb5f26c..d3ee2c932545 100644 --- a/lib/pseudonymity/table.rb +++ b/lib/pseudonymity/table.rb @@ -80,7 +80,11 @@ def table_to_csv(table, whitelist_columns, pseudonymity_columns) def set_schema_column_types(table, type_results) type_results.each do | type_result | - @schema[table][type_result["column_name"]] = type_result["data_type"] + data_type = type_result["data_type"] + if @config["tables"][table]["pseudo"].include?(type_result["column_name"]) + data_type = "character varying" + end + @schema[table][type_result["column_name"]] = data_type end # hard coded because all mapping keys in GL are id @schema[table]["gl_mapping_key"] = "id" -- GitLab From 43d0e5e45020bb96903e273168ebb9a6f9989051 Mon Sep 17 00:00:00 2001 From: Jacob Schatz Date: Mon, 21 May 2018 08:29:15 -0400 Subject: [PATCH 13/63] Does HMAC SHA256 with secret instead of sans secret. --- lib/assets/pseudonymity_dump.yml | 2 -- lib/pseudonymity/table.rb | 4 +++- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/assets/pseudonymity_dump.yml b/lib/assets/pseudonymity_dump.yml index 2f38f7829b59..85394f70e41a 100644 --- a/lib/assets/pseudonymity_dump.yml +++ b/lib/assets/pseudonymity_dump.yml @@ -635,8 +635,6 @@ tables: - last_update_started_at - last_update_scheduled_at - next_execution_timestamp - - created_at - - updated_at pseudo: - id - project_id diff --git a/lib/pseudonymity/table.rb b/lib/pseudonymity/table.rb index d3ee2c932545..da2fb802297c 100644 --- a/lib/pseudonymity/table.rb +++ b/lib/pseudonymity/table.rb @@ -1,3 +1,4 @@ +require 'OpenSSL' require 'digest' require 'csv' require 'yaml' @@ -15,7 +16,8 @@ def anonymize(results) Enumerator.new do | yielder | results.each do |result| to_filter.each do |field| - result[field] = Digest::SHA2.new(256).hexdigest(result[field]) unless result[field].nil? + secret = Rails.application.secrets[:secret_key_base] + result[field] = OpenSSL::HMAC.hexdigest('SHA256', secret, result[field]) unless result[field].nil? end yielder << result end -- GitLab From eefea6d2162bd7a62b3bb46e4d11f85473056d7b Mon Sep 17 00:00:00 2001 From: Jacob Schatz Date: Mon, 21 May 2018 12:42:55 -0400 Subject: [PATCH 14/63] Fix review --- lib/pseudonymity/table.rb | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/lib/pseudonymity/table.rb b/lib/pseudonymity/table.rb index da2fb802297c..c474ea5231f1 100644 --- a/lib/pseudonymity/table.rb +++ b/lib/pseudonymity/table.rb @@ -38,12 +38,11 @@ def initialize def tables_to_csv tables = config["tables"] - @csv_output = config["output"]["csv"].chomp("\g/") - if not File.directory?(@csv_output) - puts "No such directory #{@csv_output}" - return + @csv_output = config["output"]["csv"] + if !File.directory?(@csv_output) + raise "No such directory #{@csv_output}" end - tables.map do | k, v | + tables.each do | k, v | @schema[k] = {} table_to_csv(k, v["whitelist"], v["pseudo"]) end @@ -55,7 +54,7 @@ def get_and_log_file_name(ext, prefix=nil, filename=nil) file_timestamp = filename || "#{prefix}_#{Time.now.to_i}" file_timestamp = "#{file_timestamp}.#{ext}" @output_files << file_timestamp - "#{@csv_output}/#{file_timestamp}" + File.join(@csv_output, file_timestamp) end def schema_to_yml @@ -70,9 +69,21 @@ def file_list_to_json def table_to_csv(table, whitelist_columns, pseudonymity_columns) sql = "SELECT #{whitelist_columns.join(",")} FROM #{table};" - type_sql = "SELECT column_name, data_type FROM information_schema.columns WHERE table_name = '#{table}';" + # type_sql = "SELECT column_name, data_type FROM information_schema.columns WHERE table_name = '#{table}';" results = ActiveRecord::Base.connection.exec_query(sql) - type_results = ActiveRecord::Base.connection.exec_query(type_sql) + # type_results = ActiveRecord::Base.connection.exec_query(type_sql) + + type_results = ActiveRecord::Base.connection.columns(table) + type_results = type_results.select do |c| + @config["tables"][table]["whitelist"].include?(c.name) + end + type_results = type_results.map do |c| + data_type = c.sql_type + if @config["tables"][table]["pseudo"].include?(c.name) + data_type = "character varying" + end + { name: c.name, data_type: data_type } + end set_schema_column_types(table, type_results) return if results.empty? @@ -82,18 +93,14 @@ def table_to_csv(table, whitelist_columns, pseudonymity_columns) def set_schema_column_types(table, type_results) type_results.each do | type_result | - data_type = type_result["data_type"] - if @config["tables"][table]["pseudo"].include?(type_result["column_name"]) - data_type = "character varying" - end - @schema[table][type_result["column_name"]] = data_type + @schema[table][type_result[:name]] = type_result[:data_type] end # hard coded because all mapping keys in GL are id @schema[table]["gl_mapping_key"] = "id" end def parse_config - @config = YAML.load_file('./lib/assets/pseudonymity_dump.yml') + @config = YAML.load_file(Rails.root.join('lib/assets/pseudonymity_dump.yml')) end def write_to_csv_file(title, contents) -- GitLab From 9b2160bd00a84fc3c0433be1e09d87a4fdfb3167 Mon Sep 17 00:00:00 2001 From: Jacob Schatz Date: Mon, 21 May 2018 13:44:48 -0400 Subject: [PATCH 15/63] Static analysis pass --- {lib => ee/lib}/assets/pseudonymity_dump.yml | 0 .../{pseudo_spec.rb => pseudonymity_spec.rb} | 18 ++++--------- lib/pseudonymity/table.rb | 26 ++++++++++--------- lib/tasks/gitlab/db.rake | 2 +- 4 files changed, 20 insertions(+), 26 deletions(-) rename {lib => ee/lib}/assets/pseudonymity_dump.yml (100%) rename ee/spec/lib/gitlab/{pseudo_spec.rb => pseudonymity_spec.rb} (84%) diff --git a/lib/assets/pseudonymity_dump.yml b/ee/lib/assets/pseudonymity_dump.yml similarity index 100% rename from lib/assets/pseudonymity_dump.yml rename to ee/lib/assets/pseudonymity_dump.yml diff --git a/ee/spec/lib/gitlab/pseudo_spec.rb b/ee/spec/lib/gitlab/pseudonymity_spec.rb similarity index 84% rename from ee/spec/lib/gitlab/pseudo_spec.rb rename to ee/spec/lib/gitlab/pseudonymity_spec.rb index afd418e462f2..2a86a0a10120 100644 --- a/ee/spec/lib/gitlab/pseudo_spec.rb +++ b/ee/spec/lib/gitlab/pseudonymity_spec.rb @@ -1,7 +1,6 @@ require 'spec_helper' describe Gitlab::Pseudonymity do - let!(:project) { create(:project) } let(:base_dir) { Dir.mktmpdir } subject(:pseudo) { Pseudonymity::Table.new } @@ -16,20 +15,13 @@ pseudo.config["output"]["csv"] = base_dir pseudo.config["tables"] = { "projects" => { - "whitelist" => [ - "id", - "name", - "path", - "description" - ], - "pseudo" => [ - "id" - ] + "whitelist" => %w(id name path description), + "pseudo" => %w(id) } } expect(pseudo.config["output"]["csv"]).to eq(base_dir) - + # grab the first table it outputs. There would only be 1. project_table_file = pseudo.tables_to_csv[0] @@ -44,14 +36,14 @@ if line_num == 0 columns = line.split(",") end + if line_num == 1 project_data = line.split(",") break end end - # check if CSV columns are correct - expect(columns.to_set).to eq(["id", "name", "path", "description\n"].to_set) + expect(columns.to_set).to eq(%W(id name path description\n).to_set) # is it pseudonymous expect(project_data[0]).not_to eq(1) diff --git a/lib/pseudonymity/table.rb b/lib/pseudonymity/table.rb index c474ea5231f1..cb995ea48c85 100644 --- a/lib/pseudonymity/table.rb +++ b/lib/pseudonymity/table.rb @@ -13,7 +13,7 @@ def anonymize(results) columns = results.columns # Assume they all have the same table to_filter = @anon_fields & columns - Enumerator.new do | yielder | + Enumerator.new do |yielder| results.each do |result| to_filter.each do |field| secret = Rails.application.secrets[:secret_key_base] @@ -39,10 +39,12 @@ def initialize def tables_to_csv tables = config["tables"] @csv_output = config["output"]["csv"] - if !File.directory?(@csv_output) + + unless File.directory?(@csv_output) raise "No such directory #{@csv_output}" end - tables.each do | k, v | + + tables.each do |k, v| @schema[k] = {} table_to_csv(k, v["whitelist"], v["pseudo"]) end @@ -50,7 +52,7 @@ def tables_to_csv file_list_to_json end - def get_and_log_file_name(ext, prefix=nil, filename=nil) + def get_and_log_file_name(ext, prefix = nil, filename = nil) file_timestamp = filename || "#{prefix}_#{Time.now.to_i}" file_timestamp = "#{file_timestamp}.#{ext}" @output_files << file_timestamp @@ -69,19 +71,19 @@ def file_list_to_json def table_to_csv(table, whitelist_columns, pseudonymity_columns) sql = "SELECT #{whitelist_columns.join(",")} FROM #{table};" - # type_sql = "SELECT column_name, data_type FROM information_schema.columns WHERE table_name = '#{table}';" results = ActiveRecord::Base.connection.exec_query(sql) - # type_results = ActiveRecord::Base.connection.exec_query(type_sql) - + type_results = ActiveRecord::Base.connection.columns(table) - type_results = type_results.select do |c| + type_results = type_results.select do |c| @config["tables"][table]["whitelist"].include?(c.name) end type_results = type_results.map do |c| data_type = c.sql_type + if @config["tables"][table]["pseudo"].include?(c.name) data_type = "character varying" end + { name: c.name, data_type: data_type } end set_schema_column_types(table, type_results) @@ -92,7 +94,7 @@ def table_to_csv(table, whitelist_columns, pseudonymity_columns) end def set_schema_column_types(table, type_results) - type_results.each do | type_result | + type_results.each do |type_result| @schema[table][type_result[:name]] = type_result[:data_type] end # hard coded because all mapping keys in GL are id @@ -106,16 +108,16 @@ def parse_config def write_to_csv_file(title, contents) file_path = get_and_log_file_name("csv", title) column_names = contents.first.keys - contents = CSV.generate do | csv | + contents = CSV.generate do |csv| csv << column_names contents.each do |x| csv << x.values end end File.open(file_path, 'w') { |file| file.write(contents) } - return file_path + file_path end private :write_to_csv_file end -end \ No newline at end of file +end diff --git a/lib/tasks/gitlab/db.rake b/lib/tasks/gitlab/db.rake index bb8a1417ca3e..d261a59900cb 100644 --- a/lib/tasks/gitlab/db.rake +++ b/lib/tasks/gitlab/db.rake @@ -71,7 +71,7 @@ namespace :gitlab do end desc 'Output pseudonymity dump of selected tables' - task :pseudonymity_dump => :environment do + task pseudonymity_dump: :environment do table = Pseudonymity::Table.new table.tables_to_csv end -- GitLab From 121a4c2df5fea916da5692bb71bbb5fd72092a31 Mon Sep 17 00:00:00 2001 From: Jacob Schatz Date: Mon, 21 May 2018 13:49:23 -0400 Subject: [PATCH 16/63] Open ssl case sensitive --- lib/pseudonymity/table.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/pseudonymity/table.rb b/lib/pseudonymity/table.rb index cb995ea48c85..289d891dc42f 100644 --- a/lib/pseudonymity/table.rb +++ b/lib/pseudonymity/table.rb @@ -1,4 +1,4 @@ -require 'OpenSSL' +require 'openssl' require 'digest' require 'csv' require 'yaml' -- GitLab From 1481e45ba38861ac5e088026492e48705142ae47 Mon Sep 17 00:00:00 2001 From: Jacob Schatz Date: Mon, 21 May 2018 15:13:14 -0400 Subject: [PATCH 17/63] Move pseudonymity to ee folder --- {lib => ee/lib}/pseudonymity/table.rb | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename {lib => ee/lib}/pseudonymity/table.rb (100%) diff --git a/lib/pseudonymity/table.rb b/ee/lib/pseudonymity/table.rb similarity index 100% rename from lib/pseudonymity/table.rb rename to ee/lib/pseudonymity/table.rb -- GitLab From e461650ff2e334f997a287eeb6af86710974fbc8 Mon Sep 17 00:00:00 2001 From: Jacob Schatz Date: Wed, 23 May 2018 10:23:52 -0400 Subject: [PATCH 18/63] Fix failing tests. --- ee/lib/pseudonymity/table.rb | 5 +++-- ee/spec/lib/gitlab/pseudonymity_spec.rb | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/ee/lib/pseudonymity/table.rb b/ee/lib/pseudonymity/table.rb index 289d891dc42f..cfa09542fb40 100644 --- a/ee/lib/pseudonymity/table.rb +++ b/ee/lib/pseudonymity/table.rb @@ -44,12 +44,13 @@ def tables_to_csv raise "No such directory #{@csv_output}" end - tables.each do |k, v| + new_tables = tables.map do |k, v| @schema[k] = {} table_to_csv(k, v["whitelist"], v["pseudo"]) end schema_to_yml file_list_to_json + new_tables end def get_and_log_file_name(ext, prefix = nil, filename = nil) @@ -102,7 +103,7 @@ def set_schema_column_types(table, type_results) end def parse_config - @config = YAML.load_file(Rails.root.join('lib/assets/pseudonymity_dump.yml')) + @config = YAML.load_file(Rails.root.join('./ee/lib/assets/pseudonymity_dump.yml')) end def write_to_csv_file(title, contents) diff --git a/ee/spec/lib/gitlab/pseudonymity_spec.rb b/ee/spec/lib/gitlab/pseudonymity_spec.rb index 2a86a0a10120..f5533962d03e 100644 --- a/ee/spec/lib/gitlab/pseudonymity_spec.rb +++ b/ee/spec/lib/gitlab/pseudonymity_spec.rb @@ -24,12 +24,12 @@ # grab the first table it outputs. There would only be 1. project_table_file = pseudo.tables_to_csv[0] - # Ignore the `.` and `..` in the directory. + entry = Dir.entries(base_dir)[2] + expect(project_table_file.include? "projects_").to be true expect(project_table_file.include? ".csv").to be true - expect(project_table_file.include? entry).to be true columns = [] project_data = [] File.foreach(project_table_file).with_index do |line, line_num| -- GitLab From fbe1899842ea22d888016b0fda918fb7f00ce100 Mon Sep 17 00:00:00 2001 From: Jacob Schatz Date: Thu, 31 May 2018 18:29:05 -0400 Subject: [PATCH 19/63] Add settings page for enabling gitlab elt database dump. --- .../_elt_database_cron_job.html.haml | 19 ++++++++++++ .../admin/application_settings/show.html.haml | 11 +++++++ app/workers/gitlab_elt_data_dump_worker.rb | 10 ++++++ config/initializers/1_settings.rb | 5 +++ db/schema.rb | 5 ++- .../helpers/ee/application_settings_helper.rb | 3 +- ee/app/models/ee/application_setting.rb | 11 ++++++- ...lt_dump_enabled_to_application_settings.rb | 31 +++++++++++++++++++ ee/spec/lib/gitlab/pseudonymity_spec.rb | 2 -- 9 files changed, 92 insertions(+), 5 deletions(-) create mode 100644 app/views/admin/application_settings/_elt_database_cron_job.html.haml create mode 100644 app/workers/gitlab_elt_data_dump_worker.rb create mode 100644 ee/db/migrate/20180531221734_add_elt_dump_enabled_to_application_settings.rb diff --git a/app/views/admin/application_settings/_elt_database_cron_job.html.haml b/app/views/admin/application_settings/_elt_database_cron_job.html.haml new file mode 100644 index 000000000000..d7c8d5ec04cc --- /dev/null +++ b/app/views/admin/application_settings/_elt_database_cron_job.html.haml @@ -0,0 +1,19 @@ += form_for @application_setting, url: admin_application_settings_path, html: { class: 'fieldset-form' } do |f| + = form_errors(@application_setting) + + %fieldset + .form-group.row + .offset-sm-2.col-sm-10 + - can_be_configured = @application_setting.elt_database_dump_can_be_configured? + .form-check + = f.label :elt_database_dump_enabled do + = f.check_box :elt_database_dump_enabled, disabled: !can_be_configured + Enable ELT Database Cron Job + .form-text.text-muted + - if can_be_configured + GitLab will run a cron job which will send pseudoanonymized data to be processed and analyzed. + - else + The ELT database cron job is disabled. When enabled the cron job will send pseudoanonymized data to be processed and analyzed. + + = f.submit 'Save changes', class: "btn btn-success" + diff --git a/app/views/admin/application_settings/show.html.haml b/app/views/admin/application_settings/show.html.haml index 487dc7c1c55b..c9f374897f24 100644 --- a/app/views/admin/application_settings/show.html.haml +++ b/app/views/admin/application_settings/show.html.haml @@ -237,6 +237,17 @@ .settings-content = render 'usage' +%section.settings.as-usage.no-animate#js-elt-database-dump-settings{ class: ('expanded' if expanded) } + .settings-header + %h4 + = _('ELT Database Cron Job') + %button.btn.btn-default.js-settings-toggle{ type: 'button' } + = expanded ? _('Collapse') : _('Expand') + %p + = _('Enable or disable ELT Database Cron Job.') + .settings-content + = render 'elt_database_cron_job' + %section.settings.as-email.no-animate#js-email-settings{ class: ('expanded' if expanded) } .settings-header %h4 diff --git a/app/workers/gitlab_elt_data_dump_worker.rb b/app/workers/gitlab_elt_data_dump_worker.rb new file mode 100644 index 000000000000..ad1662d7e51f --- /dev/null +++ b/app/workers/gitlab_elt_data_dump_worker.rb @@ -0,0 +1,10 @@ +class GitlabELTDataDumpWorker + include ApplicationWorker + include CronjobQueue + + def perform + return unless Gitlab::CurrentSettings.elt_database_dump_enabled + + Pseudonymity::Table.new.tables_to_csv + end +end diff --git a/config/initializers/1_settings.rb b/config/initializers/1_settings.rb index 088c06096437..80d6c4f19f99 100644 --- a/config/initializers/1_settings.rb +++ b/config/initializers/1_settings.rb @@ -162,6 +162,7 @@ Settings.gitlab['trusted_proxies'] ||= [] Settings.gitlab['no_todos_messages'] ||= YAML.load_file(Rails.root.join('config', 'no_todos_messages.yml')) Settings.gitlab['usage_ping_enabled'] = true if Settings.gitlab['usage_ping_enabled'].nil? +Settings.gitlab['elt_database_dump_enabled'] = true if Settings.gitlab['elt_database_dump_enabled'].nil? # # Elasticseacrh @@ -370,6 +371,10 @@ Settings.cron_jobs['gitlab_usage_ping_worker']['cron'] ||= Settings.__send__(:cron_for_usage_ping) Settings.cron_jobs['gitlab_usage_ping_worker']['job_class'] = 'GitlabUsagePingWorker' +Settings.cron_jobs['gitlab_elt_database_dump'] ||= Settingslogic.new({}) +Settings.cron_jobs['gitlab_elt_database_dump']['cron'] ||= '0 23 * * *'; +Settings.cron_jobs['gitlab_elt_database_dump']['job_class'] ||= 'GitlabELTDataDumpWorker'; + Settings.cron_jobs['schedule_update_user_activity_worker'] ||= Settingslogic.new({}) Settings.cron_jobs['schedule_update_user_activity_worker']['cron'] ||= '30 0 * * *' Settings.cron_jobs['schedule_update_user_activity_worker']['job_class'] = 'ScheduleUpdateUserActivityWorker' diff --git a/db/schema.rb b/db/schema.rb index d1955f910ee1..ff7756d7ffc6 100644 --- a/db/schema.rb +++ b/db/schema.rb @@ -206,6 +206,7 @@ t.string "encrypted_external_auth_client_key_pass_iv" t.string "email_additional_text" t.boolean "enforce_terms", default: false + t.boolean "elt_database_dump_enabled" end create_table "approvals", force: :cascade do |t| @@ -1631,6 +1632,7 @@ t.text "title_html" t.text "description_html" t.integer "time_estimate" + t.boolean "squash", default: false, null: false t.integer "cached_markdown_version" t.datetime "last_edited_at" t.integer "last_edited_by_id" @@ -2020,9 +2022,9 @@ t.datetime "next_execution_timestamp" t.string "status" t.string "jid" + t.text "last_error" t.datetime_with_timezone "last_update_at" t.datetime_with_timezone "last_successful_update_at" - t.text "last_error" end add_index "project_mirror_data", ["jid"], name: "index_project_mirror_data_on_jid", using: :btree @@ -2277,6 +2279,7 @@ end add_index "redirect_routes", ["path"], name: "index_redirect_routes_on_path", unique: true, using: :btree + add_index "redirect_routes", ["path"], name: "index_redirect_routes_on_path_text_pattern_ops", using: :btree, opclasses: {"path"=>"varchar_pattern_ops"} add_index "redirect_routes", ["source_type", "source_id"], name: "index_redirect_routes_on_source_type_and_source_id", using: :btree create_table "releases", force: :cascade do |t| diff --git a/ee/app/helpers/ee/application_settings_helper.rb b/ee/app/helpers/ee/application_settings_helper.rb index 86052c233a3b..c672d32ef6ab 100644 --- a/ee/app/helpers/ee/application_settings_helper.rb +++ b/ee/app/helpers/ee/application_settings_helper.rb @@ -55,7 +55,8 @@ def visible_attributes :slack_app_id, :slack_app_secret, :slack_app_verification_token, - :allow_group_owners_to_manage_ldap + :allow_group_owners_to_manage_ldap, + :elt_database_dump_enabled ] end diff --git a/ee/app/models/ee/application_setting.rb b/ee/app/models/ee/application_setting.rb index ed4cd6dbab13..5c9a1e04640d 100644 --- a/ee/app/models/ee/application_setting.rb +++ b/ee/app/models/ee/application_setting.rb @@ -100,11 +100,20 @@ def defaults slack_app_enabled: false, slack_app_id: nil, slack_app_secret: nil, - slack_app_verification_token: nil + slack_app_verification_token: nil, + elt_database_dump_enabled: Settings.gitlab['elt_database_dump_enabled'], ) end end + def elt_database_dump_can_be_configured? + Settings.gitlab.elt_database_dump_enabled + end + + def elt_database_dump_enabled + elt_database_dump_can_be_configured? && super + end + def should_check_namespace_plan? check_namespace_plan? && (Rails.env.test? || ::Gitlab.dev_env_or_com?) end diff --git a/ee/db/migrate/20180531221734_add_elt_dump_enabled_to_application_settings.rb b/ee/db/migrate/20180531221734_add_elt_dump_enabled_to_application_settings.rb new file mode 100644 index 000000000000..0fb3f68327a2 --- /dev/null +++ b/ee/db/migrate/20180531221734_add_elt_dump_enabled_to_application_settings.rb @@ -0,0 +1,31 @@ +# See http://doc.gitlab.com/ce/development/migration_style_guide.html +# for more information on how to write migrations for GitLab. + +class AddEltDumpEnabledToApplicationSettings < ActiveRecord::Migration + include Gitlab::Database::MigrationHelpers + + # Set this constant to true if this migration requires downtime. + DOWNTIME = false + + # When a migration requires downtime you **must** uncomment the following + # constant and define a short and easy to understand explanation as to why the + # migration requires downtime. + # DOWNTIME_REASON = '' + + # When using the methods "add_concurrent_index", "remove_concurrent_index" or + # "add_column_with_default" you must disable the use of transactions + # as these methods can not run in an existing transaction. + # When using "add_concurrent_index" or "remove_concurrent_index" methods make sure + # that either of them is the _only_ method called in the migration, + # any other changes should go in a separate migration. + # This ensures that upon failure _only_ the index creation or removing fails + # and can be retried or reverted easily. + # + # To disable transactions uncomment the following line and remove these + # comments: + # disable_ddl_transaction! + + def change + add_column :application_settings, :elt_database_dump_enabled, :boolean + end +end diff --git a/ee/spec/lib/gitlab/pseudonymity_spec.rb b/ee/spec/lib/gitlab/pseudonymity_spec.rb index f5533962d03e..82e74aa33c8e 100644 --- a/ee/spec/lib/gitlab/pseudonymity_spec.rb +++ b/ee/spec/lib/gitlab/pseudonymity_spec.rb @@ -26,8 +26,6 @@ project_table_file = pseudo.tables_to_csv[0] # Ignore the `.` and `..` in the directory. - entry = Dir.entries(base_dir)[2] - expect(project_table_file.include? "projects_").to be true expect(project_table_file.include? ".csv").to be true columns = [] -- GitLab From 1595907ae16ffa8bee18b4716f02357d64a5cf1e Mon Sep 17 00:00:00 2001 From: Jacob Schatz Date: Fri, 1 Jun 2018 12:41:05 -0400 Subject: [PATCH 20/63] Remove db schema --- db/schema.rb | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/db/schema.rb b/db/schema.rb index ff7756d7ffc6..d1955f910ee1 100644 --- a/db/schema.rb +++ b/db/schema.rb @@ -206,7 +206,6 @@ t.string "encrypted_external_auth_client_key_pass_iv" t.string "email_additional_text" t.boolean "enforce_terms", default: false - t.boolean "elt_database_dump_enabled" end create_table "approvals", force: :cascade do |t| @@ -1632,7 +1631,6 @@ t.text "title_html" t.text "description_html" t.integer "time_estimate" - t.boolean "squash", default: false, null: false t.integer "cached_markdown_version" t.datetime "last_edited_at" t.integer "last_edited_by_id" @@ -2022,9 +2020,9 @@ t.datetime "next_execution_timestamp" t.string "status" t.string "jid" - t.text "last_error" t.datetime_with_timezone "last_update_at" t.datetime_with_timezone "last_successful_update_at" + t.text "last_error" end add_index "project_mirror_data", ["jid"], name: "index_project_mirror_data_on_jid", using: :btree @@ -2279,7 +2277,6 @@ end add_index "redirect_routes", ["path"], name: "index_redirect_routes_on_path", unique: true, using: :btree - add_index "redirect_routes", ["path"], name: "index_redirect_routes_on_path_text_pattern_ops", using: :btree, opclasses: {"path"=>"varchar_pattern_ops"} add_index "redirect_routes", ["source_type", "source_id"], name: "index_redirect_routes_on_source_type_and_source_id", using: :btree create_table "releases", force: :cascade do |t| -- GitLab From 0f52db38830ea6db892268a20050b0fbf6b0a134 Mon Sep 17 00:00:00 2001 From: Jacob Schatz Date: Fri, 1 Jun 2018 15:11:16 -0400 Subject: [PATCH 21/63] Updated based on feedback. --- .../_elt_database_cron_job.html.haml | 16 ++++++++------ .../admin/application_settings/show.html.haml | 21 ++++++++++--------- config/initializers/1_settings.rb | 2 +- .../helpers/ee/application_settings_helper.rb | 16 ++++++++++++++ ee/app/models/ee/application_setting.rb | 8 +++++-- ee/app/models/license.rb | 3 ++- ee/lib/assets/pseudonymity_dump.yml | 2 -- lib/tasks/gitlab/db.rake | 4 ++++ 8 files changed, 50 insertions(+), 22 deletions(-) diff --git a/app/views/admin/application_settings/_elt_database_cron_job.html.haml b/app/views/admin/application_settings/_elt_database_cron_job.html.haml index d7c8d5ec04cc..4d70988dd5cd 100644 --- a/app/views/admin/application_settings/_elt_database_cron_job.html.haml +++ b/app/views/admin/application_settings/_elt_database_cron_job.html.haml @@ -4,16 +4,20 @@ %fieldset .form-group.row .offset-sm-2.col-sm-10 - - can_be_configured = @application_setting.elt_database_dump_can_be_configured? + - is_enabled = @application_setting.elt_database_dump_enabled? + - is_available = @application_setting.elt_database_dump_available? .form-check = f.label :elt_database_dump_enabled do - = f.check_box :elt_database_dump_enabled, disabled: !can_be_configured - Enable ELT Database Cron Job + = f.check_box :elt_database_dump_enabled, disabled: !is_available + Enable Meltano Database Cron Job .form-text.text-muted - - if can_be_configured - GitLab will run a cron job which will send pseudoanonymized data to be processed and analyzed. + - if is_enabled + = meltano_elt_description_text - else - The ELT database cron job is disabled. When enabled the cron job will send pseudoanonymized data to be processed and analyzed. + - if is_available + = meltano_elt_disabled_description_text + - else + = meltano_elt_unavailable_description_text = f.submit 'Save changes', class: "btn btn-success" diff --git a/app/views/admin/application_settings/show.html.haml b/app/views/admin/application_settings/show.html.haml index c9f374897f24..2beadd64e3d6 100644 --- a/app/views/admin/application_settings/show.html.haml +++ b/app/views/admin/application_settings/show.html.haml @@ -237,16 +237,17 @@ .settings-content = render 'usage' -%section.settings.as-usage.no-animate#js-elt-database-dump-settings{ class: ('expanded' if expanded) } - .settings-header - %h4 - = _('ELT Database Cron Job') - %button.btn.btn-default.js-settings-toggle{ type: 'button' } - = expanded ? _('Collapse') : _('Expand') - %p - = _('Enable or disable ELT Database Cron Job.') - .settings-content - = render 'elt_database_cron_job' +- if meltano_elt_database_dump_enabled? + %section.settings.as-usage.no-animate#js-elt-database-dump-settings{ class: ('expanded' if expanded) } + .settings-header + %h4 + = _('Meltano ELT Database Cron Job') + %button.btn.btn-default.js-settings-toggle{ type: 'button' } + = expanded ? _('Collapse') : _('Expand') + %p + = _('Enable or disable Meltano ELT Database Cron Job.') + .settings-content + = render 'elt_database_cron_job' %section.settings.as-email.no-animate#js-email-settings{ class: ('expanded' if expanded) } .settings-header diff --git a/config/initializers/1_settings.rb b/config/initializers/1_settings.rb index 80d6c4f19f99..ca0268031126 100644 --- a/config/initializers/1_settings.rb +++ b/config/initializers/1_settings.rb @@ -162,7 +162,7 @@ Settings.gitlab['trusted_proxies'] ||= [] Settings.gitlab['no_todos_messages'] ||= YAML.load_file(Rails.root.join('config', 'no_todos_messages.yml')) Settings.gitlab['usage_ping_enabled'] = true if Settings.gitlab['usage_ping_enabled'].nil? -Settings.gitlab['elt_database_dump_enabled'] = true if Settings.gitlab['elt_database_dump_enabled'].nil? +Settings.gitlab['elt_database_dump_enabled'] = false if Settings.gitlab['elt_database_dump_enabled'].nil? # # Elasticseacrh diff --git a/ee/app/helpers/ee/application_settings_helper.rb b/ee/app/helpers/ee/application_settings_helper.rb index c672d32ef6ab..b54a7f8cf60f 100644 --- a/ee/app/helpers/ee/application_settings_helper.rb +++ b/ee/app/helpers/ee/application_settings_helper.rb @@ -35,6 +35,22 @@ def external_authorization_client_pass_help_text "and the value is encrypted at rest.") end + def meltano_elt_database_dump_enabled? + return License.feature_available? :meltano_elt_database_dump + end + + def meltano_elt_description_text + _("GitLab will run the Meltano ELT cron job which will send pseudoanonymized data to be processed and analyzed.") + end + + def meltano_elt_disabled_description_text + _("The Meltano ELT database cron job is disabled. When enabled the cron job will send pseudoanonymized data to be processed and analyzed.") + end + + def meltano_elt_unavailable_description_text + _("The Meltano ELT database cron job is disabled. Once enabled, the cron job will send pseudoanonymized data to be processed and analyzed.") + end + override :visible_attributes def visible_attributes super + [ diff --git a/ee/app/models/ee/application_setting.rb b/ee/app/models/ee/application_setting.rb index 5c9a1e04640d..94041b6d2ee5 100644 --- a/ee/app/models/ee/application_setting.rb +++ b/ee/app/models/ee/application_setting.rb @@ -106,11 +106,15 @@ def defaults end end + def elt_database_dump_available? + License.feature_available? :meltano_elt_database_dump + end + def elt_database_dump_can_be_configured? - Settings.gitlab.elt_database_dump_enabled + Settings.gitlab.elt_database_dump_enabled && License.feature_available?(:meltano_elt_database_dump) end - def elt_database_dump_enabled + def elt_database_dump_enabled? elt_database_dump_can_be_configured? && super end diff --git a/ee/app/models/license.rb b/ee/app/models/license.rb index 2572e12c6cfb..7e619aee9090 100644 --- a/ee/app/models/license.rb +++ b/ee/app/models/license.rb @@ -30,7 +30,8 @@ class License < ActiveRecord::Base related_issues repository_mirrors repository_size_limit - scoped_issue_board + scoped_issue_board, + meltano_elt_database_dump ].freeze EEP_FEATURES = EES_FEATURES + %i[ diff --git a/ee/lib/assets/pseudonymity_dump.yml b/ee/lib/assets/pseudonymity_dump.yml index 85394f70e41a..0c4e5303db92 100644 --- a/ee/lib/assets/pseudonymity_dump.yml +++ b/ee/lib/assets/pseudonymity_dump.yml @@ -1,5 +1,3 @@ -output: - csv: '/tmp/' tables: approvals: whitelist: diff --git a/lib/tasks/gitlab/db.rake b/lib/tasks/gitlab/db.rake index d261a59900cb..6af87fbd123e 100644 --- a/lib/tasks/gitlab/db.rake +++ b/lib/tasks/gitlab/db.rake @@ -46,6 +46,10 @@ namespace :gitlab do desc 'Configures the database by running migrate, or by loading the schema and seeding if needed' task configure: :environment do + unless License.feature_available? :meltano_elt_database_dump + raise "The Meltano ELT extract is not available with this license." + end + if ActiveRecord::Base.connection.tables.any? Rake::Task['db:migrate'].invoke else -- GitLab From 51da573617bf9ef6ab7fbd9da6a0b4a4b63bddf0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mica=C3=ABl=20Bergeron?= Date: Mon, 4 Jun 2018 16:26:14 -0400 Subject: [PATCH 22/63] adds the object storage configuration --- app/workers/gitlab_elt_data_dump_worker.rb | 4 +- config/gitlab.yml.example | 26 +++++++ config/initializers/1_settings.rb | 10 ++- .../pseudonymity/manifest.yml | 29 +------- {ee/lib => lib}/pseudonymity/table.rb | 18 ++--- lib/pseudonymity/upload_service.rb | 73 +++++++++++++++++++ lib/tasks/gitlab/db.rake | 15 ++++ 7 files changed, 135 insertions(+), 40 deletions(-) rename ee/lib/assets/pseudonymity_dump.yml => lib/pseudonymity/manifest.yml (97%) rename {ee/lib => lib}/pseudonymity/table.rb (89%) create mode 100644 lib/pseudonymity/upload_service.rb diff --git a/app/workers/gitlab_elt_data_dump_worker.rb b/app/workers/gitlab_elt_data_dump_worker.rb index ad1662d7e51f..2be5d8c6c843 100644 --- a/app/workers/gitlab_elt_data_dump_worker.rb +++ b/app/workers/gitlab_elt_data_dump_worker.rb @@ -1,10 +1,10 @@ -class GitlabELTDataDumpWorker +class GitlabEltDataDumpWorker include ApplicationWorker include CronjobQueue def perform return unless Gitlab::CurrentSettings.elt_database_dump_enabled - + Pseudonymity::Table.new.tables_to_csv end end diff --git a/config/gitlab.yml.example b/config/gitlab.yml.example index fc067eb96c33..0646467aafe4 100644 --- a/config/gitlab.yml.example +++ b/config/gitlab.yml.example @@ -726,6 +726,21 @@ production: &base # # Specifies Amazon S3 storage class to use for backups, this is optional # # storage_class: 'STANDARD' + ## Pseudonym exporter + pseudonymizer: + # Tables manifest that specifies the fields to extract and pseudonymize. + # TODO: link to meltano configuration? + manifest: config/pseudonymizer.yml + upload: + # Fog storage connection settings, see http://fog.io/storage/ . + connection: + # provider: AWS + # region: eu-west-1 + # aws_access_key_id: AKIAKIAKI + # aws_secret_access_key: 'secret123' + # # The remote 'directory' to store the CSV files. For S3, this would be the bucket name. + # remote_directory: 'gitlab-elt' + ## GitLab Shell settings gitlab_shell: path: /home/git/gitlab-shell/ @@ -876,6 +891,17 @@ test: token: secret backup: path: tmp/tests/backups + pseudonymizer: + manifest: config/pseudonymizer.test.yml + upload: + # The remote 'directory' to store the CSV files. For S3, this would be the bucket name. + remote_directory: gitlab-elt.test + # Fog storage connection settings, see http://fog.io/storage/ + connection: + provider: AWS + region: us-east-1 + aws_access_key_id: AWS_ACCESS_KEY_ID + aws_secret_access_key: AWS_SECRET_ACCESS_KEY gitlab_shell: path: tmp/tests/gitlab-shell/ hooks_path: tmp/tests/gitlab-shell/hooks/ diff --git a/config/initializers/1_settings.rb b/config/initializers/1_settings.rb index ca0268031126..cc0e3bd17511 100644 --- a/config/initializers/1_settings.rb +++ b/config/initializers/1_settings.rb @@ -373,7 +373,7 @@ Settings.cron_jobs['gitlab_elt_database_dump'] ||= Settingslogic.new({}) Settings.cron_jobs['gitlab_elt_database_dump']['cron'] ||= '0 23 * * *'; -Settings.cron_jobs['gitlab_elt_database_dump']['job_class'] ||= 'GitlabELTDataDumpWorker'; +Settings.cron_jobs['gitlab_elt_database_dump']['job_class'] ||= 'GitlabEltDataDumpWorker'; Settings.cron_jobs['schedule_update_user_activity_worker'] ||= Settingslogic.new({}) Settings.cron_jobs['schedule_update_user_activity_worker']['cron'] ||= '30 0 * * *' @@ -475,6 +475,14 @@ Settings.backup['upload']['encryption'] ||= nil Settings.backup['upload']['storage_class'] ||= nil +# +# Pseudonymizer +# +Settings['pseudonymizer'] ||= Settingslogic.new({}) +Settings.pseudonymizer['manifest'] = Settings.pseudonymizer['manifest'] || "lib/pseudonymity/manifest.yml" +Settings.pseudonymizer['upload'] ||= Settingslogic.new({ 'remote_directory' => nil, 'connection' => nil }) +# Settings.pseudonymizer['upload']['multipart_chunk_size'] ||= 104857600 + # # Git # diff --git a/ee/lib/assets/pseudonymity_dump.yml b/lib/pseudonymity/manifest.yml similarity index 97% rename from ee/lib/assets/pseudonymity_dump.yml rename to lib/pseudonymity/manifest.yml index 0c4e5303db92..9ff62ab0106e 100644 --- a/ee/lib/assets/pseudonymity_dump.yml +++ b/lib/pseudonymity/manifest.yml @@ -156,7 +156,6 @@ tables: - last_edited_by_id - discussion_locked - closed_at - - closed_by_id pseudo: - id - title @@ -487,8 +486,6 @@ tables: - merge_merge_request - failed_pipeline - success_pipeline - - push_to_merge_request - - issue_due pseudo: - id - user_id @@ -509,8 +506,6 @@ tables: - merge_merge_request - failed_pipeline - success_pipeline - - push_to_merge_request - - issue_due project_authorizations: whitelist: - user_id @@ -535,15 +530,6 @@ tables: - updated_at - enabled - domain - project_ci_cd_settings: - whitelist: - - id - - project_id - - group_runners_enabled - pseudo: - - id - - project_id - - group_runners_enabled project_custom_attributes: whitelist: - id @@ -559,17 +545,6 @@ tables: - project_id - key - value - project_deploy_tokens: - whitelist: - - id - - project_id - - deploy_token_id - - created_at - pseudo: - - id - - project_id - - deploy_token_id - - created_at project_features: whitelist: - id @@ -750,7 +725,6 @@ tables: - mirror_overwrites_diverged_branches - external_authorization_classification_label - external_webhook_token - - pages_https_only pseudo: - id - name @@ -820,7 +794,6 @@ tables: - mirror_overwrites_diverged_branches - external_authorization_classification_label - external_webhook_token - - pages_https_only subscriptions: whitelist: - id @@ -932,4 +905,4 @@ tables: - two_factor_grace_period - ghost - rss_token - - theme_id \ No newline at end of file + - theme_id diff --git a/ee/lib/pseudonymity/table.rb b/lib/pseudonymity/table.rb similarity index 89% rename from ee/lib/pseudonymity/table.rb rename to lib/pseudonymity/table.rb index cfa09542fb40..4c99bcd68bde 100644 --- a/ee/lib/pseudonymity/table.rb +++ b/lib/pseudonymity/table.rb @@ -27,27 +27,26 @@ def anonymize(results) class Table attr_accessor :config + attr_accessor :output_dir def initialize - @config = {} - @csv_output = "" - parse_config + @config = parse_config + @output_dir = "" @schema = {} @output_files = [] end def tables_to_csv tables = config["tables"] - @csv_output = config["output"]["csv"] - unless File.directory?(@csv_output) - raise "No such directory #{@csv_output}" - end + @output_dir = File.join("/tmp/", SecureRandom.hex) + Dir.mkdir(@output_dir) unless File.directory?(@output_dir) new_tables = tables.map do |k, v| @schema[k] = {} table_to_csv(k, v["whitelist"], v["pseudo"]) end + schema_to_yml file_list_to_json new_tables @@ -57,7 +56,7 @@ def get_and_log_file_name(ext, prefix = nil, filename = nil) file_timestamp = filename || "#{prefix}_#{Time.now.to_i}" file_timestamp = "#{file_timestamp}.#{ext}" @output_files << file_timestamp - File.join(@csv_output, file_timestamp) + File.join(@output_dir, file_timestamp) end def schema_to_yml @@ -103,10 +102,11 @@ def set_schema_column_types(table, type_results) end def parse_config - @config = YAML.load_file(Rails.root.join('./ee/lib/assets/pseudonymity_dump.yml')) + YAML.load_file(Rails.root.join(Gitlab.config.pseudonymizer.manifest)) end def write_to_csv_file(title, contents) + Rails.logger.info "Writing #{title} ..." file_path = get_and_log_file_name("csv", title) column_names = contents.first.keys contents = CSV.generate do |csv| diff --git a/lib/pseudonymity/upload_service.rb b/lib/pseudonymity/upload_service.rb new file mode 100644 index 000000000000..3f13f13dbd65 --- /dev/null +++ b/lib/pseudonymity/upload_service.rb @@ -0,0 +1,73 @@ +module Pseudonymity + class UploadService + RemoteStorageUnavailableError = Class.new(StandardError) + + def initialize(output_dir, progress) + @progress = progress + @output_dir = output_dir + end + + def upload + progress.print "Uploading backup archive to remote storage #{remote_directory} ... " + + file_list.each do |file| + upload_file(file, remote_directory) + end + end + + def upload_file(file, directory) + progress.print "\tUploading #{file} ... " + if directory.files.create(key: File.basename(file), body: File.open(file), public: false) + progress.puts "done".color(:green) + else + puts "uploading CSV to #{remote_directory} failed".color(:red) + end + end + + def cleanup + progress.print "Deleting tmp directory #{@output_dir} ... " + return unless File.exist?(@output_dir) + + if FileUtils.rm_rf(@output_dir) + progress.puts "done".color(:green) + else + progress.puts "failed".color(:red) + end + end + + private + + def config + Gitlab.config.pseudonymizer + end + + def remote_directory + connection_settings = config.upload.connection + if connection_settings.blank? + progress.puts "Cannot upload files, make sure the `pseudonimizer.upload.connection` is set properly".color(:red) + raise RemoteStorageUnavailableError.new(connection_settings) + end + + connect_to_remote_directory(connection_settings) + end + + def connect_to_remote_directory(connection_settings) + # our settings use string keys, but Fog expects symbols + connection = ::Fog::Storage.new(connection_settings.symbolize_keys) + remote_dir = config.upload.remote_directory + + # We only attempt to create the directory for local backups. For AWS + # and other cloud providers, we cannot guarantee the user will have + # permission to create the bucket. + if connection.service == ::Fog::Storage::Local + connection.directories.create(key: remote_dir) + else + connection.directories.get(remote_dir) + end + end + + def file_list + Dir[File.join(@output_dir, "*.{csv,yml}")] + end + end +end diff --git a/lib/tasks/gitlab/db.rake b/lib/tasks/gitlab/db.rake index 6af87fbd123e..5febfc7f894e 100644 --- a/lib/tasks/gitlab/db.rake +++ b/lib/tasks/gitlab/db.rake @@ -78,6 +78,21 @@ namespace :gitlab do task pseudonymity_dump: :environment do table = Pseudonymity::Table.new table.tables_to_csv + + upload = Pseudonymity::UploadService.new(table.output_dir, progress) + upload.upload + upload.cleanup + end + + def progress + if ENV['CRON'] + # We need an object we can say 'puts' and 'print' to; let's use a + # StringIO. + require 'stringio' + StringIO.new + else + $stdout + end end end end -- GitLab From beed849dffae6801eb4601805d2a989e2e4848b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mica=C3=ABl=20Bergeron?= Date: Tue, 5 Jun 2018 16:51:49 +0000 Subject: [PATCH 23/63] wip: upload completed. --- app/workers/gitlab_elt_data_dump_worker.rb | 12 +- db/schema.rb | 6 +- lib/pseudonymity/options.rb | 19 +++ lib/pseudonymity/table.rb | 15 +-- lib/pseudonymity/upload_service.rb | 20 +-- lib/tasks/gitlab/db.rake | 9 +- rakes.patch | 126 ++++++++++++++++++ .../gitlab => spec/lib}/pseudonymity_spec.rb | 0 8 files changed, 184 insertions(+), 23 deletions(-) create mode 100644 lib/pseudonymity/options.rb create mode 100644 rakes.patch rename {ee/spec/lib/gitlab => spec/lib}/pseudonymity_spec.rb (100%) diff --git a/app/workers/gitlab_elt_data_dump_worker.rb b/app/workers/gitlab_elt_data_dump_worker.rb index 2be5d8c6c843..3cb80a8b1790 100644 --- a/app/workers/gitlab_elt_data_dump_worker.rb +++ b/app/workers/gitlab_elt_data_dump_worker.rb @@ -5,6 +5,16 @@ class GitlabEltDataDumpWorker def perform return unless Gitlab::CurrentSettings.elt_database_dump_enabled - Pseudonymity::Table.new.tables_to_csv + options = Pseudonymity::Options.new( + config: YAML.load_file(Rails.root.join(Gitlab.config.pseudonymizer.manifest)), + start_at: Time.now.utc + ) + + table = Pseudonymity::Table.new(options) + table.tables_to_csv + + upload = Pseudonymity::UploadService.new(options) + upload.upload + upload.cleanup end end diff --git a/db/schema.rb b/db/schema.rb index d1955f910ee1..f666fca74167 100644 --- a/db/schema.rb +++ b/db/schema.rb @@ -197,8 +197,8 @@ t.string "external_authorization_service_url" t.string "external_authorization_service_default_label" t.boolean "pages_domain_verification_enabled", default: true, null: false - t.float "external_authorization_service_timeout", default: 0.5, null: false t.boolean "allow_local_requests_from_hooks_and_services", default: false, null: false + t.float "external_authorization_service_timeout", default: 0.5 t.text "external_auth_client_cert" t.text "encrypted_external_auth_client_key" t.string "encrypted_external_auth_client_key_iv" @@ -206,6 +206,7 @@ t.string "encrypted_external_auth_client_key_pass_iv" t.string "email_additional_text" t.boolean "enforce_terms", default: false + t.boolean "elt_database_dump_enabled" end create_table "approvals", force: :cascade do |t| @@ -1631,6 +1632,7 @@ t.text "title_html" t.text "description_html" t.integer "time_estimate" + t.boolean "squash", default: false, null: false t.integer "cached_markdown_version" t.datetime "last_edited_at" t.integer "last_edited_by_id" @@ -2020,9 +2022,9 @@ t.datetime "next_execution_timestamp" t.string "status" t.string "jid" + t.text "last_error" t.datetime_with_timezone "last_update_at" t.datetime_with_timezone "last_successful_update_at" - t.text "last_error" end add_index "project_mirror_data", ["jid"], name: "index_project_mirror_data_on_jid", using: :btree diff --git a/lib/pseudonymity/options.rb b/lib/pseudonymity/options.rb new file mode 100644 index 000000000000..e2fbeb504009 --- /dev/null +++ b/lib/pseudonymity/options.rb @@ -0,0 +1,19 @@ +module Pseudonymity + class Options + attr_reader :config + attr_reader :start_at + + def initialize(config: {}, start_at: Time.now.utc) + @config = config + @start_at = start_at + end + + def output_dir + File.join('/tmp', 'gitlab-pseudonymizer', self.start_at.iso8601) + end + + def upload_dir + File.join(self.start_at.iso8601) + end + end +end diff --git a/lib/pseudonymity/table.rb b/lib/pseudonymity/table.rb index 4c99bcd68bde..4f70b65a7126 100644 --- a/lib/pseudonymity/table.rb +++ b/lib/pseudonymity/table.rb @@ -27,11 +27,11 @@ def anonymize(results) class Table attr_accessor :config - attr_accessor :output_dir - def initialize - @config = parse_config - @output_dir = "" + def initialize(options) + @config = options.config + @output_dir = options.output_dir + @schema = {} @output_files = [] end @@ -39,8 +39,7 @@ def initialize def tables_to_csv tables = config["tables"] - @output_dir = File.join("/tmp/", SecureRandom.hex) - Dir.mkdir(@output_dir) unless File.directory?(@output_dir) + FileUtils.mkdir_p(@output_dir) unless File.directory?(@output_dir) new_tables = tables.map do |k, v| @schema[k] = {} @@ -101,10 +100,6 @@ def set_schema_column_types(table, type_results) @schema[table]["gl_mapping_key"] = "id" end - def parse_config - YAML.load_file(Rails.root.join(Gitlab.config.pseudonymizer.manifest)) - end - def write_to_csv_file(title, contents) Rails.logger.info "Writing #{title} ..." file_path = get_and_log_file_name("csv", title) diff --git a/lib/pseudonymity/upload_service.rb b/lib/pseudonymity/upload_service.rb index 3f13f13dbd65..8f621c2f9bb4 100644 --- a/lib/pseudonymity/upload_service.rb +++ b/lib/pseudonymity/upload_service.rb @@ -2,13 +2,14 @@ module Pseudonymity class UploadService RemoteStorageUnavailableError = Class.new(StandardError) - def initialize(output_dir, progress) - @progress = progress - @output_dir = output_dir + def initialize(options, progress = nil) + @progress = progress || $stdout + @output_dir = options.output_dir + @upload_dir = options.upload_dir end def upload - progress.print "Uploading backup archive to remote storage #{remote_directory} ... " + progress.puts "Uploading output files to remote storage #{remote_directory} ... " file_list.each do |file| upload_file(file, remote_directory) @@ -16,11 +17,14 @@ def upload end def upload_file(file, directory) - progress.print "\tUploading #{file} ... " - if directory.files.create(key: File.basename(file), body: File.open(file), public: false) + progress.print "\t#{file} ... " + + if directory.files.create(key: File.join(@upload_dir, File.basename(file)), + body: File.open(file), + public: false) progress.puts "done".color(:green) else - puts "uploading CSV to #{remote_directory} failed".color(:red) + progress.puts "uploading CSV to #{remote_directory} failed".color(:red) end end @@ -67,7 +71,7 @@ def connect_to_remote_directory(connection_settings) end def file_list - Dir[File.join(@output_dir, "*.{csv,yml}")] + Dir[File.join(@output_dir, "*")] end end end diff --git a/lib/tasks/gitlab/db.rake b/lib/tasks/gitlab/db.rake index 5febfc7f894e..ba972120a1d9 100644 --- a/lib/tasks/gitlab/db.rake +++ b/lib/tasks/gitlab/db.rake @@ -76,10 +76,15 @@ namespace :gitlab do desc 'Output pseudonymity dump of selected tables' task pseudonymity_dump: :environment do - table = Pseudonymity::Table.new + options = Pseudonymity::Options.new( + config: YAML.load_file(Rails.root.join(Gitlab.config.pseudonymizer.manifest)), + start_at: Time.now.utc + ) + + table = Pseudonymity::Table.new(options) table.tables_to_csv - upload = Pseudonymity::UploadService.new(table.output_dir, progress) + upload = Pseudonymity::UploadService.new(options, progress) upload.upload upload.cleanup end diff --git a/rakes.patch b/rakes.patch new file mode 100644 index 000000000000..a587beb2a07c --- /dev/null +++ b/rakes.patch @@ -0,0 +1,126 @@ +diff --git a/lib/tasks/gitlab/artifacts/migrate.rake b/lib/tasks/gitlab/artifacts/migrate.rake +new file mode 100644 +index 00000000000..bfca4bfb3f7 +--- /dev/null ++++ b/lib/tasks/gitlab/artifacts/migrate.rake +@@ -0,0 +1,25 @@ ++require 'logger' ++require 'resolv-replace' ++ ++desc "GitLab | Migrate files for artifacts to comply with new storage format" ++namespace :gitlab do ++ namespace :artifacts do ++ task migrate: :environment do ++ logger = Logger.new(STDOUT) ++ logger.info('Starting transfer of artifacts') ++ ++ Ci::Build.joins(:project) ++ .with_artifacts_stored_locally ++ .find_each(batch_size: 10) do |build| ++ begin ++ build.artifacts_file.migrate!(ObjectStorage::Store::REMOTE) ++ build.artifacts_metadata.migrate!(ObjectStorage::Store::REMOTE) ++ ++ logger.info("Transferred artifacts of #{build.id} of #{build.artifacts_size} to object storage") ++ rescue => e ++ logger.error("Failed to transfer artifacts of #{build.id} with error: #{e.message}") ++ end ++ end ++ end ++ end ++end +diff --git a/lib/tasks/gitlab/exclusive_lease.rake b/lib/tasks/gitlab/exclusive_lease.rake +new file mode 100644 +index 00000000000..83722bf6d94 +--- /dev/null ++++ b/lib/tasks/gitlab/exclusive_lease.rake +@@ -0,0 +1,9 @@ ++namespace :gitlab do ++ namespace :exclusive_lease do ++ desc 'GitLab | Clear existing exclusive leases for specified scope (default: *)' ++ task :clear, [:scope] => [:environment] do |_, args| ++ args[:scope].nil? ? Gitlab::ExclusiveLease.reset_all! : Gitlab::ExclusiveLease.reset_all!(args[:scope]) ++ puts 'All exclusive lease entries were removed.' ++ end ++ end ++end +diff --git a/lib/tasks/gitlab/lfs/migrate.rake b/lib/tasks/gitlab/lfs/migrate.rake +new file mode 100644 +index 00000000000..a45e5ca91e0 +--- /dev/null ++++ b/lib/tasks/gitlab/lfs/migrate.rake +@@ -0,0 +1,22 @@ ++require 'logger' ++ ++desc "GitLab | Migrate LFS objects to remote storage" ++namespace :gitlab do ++ namespace :lfs do ++ task migrate: :environment do ++ logger = Logger.new(STDOUT) ++ logger.info('Starting transfer of LFS files to object storage') ++ ++ LfsObject.with_files_stored_locally ++ .find_each(batch_size: 10) do |lfs_object| ++ begin ++ lfs_object.file.migrate!(LfsObjectUploader::Store::REMOTE) ++ ++ logger.info("Transferred LFS object #{lfs_object.oid} of size #{lfs_object.size.to_i.bytes} to object storage") ++ rescue => e ++ logger.error("Failed to transfer LFS object #{lfs_object.oid} with error: #{e.message}") ++ end ++ end ++ end ++ end ++end +diff --git a/lib/tasks/gitlab/shell.rake b/lib/tasks/gitlab/shell.rake +index 1ce2eedb89c..56f3a916c87 100644 +--- a/lib/tasks/gitlab/shell.rake ++++ b/lib/tasks/gitlab/shell.rake +@@ -69,7 +69,7 @@ namespace :gitlab do + if File.exist?(path_to_repo) + print '-' + else +- if Gitlab::Shell.new.add_repository(project.repository_storage, ++ if Gitlab::Shell.new.create_repository(project.repository_storage, + project.disk_path) + print '.' + else +diff --git a/lib/tasks/gitlab/uploads/migrate.rake b/lib/tasks/gitlab/uploads/migrate.rake +new file mode 100644 +index 00000000000..c26c3ccb3be +--- /dev/null ++++ b/lib/tasks/gitlab/uploads/migrate.rake +@@ -0,0 +1,33 @@ ++namespace :gitlab do ++ namespace :uploads do ++ desc 'GitLab | Uploads | Migrate the uploaded files to object storage' ++ task :migrate, [:uploader_class, :model_class, :mounted_as] => :environment do |task, args| ++ batch_size = ENV.fetch('BATCH', 200).to_i ++ @to_store = ObjectStorage::Store::REMOTE ++ @mounted_as = args.mounted_as&.gsub(':', '')&.to_sym ++ @uploader_class = args.uploader_class.constantize ++ @model_class = args.model_class.constantize ++ ++ uploads.each_batch(of: batch_size, &method(:enqueue_batch)) # rubocop: disable Cop/InBatches ++ end ++ ++ def enqueue_batch(batch, index) ++ job = ObjectStorage::MigrateUploadsWorker.enqueue!(batch, ++ @mounted_as, ++ @to_store) ++ puts "Enqueued job ##{index}: #{job}" ++ rescue ObjectStorage::MigrateUploadsWorker::SanityCheckError => e ++ # continue for the next batch ++ puts "Could not enqueue batch (#{batch.ids}) #{e.message}".color(:red) ++ end ++ ++ def uploads ++ Upload.class_eval { include EachBatch } unless Upload < EachBatch ++ ++ Upload ++ .where.not(store: @to_store) ++ .where(uploader: @uploader_class.to_s, ++ model_type: @model_class.base_class.sti_name) ++ end ++ end ++end diff --git a/ee/spec/lib/gitlab/pseudonymity_spec.rb b/spec/lib/pseudonymity_spec.rb similarity index 100% rename from ee/spec/lib/gitlab/pseudonymity_spec.rb rename to spec/lib/pseudonymity_spec.rb -- GitLab From 97be319ff585ab9001b92ad23247ac65beaaa823 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mica=C3=ABl=20Bergeron?= Date: Wed, 6 Jun 2018 09:03:27 -0400 Subject: [PATCH 24/63] reworking the configuration --- .../_elt_database_cron_job.html.haml | 23 ------------------- .../admin/application_settings/show.html.haml | 8 +++---- ...dump_worker.rb => pseudonymizer_worker.rb} | 4 ++-- config/gitlab.yml.example | 16 ++++++++----- config/initializers/1_settings.rb | 2 +- db/schema.rb | 3 ++- .../helpers/ee/application_settings_helper.rb | 16 +++++-------- ee/app/models/ee/application_setting.rb | 14 +++++------ ee/app/models/license.rb | 4 ++-- ...lt_dump_enabled_to_application_settings.rb | 2 +- lib/tasks/gitlab/db.rake | 12 ++++++---- 11 files changed, 42 insertions(+), 62 deletions(-) delete mode 100644 app/views/admin/application_settings/_elt_database_cron_job.html.haml rename app/workers/{gitlab_elt_data_dump_worker.rb => pseudonymizer_worker.rb} (80%) diff --git a/app/views/admin/application_settings/_elt_database_cron_job.html.haml b/app/views/admin/application_settings/_elt_database_cron_job.html.haml deleted file mode 100644 index 4d70988dd5cd..000000000000 --- a/app/views/admin/application_settings/_elt_database_cron_job.html.haml +++ /dev/null @@ -1,23 +0,0 @@ -= form_for @application_setting, url: admin_application_settings_path, html: { class: 'fieldset-form' } do |f| - = form_errors(@application_setting) - - %fieldset - .form-group.row - .offset-sm-2.col-sm-10 - - is_enabled = @application_setting.elt_database_dump_enabled? - - is_available = @application_setting.elt_database_dump_available? - .form-check - = f.label :elt_database_dump_enabled do - = f.check_box :elt_database_dump_enabled, disabled: !is_available - Enable Meltano Database Cron Job - .form-text.text-muted - - if is_enabled - = meltano_elt_description_text - - else - - if is_available - = meltano_elt_disabled_description_text - - else - = meltano_elt_unavailable_description_text - - = f.submit 'Save changes', class: "btn btn-success" - diff --git a/app/views/admin/application_settings/show.html.haml b/app/views/admin/application_settings/show.html.haml index 2beadd64e3d6..22246f2aa5f4 100644 --- a/app/views/admin/application_settings/show.html.haml +++ b/app/views/admin/application_settings/show.html.haml @@ -237,17 +237,17 @@ .settings-content = render 'usage' -- if meltano_elt_database_dump_enabled? +- if License.feature_available?(:pseudonymizer) %section.settings.as-usage.no-animate#js-elt-database-dump-settings{ class: ('expanded' if expanded) } .settings-header %h4 - = _('Meltano ELT Database Cron Job') + = _('Pseudonymizer Cron Job') %button.btn.btn-default.js-settings-toggle{ type: 'button' } = expanded ? _('Collapse') : _('Expand') %p - = _('Enable or disable Meltano ELT Database Cron Job.') + = _('Enable or disable the Pseudonymizer Cron Job.') .settings-content - = render 'elt_database_cron_job' + = render 'pseudonymizer' %section.settings.as-email.no-animate#js-email-settings{ class: ('expanded' if expanded) } .settings-header diff --git a/app/workers/gitlab_elt_data_dump_worker.rb b/app/workers/pseudonymizer_worker.rb similarity index 80% rename from app/workers/gitlab_elt_data_dump_worker.rb rename to app/workers/pseudonymizer_worker.rb index 3cb80a8b1790..6296e4c12d44 100644 --- a/app/workers/gitlab_elt_data_dump_worker.rb +++ b/app/workers/pseudonymizer_worker.rb @@ -1,9 +1,9 @@ -class GitlabEltDataDumpWorker +class PseudonymizerWorker include ApplicationWorker include CronjobQueue def perform - return unless Gitlab::CurrentSettings.elt_database_dump_enabled + return unless Gitlab::CurrentSettings.pseudonymizer_enabled? options = Pseudonymity::Options.new( config: YAML.load_file(Rails.root.join(Gitlab.config.pseudonymizer.manifest)), diff --git a/config/gitlab.yml.example b/config/gitlab.yml.example index 0646467aafe4..dd39534b5890 100644 --- a/config/gitlab.yml.example +++ b/config/gitlab.yml.example @@ -263,7 +263,7 @@ production: &base # Remove outdated repository archives repository_archive_cache_worker: cron: "0 * * * *" - + # Verify custom GitLab Pages domains pages_domain_verification_cron_worker: cron: "*/15 * * * *" @@ -311,6 +311,9 @@ production: &base geo_migrated_local_files_clean_up_worker: cron: "15 */6 * * *" + pseudonymizer_worker: + cron: "0 * * * *" + registry: # enabled: true # host: registry.example.com @@ -726,11 +729,12 @@ production: &base # # Specifies Amazon S3 storage class to use for backups, this is optional # # storage_class: 'STANDARD' - ## Pseudonym exporter + ## Pseudonymizer exporter pseudonymizer: + enabled: false # Tables manifest that specifies the fields to extract and pseudonymize. - # TODO: link to meltano configuration? manifest: config/pseudonymizer.yml + # remote_directory: 'gitlab-elt' upload: # Fog storage connection settings, see http://fog.io/storage/ . connection: @@ -739,7 +743,6 @@ production: &base # aws_access_key_id: AKIAKIAKI # aws_secret_access_key: 'secret123' # # The remote 'directory' to store the CSV files. For S3, this would be the bucket name. - # remote_directory: 'gitlab-elt' ## GitLab Shell settings gitlab_shell: @@ -892,6 +895,7 @@ test: backup: path: tmp/tests/backups pseudonymizer: + enabled: false manifest: config/pseudonymizer.test.yml upload: # The remote 'directory' to store the CSV files. For S3, this would be the bucket name. @@ -900,8 +904,8 @@ test: connection: provider: AWS region: us-east-1 - aws_access_key_id: AWS_ACCESS_KEY_ID - aws_secret_access_key: AWS_SECRET_ACCESS_KEY + aws_access_key_id: minio + aws_secret_access_key: gdk-minio gitlab_shell: path: tmp/tests/gitlab-shell/ hooks_path: tmp/tests/gitlab-shell/hooks/ diff --git a/config/initializers/1_settings.rb b/config/initializers/1_settings.rb index cc0e3bd17511..f8b4f4408df3 100644 --- a/config/initializers/1_settings.rb +++ b/config/initializers/1_settings.rb @@ -162,7 +162,6 @@ Settings.gitlab['trusted_proxies'] ||= [] Settings.gitlab['no_todos_messages'] ||= YAML.load_file(Rails.root.join('config', 'no_todos_messages.yml')) Settings.gitlab['usage_ping_enabled'] = true if Settings.gitlab['usage_ping_enabled'].nil? -Settings.gitlab['elt_database_dump_enabled'] = false if Settings.gitlab['elt_database_dump_enabled'].nil? # # Elasticseacrh @@ -479,6 +478,7 @@ # Pseudonymizer # Settings['pseudonymizer'] ||= Settingslogic.new({}) +Settings.pseudonymizer['enabled'] = false if Settings.pseudonymizer['enabled'].nil? Settings.pseudonymizer['manifest'] = Settings.pseudonymizer['manifest'] || "lib/pseudonymity/manifest.yml" Settings.pseudonymizer['upload'] ||= Settingslogic.new({ 'remote_directory' => nil, 'connection' => nil }) # Settings.pseudonymizer['upload']['multipart_chunk_size'] ||= 104857600 diff --git a/db/schema.rb b/db/schema.rb index f666fca74167..c1b871a90e2f 100644 --- a/db/schema.rb +++ b/db/schema.rb @@ -206,7 +206,7 @@ t.string "encrypted_external_auth_client_key_pass_iv" t.string "email_additional_text" t.boolean "enforce_terms", default: false - t.boolean "elt_database_dump_enabled" + t.boolean "pseudonymizer_enabled" end create_table "approvals", force: :cascade do |t| @@ -2279,6 +2279,7 @@ end add_index "redirect_routes", ["path"], name: "index_redirect_routes_on_path", unique: true, using: :btree + add_index "redirect_routes", ["path"], name: "index_redirect_routes_on_path_text_pattern_ops", using: :btree, opclasses: {"path"=>"varchar_pattern_ops"} add_index "redirect_routes", ["source_type", "source_id"], name: "index_redirect_routes_on_source_type_and_source_id", using: :btree create_table "releases", force: :cascade do |t| diff --git a/ee/app/helpers/ee/application_settings_helper.rb b/ee/app/helpers/ee/application_settings_helper.rb index b54a7f8cf60f..7fb08dcdc90c 100644 --- a/ee/app/helpers/ee/application_settings_helper.rb +++ b/ee/app/helpers/ee/application_settings_helper.rb @@ -35,20 +35,16 @@ def external_authorization_client_pass_help_text "and the value is encrypted at rest.") end - def meltano_elt_database_dump_enabled? - return License.feature_available? :meltano_elt_database_dump + def pseudonymizer_description_text + _("GitLab will run the pseudonymizer cron job which will send pseudoanonymized data to be processed and analyzed.") end - def meltano_elt_description_text - _("GitLab will run the Meltano ELT cron job which will send pseudoanonymized data to be processed and analyzed.") + def pseudonymizer_disabled_description_text + _("The pseudonymizer database cron job is disabled. When enabled the cron job will send pseudoanonymized data to be processed and analyzed.") end - def meltano_elt_disabled_description_text - _("The Meltano ELT database cron job is disabled. When enabled the cron job will send pseudoanonymized data to be processed and analyzed.") - end - - def meltano_elt_unavailable_description_text - _("The Meltano ELT database cron job is disabled. Once enabled, the cron job will send pseudoanonymized data to be processed and analyzed.") + def pseudonymizer_unavailable_description_text + _("The pseudonymizer cron job is disabled. Once enabled, the cron job will send pseudoanonymized data to be processed and analyzed.") end override :visible_attributes diff --git a/ee/app/models/ee/application_setting.rb b/ee/app/models/ee/application_setting.rb index 94041b6d2ee5..69558d61620a 100644 --- a/ee/app/models/ee/application_setting.rb +++ b/ee/app/models/ee/application_setting.rb @@ -101,21 +101,21 @@ def defaults slack_app_id: nil, slack_app_secret: nil, slack_app_verification_token: nil, - elt_database_dump_enabled: Settings.gitlab['elt_database_dump_enabled'], + pseudonymizer_enabled: Settings.pseudonymizer['enabled'], ) end end - def elt_database_dump_available? - License.feature_available? :meltano_elt_database_dump + def pseudonymizer_available? + License.feature_available?(:pseudonymizer) end - def elt_database_dump_can_be_configured? - Settings.gitlab.elt_database_dump_enabled && License.feature_available?(:meltano_elt_database_dump) + def pseudonymizer_can_be_configured? + Settings.pseudonymizer.enabled && pseudonymizer_available? end - def elt_database_dump_enabled? - elt_database_dump_can_be_configured? && super + def pseudonymizer_enabled? + pseudonymizer_can_be_configured? && super end def should_check_namespace_plan? diff --git a/ee/app/models/license.rb b/ee/app/models/license.rb index 7e619aee9090..5e327b373c31 100644 --- a/ee/app/models/license.rb +++ b/ee/app/models/license.rb @@ -30,8 +30,8 @@ class License < ActiveRecord::Base related_issues repository_mirrors repository_size_limit - scoped_issue_board, - meltano_elt_database_dump + scoped_issue_board + pseudonymizer ].freeze EEP_FEATURES = EES_FEATURES + %i[ diff --git a/ee/db/migrate/20180531221734_add_elt_dump_enabled_to_application_settings.rb b/ee/db/migrate/20180531221734_add_elt_dump_enabled_to_application_settings.rb index 0fb3f68327a2..fcf254ac8883 100644 --- a/ee/db/migrate/20180531221734_add_elt_dump_enabled_to_application_settings.rb +++ b/ee/db/migrate/20180531221734_add_elt_dump_enabled_to_application_settings.rb @@ -26,6 +26,6 @@ class AddEltDumpEnabledToApplicationSettings < ActiveRecord::Migration # disable_ddl_transaction! def change - add_column :application_settings, :elt_database_dump_enabled, :boolean + add_column :application_settings, :pseudonymizer_enabled, :boolean end end diff --git a/lib/tasks/gitlab/db.rake b/lib/tasks/gitlab/db.rake index ba972120a1d9..b98a01b2a137 100644 --- a/lib/tasks/gitlab/db.rake +++ b/lib/tasks/gitlab/db.rake @@ -46,10 +46,6 @@ namespace :gitlab do desc 'Configures the database by running migrate, or by loading the schema and seeding if needed' task configure: :environment do - unless License.feature_available? :meltano_elt_database_dump - raise "The Meltano ELT extract is not available with this license." - end - if ActiveRecord::Base.connection.tables.any? Rake::Task['db:migrate'].invoke else @@ -75,7 +71,13 @@ namespace :gitlab do end desc 'Output pseudonymity dump of selected tables' - task pseudonymity_dump: :environment do + task pseudonymizer: :environment do + unless License.feature_available? :pseudonymizer + raise "The pseudonymizer is not available with this license." + end + + abort "Pseudonymizer disabled." unless Gitlab::CurrentSettings.pseudonymizer_enabled? + options = Pseudonymity::Options.new( config: YAML.load_file(Rails.root.join(Gitlab.config.pseudonymizer.manifest)), start_at: Time.now.utc -- GitLab From 3aef220438965c3e77527345d0acf7aec8fa67b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mica=C3=ABl=20Bergeron?= Date: Thu, 7 Jun 2018 16:23:14 -0400 Subject: [PATCH 25/63] squash: before rebase --- db/schema.rb | 15 +-- ...ymizer_enabled_to_application_settings.rb} | 2 +- rakes.patch | 126 ------------------ 3 files changed, 5 insertions(+), 138 deletions(-) rename ee/db/migrate/{20180531221734_add_elt_dump_enabled_to_application_settings.rb => 20180531221734_add_pseudonymizer_enabled_to_application_settings.rb} (94%) delete mode 100644 rakes.patch diff --git a/db/schema.rb b/db/schema.rb index c1b871a90e2f..99ac13e428f7 100644 --- a/db/schema.rb +++ b/db/schema.rb @@ -197,8 +197,8 @@ t.string "external_authorization_service_url" t.string "external_authorization_service_default_label" t.boolean "pages_domain_verification_enabled", default: true, null: false + t.float "external_authorization_service_timeout", default: 0.5, null: false t.boolean "allow_local_requests_from_hooks_and_services", default: false, null: false - t.float "external_authorization_service_timeout", default: 0.5 t.text "external_auth_client_cert" t.text "encrypted_external_auth_client_key" t.string "encrypted_external_auth_client_key_iv" @@ -549,12 +549,10 @@ t.integer "config_source" t.boolean "protected" t.integer "failure_reason" - t.integer "iid" end add_index "ci_pipelines", ["auto_canceled_by_id"], name: "index_ci_pipelines_on_auto_canceled_by_id", using: :btree add_index "ci_pipelines", ["pipeline_schedule_id"], name: "index_ci_pipelines_on_pipeline_schedule_id", using: :btree - add_index "ci_pipelines", ["project_id", "iid"], name: "index_ci_pipelines_on_project_id_and_iid", unique: true, where: "(iid IS NOT NULL)", using: :btree add_index "ci_pipelines", ["project_id", "ref", "status", "id"], name: "index_ci_pipelines_on_project_id_and_ref_and_status_and_id", using: :btree add_index "ci_pipelines", ["project_id", "sha"], name: "index_ci_pipelines_on_project_id_and_sha", using: :btree add_index "ci_pipelines", ["project_id"], name: "index_ci_pipelines_on_project_id", using: :btree @@ -1632,7 +1630,6 @@ t.text "title_html" t.text "description_html" t.integer "time_estimate" - t.boolean "squash", default: false, null: false t.integer "cached_markdown_version" t.datetime "last_edited_at" t.integer "last_edited_by_id" @@ -2022,9 +2019,9 @@ t.datetime "next_execution_timestamp" t.string "status" t.string "jid" - t.text "last_error" t.datetime_with_timezone "last_update_at" t.datetime_with_timezone "last_successful_update_at" + t.text "last_error" end add_index "project_mirror_data", ["jid"], name: "index_project_mirror_data_on_jid", using: :btree @@ -2279,7 +2276,6 @@ end add_index "redirect_routes", ["path"], name: "index_redirect_routes_on_path", unique: true, using: :btree - add_index "redirect_routes", ["path"], name: "index_redirect_routes_on_path_text_pattern_ops", using: :btree, opclasses: {"path"=>"varchar_pattern_ops"} add_index "redirect_routes", ["source_type", "source_id"], name: "index_redirect_routes_on_source_type_and_source_id", using: :btree create_table "releases", force: :cascade do |t| @@ -2677,13 +2673,13 @@ t.boolean "notified_of_own_activity" t.boolean "support_bot" t.string "preferred_language" + t.string "rss_token" t.boolean "email_opted_in" t.string "email_opted_in_ip" t.integer "email_opted_in_source_id" t.datetime "email_opted_in_at" t.integer "theme_id", limit: 2 t.integer "accepted_term_id" - t.string "feed_token" end add_index "users", ["admin"], name: "index_users_on_admin", using: :btree @@ -2691,14 +2687,13 @@ add_index "users", ["created_at"], name: "index_users_on_created_at", using: :btree add_index "users", ["email"], name: "index_users_on_email", unique: true, using: :btree add_index "users", ["email"], name: "index_users_on_email_trigram", using: :gin, opclasses: {"email"=>"gin_trgm_ops"} - add_index "users", ["feed_token"], name: "index_users_on_feed_token", using: :btree add_index "users", ["ghost"], name: "index_users_on_ghost", using: :btree add_index "users", ["incoming_email_token"], name: "index_users_on_incoming_email_token", using: :btree add_index "users", ["name"], name: "index_users_on_name", using: :btree add_index "users", ["name"], name: "index_users_on_name_trigram", using: :gin, opclasses: {"name"=>"gin_trgm_ops"} add_index "users", ["reset_password_token"], name: "index_users_on_reset_password_token", unique: true, using: :btree + add_index "users", ["rss_token"], name: "index_users_on_rss_token", using: :btree add_index "users", ["state"], name: "index_users_on_state", using: :btree - add_index "users", ["state"], name: "index_users_on_state_and_internal_attrs", where: "((ghost <> true) AND (support_bot <> true))", using: :btree add_index "users", ["support_bot"], name: "index_users_on_support_bot", using: :btree add_index "users", ["username"], name: "index_users_on_username", using: :btree add_index "users", ["username"], name: "index_users_on_username_trigram", using: :gin, opclasses: {"username"=>"gin_trgm_ops"} @@ -2828,8 +2823,6 @@ add_foreign_key "clusters", "users", on_delete: :nullify add_foreign_key "clusters_applications_helm", "clusters", on_delete: :cascade add_foreign_key "clusters_applications_ingress", "clusters", name: "fk_753a7b41c1", on_delete: :cascade - add_foreign_key "clusters_applications_jupyter", "clusters", on_delete: :cascade - add_foreign_key "clusters_applications_jupyter", "oauth_applications", on_delete: :nullify add_foreign_key "clusters_applications_prometheus", "clusters", name: "fk_557e773639", on_delete: :cascade add_foreign_key "clusters_applications_runners", "ci_runners", column: "runner_id", name: "fk_02de2ded36", on_delete: :nullify add_foreign_key "clusters_applications_runners", "clusters", on_delete: :cascade diff --git a/ee/db/migrate/20180531221734_add_elt_dump_enabled_to_application_settings.rb b/ee/db/migrate/20180531221734_add_pseudonymizer_enabled_to_application_settings.rb similarity index 94% rename from ee/db/migrate/20180531221734_add_elt_dump_enabled_to_application_settings.rb rename to ee/db/migrate/20180531221734_add_pseudonymizer_enabled_to_application_settings.rb index fcf254ac8883..eba6930ba0ea 100644 --- a/ee/db/migrate/20180531221734_add_elt_dump_enabled_to_application_settings.rb +++ b/ee/db/migrate/20180531221734_add_pseudonymizer_enabled_to_application_settings.rb @@ -1,7 +1,7 @@ # See http://doc.gitlab.com/ce/development/migration_style_guide.html # for more information on how to write migrations for GitLab. -class AddEltDumpEnabledToApplicationSettings < ActiveRecord::Migration +class AddPseudonymizerEnabledToApplicationSettings < ActiveRecord::Migration include Gitlab::Database::MigrationHelpers # Set this constant to true if this migration requires downtime. diff --git a/rakes.patch b/rakes.patch deleted file mode 100644 index a587beb2a07c..000000000000 --- a/rakes.patch +++ /dev/null @@ -1,126 +0,0 @@ -diff --git a/lib/tasks/gitlab/artifacts/migrate.rake b/lib/tasks/gitlab/artifacts/migrate.rake -new file mode 100644 -index 00000000000..bfca4bfb3f7 ---- /dev/null -+++ b/lib/tasks/gitlab/artifacts/migrate.rake -@@ -0,0 +1,25 @@ -+require 'logger' -+require 'resolv-replace' -+ -+desc "GitLab | Migrate files for artifacts to comply with new storage format" -+namespace :gitlab do -+ namespace :artifacts do -+ task migrate: :environment do -+ logger = Logger.new(STDOUT) -+ logger.info('Starting transfer of artifacts') -+ -+ Ci::Build.joins(:project) -+ .with_artifacts_stored_locally -+ .find_each(batch_size: 10) do |build| -+ begin -+ build.artifacts_file.migrate!(ObjectStorage::Store::REMOTE) -+ build.artifacts_metadata.migrate!(ObjectStorage::Store::REMOTE) -+ -+ logger.info("Transferred artifacts of #{build.id} of #{build.artifacts_size} to object storage") -+ rescue => e -+ logger.error("Failed to transfer artifacts of #{build.id} with error: #{e.message}") -+ end -+ end -+ end -+ end -+end -diff --git a/lib/tasks/gitlab/exclusive_lease.rake b/lib/tasks/gitlab/exclusive_lease.rake -new file mode 100644 -index 00000000000..83722bf6d94 ---- /dev/null -+++ b/lib/tasks/gitlab/exclusive_lease.rake -@@ -0,0 +1,9 @@ -+namespace :gitlab do -+ namespace :exclusive_lease do -+ desc 'GitLab | Clear existing exclusive leases for specified scope (default: *)' -+ task :clear, [:scope] => [:environment] do |_, args| -+ args[:scope].nil? ? Gitlab::ExclusiveLease.reset_all! : Gitlab::ExclusiveLease.reset_all!(args[:scope]) -+ puts 'All exclusive lease entries were removed.' -+ end -+ end -+end -diff --git a/lib/tasks/gitlab/lfs/migrate.rake b/lib/tasks/gitlab/lfs/migrate.rake -new file mode 100644 -index 00000000000..a45e5ca91e0 ---- /dev/null -+++ b/lib/tasks/gitlab/lfs/migrate.rake -@@ -0,0 +1,22 @@ -+require 'logger' -+ -+desc "GitLab | Migrate LFS objects to remote storage" -+namespace :gitlab do -+ namespace :lfs do -+ task migrate: :environment do -+ logger = Logger.new(STDOUT) -+ logger.info('Starting transfer of LFS files to object storage') -+ -+ LfsObject.with_files_stored_locally -+ .find_each(batch_size: 10) do |lfs_object| -+ begin -+ lfs_object.file.migrate!(LfsObjectUploader::Store::REMOTE) -+ -+ logger.info("Transferred LFS object #{lfs_object.oid} of size #{lfs_object.size.to_i.bytes} to object storage") -+ rescue => e -+ logger.error("Failed to transfer LFS object #{lfs_object.oid} with error: #{e.message}") -+ end -+ end -+ end -+ end -+end -diff --git a/lib/tasks/gitlab/shell.rake b/lib/tasks/gitlab/shell.rake -index 1ce2eedb89c..56f3a916c87 100644 ---- a/lib/tasks/gitlab/shell.rake -+++ b/lib/tasks/gitlab/shell.rake -@@ -69,7 +69,7 @@ namespace :gitlab do - if File.exist?(path_to_repo) - print '-' - else -- if Gitlab::Shell.new.add_repository(project.repository_storage, -+ if Gitlab::Shell.new.create_repository(project.repository_storage, - project.disk_path) - print '.' - else -diff --git a/lib/tasks/gitlab/uploads/migrate.rake b/lib/tasks/gitlab/uploads/migrate.rake -new file mode 100644 -index 00000000000..c26c3ccb3be ---- /dev/null -+++ b/lib/tasks/gitlab/uploads/migrate.rake -@@ -0,0 +1,33 @@ -+namespace :gitlab do -+ namespace :uploads do -+ desc 'GitLab | Uploads | Migrate the uploaded files to object storage' -+ task :migrate, [:uploader_class, :model_class, :mounted_as] => :environment do |task, args| -+ batch_size = ENV.fetch('BATCH', 200).to_i -+ @to_store = ObjectStorage::Store::REMOTE -+ @mounted_as = args.mounted_as&.gsub(':', '')&.to_sym -+ @uploader_class = args.uploader_class.constantize -+ @model_class = args.model_class.constantize -+ -+ uploads.each_batch(of: batch_size, &method(:enqueue_batch)) # rubocop: disable Cop/InBatches -+ end -+ -+ def enqueue_batch(batch, index) -+ job = ObjectStorage::MigrateUploadsWorker.enqueue!(batch, -+ @mounted_as, -+ @to_store) -+ puts "Enqueued job ##{index}: #{job}" -+ rescue ObjectStorage::MigrateUploadsWorker::SanityCheckError => e -+ # continue for the next batch -+ puts "Could not enqueue batch (#{batch.ids}) #{e.message}".color(:red) -+ end -+ -+ def uploads -+ Upload.class_eval { include EachBatch } unless Upload < EachBatch -+ -+ Upload -+ .where.not(store: @to_store) -+ .where(uploader: @uploader_class.to_s, -+ model_type: @model_class.base_class.sti_name) -+ end -+ end -+end -- GitLab From a6386bf26f0228e237fb3a592a7d86475a698581 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mica=C3=ABl=20Bergeron?= Date: Thu, 7 Jun 2018 19:19:29 -0400 Subject: [PATCH 26/63] added specs and renamed stuff accordingly --- .../_pseudonymizer.html.haml | 23 ++++++++++ .../admin/application_settings/show.html.haml | 2 +- app/workers/pseudonymizer_worker.rb | 12 ++--- config/gitlab.yml.example | 12 ++--- config/initializers/1_settings.rb | 6 +-- .../admin/application_settings_controller.rb | 4 ++ ee/app/models/ee/application_setting.rb | 2 +- .../table.rb => pseudonymizer/dumper.rb} | 4 +- .../manifest.yml | 0 .../options.rb | 6 ++- .../uploader.rb} | 45 +++++++++---------- lib/tasks/gitlab/db.rake | 12 ++--- .../dumper_spec.rb} | 5 ++- spec/lib/pseudonymizer/uploader_spec.rb | 44 ++++++++++++++++++ spec/support/helpers/stub_object_storage.rb | 12 ++++- 15 files changed, 137 insertions(+), 52 deletions(-) create mode 100644 app/views/admin/application_settings/_pseudonymizer.html.haml rename lib/{pseudonymity/table.rb => pseudonymizer/dumper.rb} (98%) rename lib/{pseudonymity => pseudonymizer}/manifest.yml (100%) rename lib/{pseudonymity => pseudonymizer}/options.rb (74%) rename lib/{pseudonymity/upload_service.rb => pseudonymizer/uploader.rb} (78%) rename spec/lib/{pseudonymity_spec.rb => pseudonymizer/dumper_spec.rb} (91%) create mode 100644 spec/lib/pseudonymizer/uploader_spec.rb diff --git a/app/views/admin/application_settings/_pseudonymizer.html.haml b/app/views/admin/application_settings/_pseudonymizer.html.haml new file mode 100644 index 000000000000..cabd3f7b6cc7 --- /dev/null +++ b/app/views/admin/application_settings/_pseudonymizer.html.haml @@ -0,0 +1,23 @@ += form_for @application_setting, url: admin_application_settings_path, html: { class: 'fieldset-form' } do |f| + = form_errors(@application_setting) + + %fieldset + .form-group.row + .offset-sm-2.col-sm-10 + - is_enabled = @application_setting.pseudonymizer_enabled? + - is_available = @application_setting.pseudonymizer_available? + .form-check + = f.label :pseudonymizer_enabled do + = f.check_box :pseudonymizer_enabled, disabled: !is_available + Enable Pseudonymizer Cron Job + .form-text.text-muted + - if is_enabled + = pseudonymizer_description_text + - else + - if is_available + = pseudonymizer_disabled_description_text + - else + = pseudonymizer_unavailable_description_text + + = f.submit 'Save changes', class: "btn btn-success" + diff --git a/app/views/admin/application_settings/show.html.haml b/app/views/admin/application_settings/show.html.haml index 22246f2aa5f4..4d2a8e1e9389 100644 --- a/app/views/admin/application_settings/show.html.haml +++ b/app/views/admin/application_settings/show.html.haml @@ -237,7 +237,7 @@ .settings-content = render 'usage' -- if License.feature_available?(:pseudonymizer) +- if Gitlab::CurrentSettings.pseudonymizer_can_be_configured? %section.settings.as-usage.no-animate#js-elt-database-dump-settings{ class: ('expanded' if expanded) } .settings-header %h4 diff --git a/app/workers/pseudonymizer_worker.rb b/app/workers/pseudonymizer_worker.rb index 6296e4c12d44..1def2a86dd39 100644 --- a/app/workers/pseudonymizer_worker.rb +++ b/app/workers/pseudonymizer_worker.rb @@ -5,16 +5,16 @@ class PseudonymizerWorker def perform return unless Gitlab::CurrentSettings.pseudonymizer_enabled? - options = Pseudonymity::Options.new( + options = Pseudonymizer::Options.new( config: YAML.load_file(Rails.root.join(Gitlab.config.pseudonymizer.manifest)), start_at: Time.now.utc ) - table = Pseudonymity::Table.new(options) - table.tables_to_csv + dumper = Pseudonymizer::Dumper.new(options) + dumper.tables_to_csv - upload = Pseudonymity::UploadService.new(options) - upload.upload - upload.cleanup + uploader = Pseudonymizer::Uploader.new(options) + uploader.upload + uploader.cleanup end end diff --git a/config/gitlab.yml.example b/config/gitlab.yml.example index dd39534b5890..b7425e757bca 100644 --- a/config/gitlab.yml.example +++ b/config/gitlab.yml.example @@ -733,9 +733,9 @@ production: &base pseudonymizer: enabled: false # Tables manifest that specifies the fields to extract and pseudonymize. - manifest: config/pseudonymizer.yml - # remote_directory: 'gitlab-elt' + manifest: lib/pseudonymizer/manifest.yml upload: + # remote_directory: 'gitlab-elt' # Fog storage connection settings, see http://fog.io/storage/ . connection: # provider: AWS @@ -896,16 +896,16 @@ test: path: tmp/tests/backups pseudonymizer: enabled: false - manifest: config/pseudonymizer.test.yml + manifest: lib/pseudonymizer/manifest.yml upload: # The remote 'directory' to store the CSV files. For S3, this would be the bucket name. remote_directory: gitlab-elt.test # Fog storage connection settings, see http://fog.io/storage/ connection: - provider: AWS + provider: AWS # Only AWS supported at the moment + aws_access_key_id: AWS_ACCESS_KEY_ID + aws_secret_access_key: AWS_SECRET_ACCESS_KEY region: us-east-1 - aws_access_key_id: minio - aws_secret_access_key: gdk-minio gitlab_shell: path: tmp/tests/gitlab-shell/ hooks_path: tmp/tests/gitlab-shell/hooks/ diff --git a/config/initializers/1_settings.rb b/config/initializers/1_settings.rb index f8b4f4408df3..898982fbed89 100644 --- a/config/initializers/1_settings.rb +++ b/config/initializers/1_settings.rb @@ -370,9 +370,9 @@ Settings.cron_jobs['gitlab_usage_ping_worker']['cron'] ||= Settings.__send__(:cron_for_usage_ping) Settings.cron_jobs['gitlab_usage_ping_worker']['job_class'] = 'GitlabUsagePingWorker' -Settings.cron_jobs['gitlab_elt_database_dump'] ||= Settingslogic.new({}) -Settings.cron_jobs['gitlab_elt_database_dump']['cron'] ||= '0 23 * * *'; -Settings.cron_jobs['gitlab_elt_database_dump']['job_class'] ||= 'GitlabEltDataDumpWorker'; +Settings.cron_jobs['pseudonymizer'] ||= Settingslogic.new({}) +Settings.cron_jobs['pseudonymizer']['cron'] ||= '0 23 * * *'; +Settings.cron_jobs['pseudonymizer']['job_class'] ||= 'PseudonymizerWorker'; Settings.cron_jobs['schedule_update_user_activity_worker'] ||= Settingslogic.new({}) Settings.cron_jobs['schedule_update_user_activity_worker']['cron'] ||= '30 0 * * *' diff --git a/ee/app/controllers/ee/admin/application_settings_controller.rb b/ee/app/controllers/ee/admin/application_settings_controller.rb index 62ca0cbc8c17..f3a461d74362 100644 --- a/ee/app/controllers/ee/admin/application_settings_controller.rb +++ b/ee/app/controllers/ee/admin/application_settings_controller.rb @@ -20,6 +20,10 @@ def visible_application_setting_attributes attrs << :email_additional_text end + if License.feature_available?(:pseudonymizer) + attrs << :pseudonymizer_enabled + end + attrs end end diff --git a/ee/app/models/ee/application_setting.rb b/ee/app/models/ee/application_setting.rb index 69558d61620a..b18fb0694662 100644 --- a/ee/app/models/ee/application_setting.rb +++ b/ee/app/models/ee/application_setting.rb @@ -101,7 +101,7 @@ def defaults slack_app_id: nil, slack_app_secret: nil, slack_app_verification_token: nil, - pseudonymizer_enabled: Settings.pseudonymizer['enabled'], + pseudonymizer_enabled: Settings.pseudonymizer['enabled'] ) end end diff --git a/lib/pseudonymity/table.rb b/lib/pseudonymizer/dumper.rb similarity index 98% rename from lib/pseudonymity/table.rb rename to lib/pseudonymizer/dumper.rb index 4f70b65a7126..1ed685fa1505 100644 --- a/lib/pseudonymity/table.rb +++ b/lib/pseudonymizer/dumper.rb @@ -3,7 +3,7 @@ require 'csv' require 'yaml' -module Pseudonymity +module Pseudonymizer class Anon def initialize(fields) @anon_fields = fields @@ -25,7 +25,7 @@ def anonymize(results) end end - class Table + class Dumper attr_accessor :config def initialize(options) diff --git a/lib/pseudonymity/manifest.yml b/lib/pseudonymizer/manifest.yml similarity index 100% rename from lib/pseudonymity/manifest.yml rename to lib/pseudonymizer/manifest.yml diff --git a/lib/pseudonymity/options.rb b/lib/pseudonymizer/options.rb similarity index 74% rename from lib/pseudonymity/options.rb rename to lib/pseudonymizer/options.rb index e2fbeb504009..9c44fd855f59 100644 --- a/lib/pseudonymity/options.rb +++ b/lib/pseudonymizer/options.rb @@ -1,4 +1,4 @@ -module Pseudonymity +module Pseudonymizer class Options attr_reader :config attr_reader :start_at @@ -15,5 +15,9 @@ def output_dir def upload_dir File.join(self.start_at.iso8601) end + + def object_store_credentials + config.upload.connection.to_hash.deep_symbolize_keys + end end end diff --git a/lib/pseudonymity/upload_service.rb b/lib/pseudonymizer/uploader.rb similarity index 78% rename from lib/pseudonymity/upload_service.rb rename to lib/pseudonymizer/uploader.rb index 8f621c2f9bb4..5f4bb556c9c5 100644 --- a/lib/pseudonymity/upload_service.rb +++ b/lib/pseudonymizer/uploader.rb @@ -1,11 +1,13 @@ -module Pseudonymity - class UploadService +module Pseudonymizer + class Uploader RemoteStorageUnavailableError = Class.new(StandardError) def initialize(options, progress = nil) @progress = progress || $stdout + @config = options.config @output_dir = options.output_dir @upload_dir = options.upload_dir + @connection_params = options.object_store_credentials end def upload @@ -16,18 +18,6 @@ def upload end end - def upload_file(file, directory) - progress.print "\t#{file} ... " - - if directory.files.create(key: File.join(@upload_dir, File.basename(file)), - body: File.open(file), - public: false) - progress.puts "done".color(:green) - else - progress.puts "uploading CSV to #{remote_directory} failed".color(:red) - end - end - def cleanup progress.print "Deleting tmp directory #{@output_dir} ... " return unless File.exist?(@output_dir) @@ -41,24 +31,33 @@ def cleanup private - def config - Gitlab.config.pseudonymizer + attr_reader :progress + + def upload_file(file, directory) + progress.print "\t#{file} ... " + + if directory.files.create(key: File.join(@upload_dir, File.basename(file)), + body: File.open(file), + public: false) + progress.puts "done".color(:green) + else + progress.puts "uploading CSV to #{remote_directory} failed".color(:red) + end end def remote_directory - connection_settings = config.upload.connection - if connection_settings.blank? + if @connection_params.blank? progress.puts "Cannot upload files, make sure the `pseudonimizer.upload.connection` is set properly".color(:red) - raise RemoteStorageUnavailableError.new(connection_settings) + raise RemoteStorageUnavailableError.new(@config) end - connect_to_remote_directory(connection_settings) + connect_to_remote_directory end - def connect_to_remote_directory(connection_settings) + def connect_to_remote_directory # our settings use string keys, but Fog expects symbols - connection = ::Fog::Storage.new(connection_settings.symbolize_keys) - remote_dir = config.upload.remote_directory + connection = ::Fog::Storage.new(@connection_params) + remote_dir = @config.upload.remote_directory # We only attempt to create the directory for local backups. For AWS # and other cloud providers, we cannot guarantee the user will have diff --git a/lib/tasks/gitlab/db.rake b/lib/tasks/gitlab/db.rake index b98a01b2a137..da7de421e318 100644 --- a/lib/tasks/gitlab/db.rake +++ b/lib/tasks/gitlab/db.rake @@ -78,17 +78,17 @@ namespace :gitlab do abort "Pseudonymizer disabled." unless Gitlab::CurrentSettings.pseudonymizer_enabled? - options = Pseudonymity::Options.new( + options = Pseudonymizer::Options.new( config: YAML.load_file(Rails.root.join(Gitlab.config.pseudonymizer.manifest)), start_at: Time.now.utc ) - table = Pseudonymity::Table.new(options) - table.tables_to_csv + dumper = Pseudonymizer::Dumper.new(options) + dumper.tables_to_csv - upload = Pseudonymity::UploadService.new(options, progress) - upload.upload - upload.cleanup + uploader = Pseudonymizer::Uploader.new(options, progress) + uploader.upload + uploader.cleanup end def progress diff --git a/spec/lib/pseudonymity_spec.rb b/spec/lib/pseudonymizer/dumper_spec.rb similarity index 91% rename from spec/lib/pseudonymity_spec.rb rename to spec/lib/pseudonymizer/dumper_spec.rb index 82e74aa33c8e..2c00cd9cee7d 100644 --- a/spec/lib/pseudonymity_spec.rb +++ b/spec/lib/pseudonymizer/dumper_spec.rb @@ -1,9 +1,10 @@ require 'spec_helper' -describe Gitlab::Pseudonymity do +describe Pseudonymizer::Dumper do let!(:project) { create(:project) } let(:base_dir) { Dir.mktmpdir } - subject(:pseudo) { Pseudonymity::Table.new } + let(:options) { Pseudonymizer::Options.new() } + subject(:pseudo) { described_class.new(options) } after do FileUtils.rm_rf(base_dir) diff --git a/spec/lib/pseudonymizer/uploader_spec.rb b/spec/lib/pseudonymizer/uploader_spec.rb new file mode 100644 index 000000000000..7fdbc28a88a7 --- /dev/null +++ b/spec/lib/pseudonymizer/uploader_spec.rb @@ -0,0 +1,44 @@ +require 'spec_helper' + +describe Pseudonymizer::Uploader do + let(:base_dir) { Dir.mktmpdir } + let(:options) do + Pseudonymizer::Options.new(config: Gitlab.config.pseudonymizer, + start_at: Time.now.utc) + end + let(:remote_directory) { subject.send(:remote_directory) } + subject { described_class.new(options) } + + def mock_file(file_name) + FileUtils.touch(File.join(base_dir, file_name)) + end + + before do + allow(options).to receive(:output_dir).and_return(base_dir) + stub_object_storage_pseudonymizer(options: options) + + 10.times {|i| mock_file("file_#{i}.test")} + mock_file("schema.yml") + mock_file("file_list.json") + end + + describe "#upload" do + it "upload all file in the directory" do + subject.upload + + expect(remote_directory.files.count).to eq(12) + end + end + + describe "#cleanup" do + it "cleans the directory" do + subject.cleanup + + expect(Dir[File.join(base_dir, "*")].length).to eq(0) + end + end + + after do + FileUtils.rm_rf(base_dir) + end +end diff --git a/spec/support/helpers/stub_object_storage.rb b/spec/support/helpers/stub_object_storage.rb index bceaf8277ee6..662702d1b744 100644 --- a/spec/support/helpers/stub_object_storage.rb +++ b/spec/support/helpers/stub_object_storage.rb @@ -15,9 +15,14 @@ def stub_object_storage_uploader( return unless enabled + stub_object_storage(connection_params: uploader.object_store_credentials, + remote_directory: remote_directory) + end + + def stub_object_storage(connection_params:, remote_directory:) Fog.mock! - ::Fog::Storage.new(uploader.object_store_credentials).tap do |connection| + ::Fog::Storage.new(connection_params).tap do |connection| begin connection.directories.create(key: remote_directory) rescue Excon::Error::Conflict @@ -57,4 +62,9 @@ def stub_object_storage_multipart_init(endpoint, upload_id = "upload_id") EOS end + + def stub_object_storage_pseudonymizer(options:) + stub_object_storage(connection_params: options.object_store_credentials, + remote_directory: options.config.upload.remote_directory) + end end -- GitLab From 55e5912e8e2c92b2768d0d175f0541a1de83d5a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mica=C3=ABl=20Bergeron?= Date: Fri, 8 Jun 2018 11:43:11 -0400 Subject: [PATCH 27/63] trying to fix the specs --- config/initializers/1_settings.rb | 4 ++-- db/schema.rb | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/config/initializers/1_settings.rb b/config/initializers/1_settings.rb index 898982fbed89..8e65d5a19637 100644 --- a/config/initializers/1_settings.rb +++ b/config/initializers/1_settings.rb @@ -371,8 +371,8 @@ Settings.cron_jobs['gitlab_usage_ping_worker']['job_class'] = 'GitlabUsagePingWorker' Settings.cron_jobs['pseudonymizer'] ||= Settingslogic.new({}) -Settings.cron_jobs['pseudonymizer']['cron'] ||= '0 23 * * *'; -Settings.cron_jobs['pseudonymizer']['job_class'] ||= 'PseudonymizerWorker'; +Settings.cron_jobs['pseudonymizer']['cron'] ||= '0 23 * * *' +Settings.cron_jobs['pseudonymizer']['job_class'] ||= 'PseudonymizerWorker' Settings.cron_jobs['schedule_update_user_activity_worker'] ||= Settingslogic.new({}) Settings.cron_jobs['schedule_update_user_activity_worker']['cron'] ||= '30 0 * * *' diff --git a/db/schema.rb b/db/schema.rb index 99ac13e428f7..c5cf15b9144f 100644 --- a/db/schema.rb +++ b/db/schema.rb @@ -39,13 +39,13 @@ t.integer "cached_markdown_version" t.text "new_project_guidelines" t.text "new_project_guidelines_html" - t.string "favicon" t.text "header_message" t.text "header_message_html" t.text "footer_message" t.text "footer_message_html" t.text "message_background_color" t.text "message_font_color" + t.string "favicon" end create_table "application_setting_terms", force: :cascade do |t| @@ -2694,6 +2694,7 @@ add_index "users", ["reset_password_token"], name: "index_users_on_reset_password_token", unique: true, using: :btree add_index "users", ["rss_token"], name: "index_users_on_rss_token", using: :btree add_index "users", ["state"], name: "index_users_on_state", using: :btree + add_index "users", ["state"], name: "index_users_on_state_and_internal_attrs", where: "((ghost <> true) AND (support_bot <> true))", using: :btree add_index "users", ["support_bot"], name: "index_users_on_support_bot", using: :btree add_index "users", ["username"], name: "index_users_on_username", using: :btree add_index "users", ["username"], name: "index_users_on_username_trigram", using: :gin, opclasses: {"username"=>"gin_trgm_ops"} -- GitLab From 47b9ef3f550399d48e7155b2852442af3e665987 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mica=C3=ABl=20Bergeron?= Date: Fri, 8 Jun 2018 11:57:48 -0400 Subject: [PATCH 28/63] fix the schema.rb --- db/schema.rb | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/db/schema.rb b/db/schema.rb index c5cf15b9144f..224e1be0bcb2 100644 --- a/db/schema.rb +++ b/db/schema.rb @@ -39,13 +39,13 @@ t.integer "cached_markdown_version" t.text "new_project_guidelines" t.text "new_project_guidelines_html" + t.string "favicon" t.text "header_message" t.text "header_message_html" t.text "footer_message" t.text "footer_message_html" t.text "message_background_color" t.text "message_font_color" - t.string "favicon" end create_table "application_setting_terms", force: :cascade do |t| @@ -549,10 +549,12 @@ t.integer "config_source" t.boolean "protected" t.integer "failure_reason" + t.integer "iid" end add_index "ci_pipelines", ["auto_canceled_by_id"], name: "index_ci_pipelines_on_auto_canceled_by_id", using: :btree add_index "ci_pipelines", ["pipeline_schedule_id"], name: "index_ci_pipelines_on_pipeline_schedule_id", using: :btree + add_index "ci_pipelines", ["project_id", "iid"], name: "index_ci_pipelines_on_project_id_and_iid", unique: true, where: "(iid IS NOT NULL)", using: :btree add_index "ci_pipelines", ["project_id", "ref", "status", "id"], name: "index_ci_pipelines_on_project_id_and_ref_and_status_and_id", using: :btree add_index "ci_pipelines", ["project_id", "sha"], name: "index_ci_pipelines_on_project_id_and_sha", using: :btree add_index "ci_pipelines", ["project_id"], name: "index_ci_pipelines_on_project_id", using: :btree @@ -2673,13 +2675,13 @@ t.boolean "notified_of_own_activity" t.boolean "support_bot" t.string "preferred_language" - t.string "rss_token" t.boolean "email_opted_in" t.string "email_opted_in_ip" t.integer "email_opted_in_source_id" t.datetime "email_opted_in_at" t.integer "theme_id", limit: 2 t.integer "accepted_term_id" + t.string "feed_token" end add_index "users", ["admin"], name: "index_users_on_admin", using: :btree @@ -2687,12 +2689,12 @@ add_index "users", ["created_at"], name: "index_users_on_created_at", using: :btree add_index "users", ["email"], name: "index_users_on_email", unique: true, using: :btree add_index "users", ["email"], name: "index_users_on_email_trigram", using: :gin, opclasses: {"email"=>"gin_trgm_ops"} + add_index "users", ["feed_token"], name: "index_users_on_feed_token", using: :btree add_index "users", ["ghost"], name: "index_users_on_ghost", using: :btree add_index "users", ["incoming_email_token"], name: "index_users_on_incoming_email_token", using: :btree add_index "users", ["name"], name: "index_users_on_name", using: :btree add_index "users", ["name"], name: "index_users_on_name_trigram", using: :gin, opclasses: {"name"=>"gin_trgm_ops"} add_index "users", ["reset_password_token"], name: "index_users_on_reset_password_token", unique: true, using: :btree - add_index "users", ["rss_token"], name: "index_users_on_rss_token", using: :btree add_index "users", ["state"], name: "index_users_on_state", using: :btree add_index "users", ["state"], name: "index_users_on_state_and_internal_attrs", where: "((ghost <> true) AND (support_bot <> true))", using: :btree add_index "users", ["support_bot"], name: "index_users_on_support_bot", using: :btree @@ -2824,6 +2826,8 @@ add_foreign_key "clusters", "users", on_delete: :nullify add_foreign_key "clusters_applications_helm", "clusters", on_delete: :cascade add_foreign_key "clusters_applications_ingress", "clusters", name: "fk_753a7b41c1", on_delete: :cascade + add_foreign_key "clusters_applications_jupyter", "clusters", on_delete: :cascade + add_foreign_key "clusters_applications_jupyter", "oauth_applications", on_delete: :nullify add_foreign_key "clusters_applications_prometheus", "clusters", name: "fk_557e773639", on_delete: :cascade add_foreign_key "clusters_applications_runners", "ci_runners", column: "runner_id", name: "fk_02de2ded36", on_delete: :nullify add_foreign_key "clusters_applications_runners", "clusters", on_delete: :cascade -- GitLab From ae91c723652f2f2f652b889461abf54ff411f9fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mica=C3=ABl=20Bergeron?= Date: Fri, 8 Jun 2018 13:49:32 -0400 Subject: [PATCH 29/63] fix some legitimate specs failures --- app/workers/all_queues.yml | 1 + ee/app/helpers/ee/application_settings_helper.rb | 2 +- lib/pseudonymizer/dumper.rb | 6 +++--- spec/lib/pseudonymizer/dumper_spec.rb | 12 +++++++++--- 4 files changed, 14 insertions(+), 7 deletions(-) diff --git a/app/workers/all_queues.yml b/app/workers/all_queues.yml index 40fed40398db..d8708433ff21 100644 --- a/app/workers/all_queues.yml +++ b/app/workers/all_queues.yml @@ -144,6 +144,7 @@ - cronjob:ldap_all_groups_sync - cronjob:ldap_sync - cronjob:update_all_mirrors +- cronjob:pseudonymizer - geo:geo_scheduler_scheduler - geo:geo_scheduler_primary_scheduler diff --git a/ee/app/helpers/ee/application_settings_helper.rb b/ee/app/helpers/ee/application_settings_helper.rb index 7fb08dcdc90c..febd073c39ce 100644 --- a/ee/app/helpers/ee/application_settings_helper.rb +++ b/ee/app/helpers/ee/application_settings_helper.rb @@ -68,7 +68,7 @@ def visible_attributes :slack_app_secret, :slack_app_verification_token, :allow_group_owners_to_manage_ldap, - :elt_database_dump_enabled + :pseudonymizer_enabled ] end diff --git a/lib/pseudonymizer/dumper.rb b/lib/pseudonymizer/dumper.rb index 1ed685fa1505..6ccef5d799b8 100644 --- a/lib/pseudonymizer/dumper.rb +++ b/lib/pseudonymizer/dumper.rb @@ -26,7 +26,7 @@ def anonymize(results) end class Dumper - attr_accessor :config + attr_accessor :config, :output_dir def initialize(options) @config = options.config @@ -39,7 +39,7 @@ def initialize(options) def tables_to_csv tables = config["tables"] - FileUtils.mkdir_p(@output_dir) unless File.directory?(@output_dir) + FileUtils.mkdir_p(output_dir) unless File.directory?(output_dir) new_tables = tables.map do |k, v| @schema[k] = {} @@ -55,7 +55,7 @@ def get_and_log_file_name(ext, prefix = nil, filename = nil) file_timestamp = filename || "#{prefix}_#{Time.now.to_i}" file_timestamp = "#{file_timestamp}.#{ext}" @output_files << file_timestamp - File.join(@output_dir, file_timestamp) + File.join(output_dir, file_timestamp) end def schema_to_yml diff --git a/spec/lib/pseudonymizer/dumper_spec.rb b/spec/lib/pseudonymizer/dumper_spec.rb index 2c00cd9cee7d..2a3627091cde 100644 --- a/spec/lib/pseudonymizer/dumper_spec.rb +++ b/spec/lib/pseudonymizer/dumper_spec.rb @@ -3,9 +3,16 @@ describe Pseudonymizer::Dumper do let!(:project) { create(:project) } let(:base_dir) { Dir.mktmpdir } - let(:options) { Pseudonymizer::Options.new() } + let(:options) do + Pseudonymizer::Options.new(config: Gitlab.config.pseudonymizer, + start_at: Time.now.utc) + end subject(:pseudo) { described_class.new(options) } + before do + allow(options).to receive(:output_dir).and_return(base_dir) + end + after do FileUtils.rm_rf(base_dir) end @@ -13,7 +20,6 @@ # create temp directory in before block describe 'Pseudo tables' do it 'outputs project tables to csv' do - pseudo.config["output"]["csv"] = base_dir pseudo.config["tables"] = { "projects" => { "whitelist" => %w(id name path description), @@ -21,7 +27,7 @@ } } - expect(pseudo.config["output"]["csv"]).to eq(base_dir) + expect(pseudo.output_dir).to eq(base_dir) # grab the first table it outputs. There would only be 1. project_table_file = pseudo.tables_to_csv[0] -- GitLab From a443f677dda5c27f04eb440fa590d9ffe9f22c24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mica=C3=ABl=20Bergeron?= Date: Fri, 8 Jun 2018 16:52:04 -0400 Subject: [PATCH 30/63] wip: making the fetch paginated --- lib/pseudonymizer/dumper.rb | 39 +++++++++++++++++++++++++++---------- 1 file changed, 29 insertions(+), 10 deletions(-) diff --git a/lib/pseudonymizer/dumper.rb b/lib/pseudonymizer/dumper.rb index 6ccef5d799b8..1000e4139eb9 100644 --- a/lib/pseudonymizer/dumper.rb +++ b/lib/pseudonymizer/dumper.rb @@ -4,6 +4,8 @@ require 'yaml' module Pseudonymizer + PAGE_SIZE = 10000 + class Anon def initialize(fields) @anon_fields = fields @@ -43,7 +45,8 @@ def tables_to_csv new_tables = tables.map do |k, v| @schema[k] = {} - table_to_csv(k, v["whitelist"], v["pseudo"]) + table_to_schema(k) + write_to_csv_file(k, table_page_results(k, v['whitelist'], v['pseudo'])) end schema_to_yml @@ -68,14 +71,33 @@ def file_list_to_json File.open(file_path, 'w') { |file| file.write(@output_files.to_json) } end - def table_to_csv(table, whitelist_columns, pseudonymity_columns) - sql = "SELECT #{whitelist_columns.join(",")} FROM #{table};" - results = ActiveRecord::Base.connection.exec_query(sql) + # yield every results, pagined, anonymized + def table_page_results(table, whitelist_columns, pseudonymity_columns) + anonymizer = Anon.new(pseudonymity_columns) + page = 0 + + Enumerator.new do |yielder| + loop do + offset = page * PAGE_SIZE + sql = "SELECT #{whitelist_columns.join(",")} FROM #{table} LIMIT #{PAGE_SIZE} OFFSET #{offset};" + + # a page of results + results = ActiveRecord::Base.connection.exec_query(sql) + break if results.empty? + binding.pry + anonymizer.anonymize(results).each { |result| yielder << result } + page += 1 + end + end + end + + def table_to_schema(table) type_results = ActiveRecord::Base.connection.columns(table) type_results = type_results.select do |c| @config["tables"][table]["whitelist"].include?(c.name) end + type_results = type_results.map do |c| data_type = c.sql_type @@ -86,10 +108,6 @@ def table_to_csv(table, whitelist_columns, pseudonymity_columns) { name: c.name, data_type: data_type } end set_schema_column_types(table, type_results) - return if results.empty? - - anon = Anon.new(pseudonymity_columns) - write_to_csv_file(table, anon.anonymize(results)) end def set_schema_column_types(table, type_results) @@ -103,14 +121,15 @@ def set_schema_column_types(table, type_results) def write_to_csv_file(title, contents) Rails.logger.info "Writing #{title} ..." file_path = get_and_log_file_name("csv", title) + binding.pry column_names = contents.first.keys - contents = CSV.generate do |csv| + CSV.open(file_path, 'w') do |csv| csv << column_names + contents.each do |x| csv << x.values end end - File.open(file_path, 'w') { |file| file.write(contents) } file_path end -- GitLab From 7d5f50f4ddb9bd912c6071fec6630967a6165f04 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mica=C3=ABl=20Bergeron?= Date: Fri, 8 Jun 2018 19:35:36 -0400 Subject: [PATCH 31/63] wip: trying to find the memory hog --- lib/pseudonymizer/dumper.rb | 51 +++++++++++++++++++++++-------------- 1 file changed, 32 insertions(+), 19 deletions(-) diff --git a/lib/pseudonymizer/dumper.rb b/lib/pseudonymizer/dumper.rb index 1000e4139eb9..48474245450a 100644 --- a/lib/pseudonymizer/dumper.rb +++ b/lib/pseudonymizer/dumper.rb @@ -4,7 +4,7 @@ require 'yaml' module Pseudonymizer - PAGE_SIZE = 10000 + PAGE_SIZE = 1000 class Anon def initialize(fields) @@ -14,11 +14,11 @@ def initialize(fields) def anonymize(results) columns = results.columns # Assume they all have the same table to_filter = @anon_fields & columns + secret = Rails.application.secrets[:secret_key_base] Enumerator.new do |yielder| results.each do |result| to_filter.each do |field| - secret = Rails.application.secrets[:secret_key_base] result[field] = OpenSSL::HMAC.hexdigest('SHA256', secret, result[field]) unless result[field].nil? end yielder << result @@ -43,15 +43,12 @@ def tables_to_csv FileUtils.mkdir_p(output_dir) unless File.directory?(output_dir) - new_tables = tables.map do |k, v| - @schema[k] = {} - table_to_schema(k) - write_to_csv_file(k, table_page_results(k, v['whitelist'], v['pseudo'])) - end - schema_to_yml file_list_to_json - new_tables + + tables.each do |k, v| + table_to_csv(k, v['whitelist'], v['pseudo']) + end end def get_and_log_file_name(ext, prefix = nil, filename = nil) @@ -71,6 +68,15 @@ def file_list_to_json File.open(file_path, 'w') { |file| file.write(@output_files.to_json) } end + def table_to_csv(table, whitelist_columns, pseudonymity_columns) + @schema[table] = {} + table_to_schema(table) + write_to_csv_file(table, table_page_results(table, whitelist_columns, pseudonymity_columns)) + rescue => e + binding.pry + Rails.logger.error(e.message) + end + # yield every results, pagined, anonymized def table_page_results(table, whitelist_columns, pseudonymity_columns) anonymizer = Anon.new(pseudonymity_columns) @@ -79,14 +85,19 @@ def table_page_results(table, whitelist_columns, pseudonymity_columns) Enumerator.new do |yielder| loop do offset = page * PAGE_SIZE - sql = "SELECT #{whitelist_columns.join(",")} FROM #{table} LIMIT #{PAGE_SIZE} OFFSET #{offset};" + has_more = false + + sql = "SELECT #{whitelist_columns.join(",")} FROM #{table} LIMIT #{PAGE_SIZE} OFFSET #{offset}" # a page of results results = ActiveRecord::Base.connection.exec_query(sql) - break if results.empty? + raise StopIteration if results.empty? + + anonymizer.anonymize(results).lazy.each do |result| + has_more = true + yielder << result + end - binding.pry - anonymizer.anonymize(results).each { |result| yielder << result } page += 1 end end @@ -121,15 +132,17 @@ def set_schema_column_types(table, type_results) def write_to_csv_file(title, contents) Rails.logger.info "Writing #{title} ..." file_path = get_and_log_file_name("csv", title) - binding.pry - column_names = contents.first.keys - CSV.open(file_path, 'w') do |csv| - csv << column_names - contents.each do |x| - csv << x.values + CSV.open(file_path, 'w') do |csv| + contents.each_with_index do |row, i| + csv << row.keys if i == 0 # header + csv << row.values + csv.flush if i % PAGE_SIZE end end + + GC.start + file_path end -- GitLab From 1ed0b8018d8522b13954d4143e6eca6aeeef1a50 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mica=C3=ABl=20Bergeron?= Date: Fri, 8 Jun 2018 20:51:23 -0400 Subject: [PATCH 32/63] wip: making the query paginated --- lib/pseudonymizer/dumper.rb | 15 ++++++--------- lib/pseudonymizer/options.rb | 8 ++------ lib/pseudonymizer/uploader.rb | 16 ++++++++++++---- lib/tasks/gitlab/db.rake | 4 +--- spec/support/helpers/stub_object_storage.rb | 2 +- 5 files changed, 22 insertions(+), 23 deletions(-) diff --git a/lib/pseudonymizer/dumper.rb b/lib/pseudonymizer/dumper.rb index 48474245450a..c4045bd22c66 100644 --- a/lib/pseudonymizer/dumper.rb +++ b/lib/pseudonymizer/dumper.rb @@ -4,7 +4,7 @@ require 'yaml' module Pseudonymizer - PAGE_SIZE = 1000 + PAGE_SIZE = 10000 class Anon def initialize(fields) @@ -73,8 +73,7 @@ def table_to_csv(table, whitelist_columns, pseudonymity_columns) table_to_schema(table) write_to_csv_file(table, table_page_results(table, whitelist_columns, pseudonymity_columns)) rescue => e - binding.pry - Rails.logger.error(e.message) + Rails.logger.error("Failed to export #{table}: #{e}") end # yield every results, pagined, anonymized @@ -91,16 +90,16 @@ def table_page_results(table, whitelist_columns, pseudonymity_columns) # a page of results results = ActiveRecord::Base.connection.exec_query(sql) - raise StopIteration if results.empty? - - anonymizer.anonymize(results).lazy.each do |result| + anonymizer.anonymize(results).each do |result| has_more = true yielder << result end + raise StopIteration unless has_more + page += 1 end - end + end.lazy end def table_to_schema(table) @@ -141,8 +140,6 @@ def write_to_csv_file(title, contents) end end - GC.start - file_path end diff --git a/lib/pseudonymizer/options.rb b/lib/pseudonymizer/options.rb index 9c44fd855f59..b1fab3a4f93b 100644 --- a/lib/pseudonymizer/options.rb +++ b/lib/pseudonymizer/options.rb @@ -9,15 +9,11 @@ def initialize(config: {}, start_at: Time.now.utc) end def output_dir - File.join('/tmp', 'gitlab-pseudonymizer', self.start_at.iso8601) + File.join('/tmp', 'gitlab-pseudonymizer', start_at.iso8601) end def upload_dir - File.join(self.start_at.iso8601) - end - - def object_store_credentials - config.upload.connection.to_hash.deep_symbolize_keys + File.join(start_at.iso8601) end end end diff --git a/lib/pseudonymizer/uploader.rb b/lib/pseudonymizer/uploader.rb index 5f4bb556c9c5..acb77403d32f 100644 --- a/lib/pseudonymizer/uploader.rb +++ b/lib/pseudonymizer/uploader.rb @@ -2,12 +2,21 @@ module Pseudonymizer class Uploader RemoteStorageUnavailableError = Class.new(StandardError) + def self.object_store_credentials + Gitlab.config.pseudonymizer.upload.connection.to_hash.deep_symbolize_keys + end + + def self.remote_directory + Gitlab.config.pseudonymizer.upload.remote_directory + end + def initialize(options, progress = nil) @progress = progress || $stdout @config = options.config @output_dir = options.output_dir @upload_dir = options.upload_dir - @connection_params = options.object_store_credentials + @remote_dir = self.class.remote_directory + @connection_params = self.class.object_store_credentials end def upload @@ -57,15 +66,14 @@ def remote_directory def connect_to_remote_directory # our settings use string keys, but Fog expects symbols connection = ::Fog::Storage.new(@connection_params) - remote_dir = @config.upload.remote_directory # We only attempt to create the directory for local backups. For AWS # and other cloud providers, we cannot guarantee the user will have # permission to create the bucket. if connection.service == ::Fog::Storage::Local - connection.directories.create(key: remote_dir) + connection.directories.create(key: @remote_dir) else - connection.directories.get(remote_dir) + connection.directories.get(@remote_dir) end end diff --git a/lib/tasks/gitlab/db.rake b/lib/tasks/gitlab/db.rake index da7de421e318..afdad25b0a01 100644 --- a/lib/tasks/gitlab/db.rake +++ b/lib/tasks/gitlab/db.rake @@ -93,9 +93,7 @@ namespace :gitlab do def progress if ENV['CRON'] - # We need an object we can say 'puts' and 'print' to; let's use a - # StringIO. - require 'stringio' + # Do not output progress for Cron StringIO.new else $stdout diff --git a/spec/support/helpers/stub_object_storage.rb b/spec/support/helpers/stub_object_storage.rb index 662702d1b744..744bfb24c985 100644 --- a/spec/support/helpers/stub_object_storage.rb +++ b/spec/support/helpers/stub_object_storage.rb @@ -64,7 +64,7 @@ def stub_object_storage_multipart_init(endpoint, upload_id = "upload_id") end def stub_object_storage_pseudonymizer(options:) - stub_object_storage(connection_params: options.object_store_credentials, + stub_object_storage(connection_params: Pseudonymizer::Uploader.object_store_credentials, remote_directory: options.config.upload.remote_directory) end end -- GitLab From 32fd829f36315e0400c8a601a497f1232a44b02c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mica=C3=ABl=20Bergeron?= Date: Tue, 12 Jun 2018 13:27:04 -0400 Subject: [PATCH 33/63] use `.with_index` on Enumerator --- lib/pseudonymizer/dumper.rb | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/lib/pseudonymizer/dumper.rb b/lib/pseudonymizer/dumper.rb index c4045bd22c66..2a07a6f5e13d 100644 --- a/lib/pseudonymizer/dumper.rb +++ b/lib/pseudonymizer/dumper.rb @@ -14,12 +14,15 @@ def initialize(fields) def anonymize(results) columns = results.columns # Assume they all have the same table to_filter = @anon_fields & columns - secret = Rails.application.secrets[:secret_key_base] + key = Rails.application.secrets[:secret_key_base] + digest = OpenSSL::Digest.new('sha256') Enumerator.new do |yielder| results.each do |result| to_filter.each do |field| - result[field] = OpenSSL::HMAC.hexdigest('SHA256', secret, result[field]) unless result[field].nil? + next if result[field].nil? + + result[field] = OpenSSL::HMAC.hexdigest(digest, key, result[field]) end yielder << result end @@ -133,7 +136,7 @@ def write_to_csv_file(title, contents) file_path = get_and_log_file_name("csv", title) CSV.open(file_path, 'w') do |csv| - contents.each_with_index do |row, i| + contents.with_index do |row, i| csv << row.keys if i == 0 # header csv << row.values csv.flush if i % PAGE_SIZE -- GitLab From 035b9f73e96f84605329c3ffff815d6280d1589c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mica=C3=ABl=20Bergeron?= Date: Tue, 12 Jun 2018 17:11:50 -0400 Subject: [PATCH 34/63] apply review feedback --- .../_pseudonymizer.html.haml | 5 +-- .../admin/application_settings/show.html.haml | 2 +- app/workers/pseudonymizer_worker.rb | 2 +- config/initializers/1_settings.rb | 2 +- .../helpers/ee/application_settings_helper.rb | 4 -- lib/pseudonymizer/dumper.rb | 18 ++++----- lib/pseudonymizer/options.rb | 6 +-- lib/pseudonymizer/uploader.rb | 37 +++++++++---------- lib/tasks/gitlab/db.rake | 21 ++--------- spec/lib/pseudonymizer/dumper_spec.rb | 11 ++---- spec/lib/pseudonymizer/uploader_spec.rb | 9 +++-- spec/support/helpers/stub_object_storage.rb | 4 +- 12 files changed, 48 insertions(+), 73 deletions(-) diff --git a/app/views/admin/application_settings/_pseudonymizer.html.haml b/app/views/admin/application_settings/_pseudonymizer.html.haml index cabd3f7b6cc7..429ee0e5eb75 100644 --- a/app/views/admin/application_settings/_pseudonymizer.html.haml +++ b/app/views/admin/application_settings/_pseudonymizer.html.haml @@ -14,10 +14,7 @@ - if is_enabled = pseudonymizer_description_text - else - - if is_available - = pseudonymizer_disabled_description_text - - else - = pseudonymizer_unavailable_description_text + = pseudonymizer_disabled_description_text = f.submit 'Save changes', class: "btn btn-success" diff --git a/app/views/admin/application_settings/show.html.haml b/app/views/admin/application_settings/show.html.haml index 4d2a8e1e9389..05e02e70a947 100644 --- a/app/views/admin/application_settings/show.html.haml +++ b/app/views/admin/application_settings/show.html.haml @@ -238,7 +238,7 @@ = render 'usage' - if Gitlab::CurrentSettings.pseudonymizer_can_be_configured? - %section.settings.as-usage.no-animate#js-elt-database-dump-settings{ class: ('expanded' if expanded) } + %section.settings.as-pseudonymizer.no-animate#js-pseudonymizer-settings{ class: ('expanded' if expanded) } .settings-header %h4 = _('Pseudonymizer Cron Job') diff --git a/app/workers/pseudonymizer_worker.rb b/app/workers/pseudonymizer_worker.rb index 1def2a86dd39..3cf583edeb5e 100644 --- a/app/workers/pseudonymizer_worker.rb +++ b/app/workers/pseudonymizer_worker.rb @@ -13,7 +13,7 @@ def perform dumper = Pseudonymizer::Dumper.new(options) dumper.tables_to_csv - uploader = Pseudonymizer::Uploader.new(options) + uploader = Pseudonymizer::Uploader.new(options, progress_output: File.open(File::NULL, "w")) uploader.upload uploader.cleanup end diff --git a/config/initializers/1_settings.rb b/config/initializers/1_settings.rb index 8e65d5a19637..55b1fbc1a70d 100644 --- a/config/initializers/1_settings.rb +++ b/config/initializers/1_settings.rb @@ -479,7 +479,7 @@ # Settings['pseudonymizer'] ||= Settingslogic.new({}) Settings.pseudonymizer['enabled'] = false if Settings.pseudonymizer['enabled'].nil? -Settings.pseudonymizer['manifest'] = Settings.pseudonymizer['manifest'] || "lib/pseudonymity/manifest.yml" +Settings.pseudonymizer['manifest'] = Settings.pseudonymizer['manifest'] || "lib/pseudonymizer/manifest.yml" Settings.pseudonymizer['upload'] ||= Settingslogic.new({ 'remote_directory' => nil, 'connection' => nil }) # Settings.pseudonymizer['upload']['multipart_chunk_size'] ||= 104857600 diff --git a/ee/app/helpers/ee/application_settings_helper.rb b/ee/app/helpers/ee/application_settings_helper.rb index febd073c39ce..211271323ccc 100644 --- a/ee/app/helpers/ee/application_settings_helper.rb +++ b/ee/app/helpers/ee/application_settings_helper.rb @@ -43,10 +43,6 @@ def pseudonymizer_disabled_description_text _("The pseudonymizer database cron job is disabled. When enabled the cron job will send pseudoanonymized data to be processed and analyzed.") end - def pseudonymizer_unavailable_description_text - _("The pseudonymizer cron job is disabled. Once enabled, the cron job will send pseudoanonymized data to be processed and analyzed.") - end - override :visible_attributes def visible_attributes super + [ diff --git a/lib/pseudonymizer/dumper.rb b/lib/pseudonymizer/dumper.rb index 2a07a6f5e13d..1d3704afbd50 100644 --- a/lib/pseudonymizer/dumper.rb +++ b/lib/pseudonymizer/dumper.rb @@ -36,8 +36,9 @@ class Dumper def initialize(options) @config = options.config @output_dir = options.output_dir + @start_at = options.start_at - @schema = {} + @schema = Hash.new { |h, k| h[k] = {} } @output_files = [] end @@ -49,13 +50,15 @@ def tables_to_csv schema_to_yml file_list_to_json - tables.each do |k, v| + tables.map do |k, v| table_to_csv(k, v['whitelist'], v['pseudo']) end end + private + def get_and_log_file_name(ext, prefix = nil, filename = nil) - file_timestamp = filename || "#{prefix}_#{Time.now.to_i}" + file_timestamp = filename || "#{prefix}_#{@start_at.to_i}" file_timestamp = "#{file_timestamp}.#{ext}" @output_files << file_timestamp File.join(output_dir, file_timestamp) @@ -72,7 +75,6 @@ def file_list_to_json end def table_to_csv(table, whitelist_columns, pseudonymity_columns) - @schema[table] = {} table_to_schema(table) write_to_csv_file(table, table_page_results(table, whitelist_columns, pseudonymity_columns)) rescue => e @@ -131,10 +133,10 @@ def set_schema_column_types(table, type_results) @schema[table]["gl_mapping_key"] = "id" end - def write_to_csv_file(title, contents) - Rails.logger.info "Writing #{title} ..." - file_path = get_and_log_file_name("csv", title) + def write_to_csv_file(table, contents) + file_path = get_and_log_file_name("csv", table) + Rails.logger.info "#{self.class.name} writing #{table} to #{file_path}." CSV.open(file_path, 'w') do |csv| contents.with_index do |row, i| csv << row.keys if i == 0 # header @@ -145,7 +147,5 @@ def write_to_csv_file(title, contents) file_path end - - private :write_to_csv_file end end diff --git a/lib/pseudonymizer/options.rb b/lib/pseudonymizer/options.rb index b1fab3a4f93b..18809859977a 100644 --- a/lib/pseudonymizer/options.rb +++ b/lib/pseudonymizer/options.rb @@ -3,13 +3,13 @@ class Options attr_reader :config attr_reader :start_at - def initialize(config: {}, start_at: Time.now.utc) + def initialize(config: {}) @config = config - @start_at = start_at + @start_at = Time.now.utc end def output_dir - File.join('/tmp', 'gitlab-pseudonymizer', start_at.iso8601) + File.join(Dir.tmpdir, 'gitlab-pseudonymizer', start_at.iso8601) end def upload_dir diff --git a/lib/pseudonymizer/uploader.rb b/lib/pseudonymizer/uploader.rb index acb77403d32f..e2ea01456c88 100644 --- a/lib/pseudonymizer/uploader.rb +++ b/lib/pseudonymizer/uploader.rb @@ -1,7 +1,10 @@ module Pseudonymizer class Uploader + include Gitlab::Utils::StrongMemoize + RemoteStorageUnavailableError = Class.new(StandardError) + # Our settings use string keys, but Fog expects symbols def self.object_store_credentials Gitlab.config.pseudonymizer.upload.connection.to_hash.deep_symbolize_keys end @@ -10,8 +13,8 @@ def self.remote_directory Gitlab.config.pseudonymizer.upload.remote_directory end - def initialize(options, progress = nil) - @progress = progress || $stdout + def initialize(options, progress_output: nil) + @progress_output = progress_output || $stdout @config = options.config @output_dir = options.output_dir @upload_dir = options.upload_dir @@ -20,7 +23,7 @@ def initialize(options, progress = nil) end def upload - progress.puts "Uploading output files to remote storage #{remote_directory} ... " + progress_output.puts "Uploading output files to remote storage #{remote_directory}:" file_list.each do |file| upload_file(file, remote_directory) @@ -28,43 +31,37 @@ def upload end def cleanup - progress.print "Deleting tmp directory #{@output_dir} ... " return unless File.exist?(@output_dir) - if FileUtils.rm_rf(@output_dir) - progress.puts "done".color(:green) - else - progress.puts "failed".color(:red) - end + progress_output.print "Deleting tmp directory #{@output_dir} ... " + progress_output.puts FileUtils.rm_rf(@output_dir) ? "done".color(:green) : "failed".color(:red) end private - attr_reader :progress + attr_reader :progress_output def upload_file(file, directory) - progress.print "\t#{file} ... " + progress_output.print "\t#{file} ... " if directory.files.create(key: File.join(@upload_dir, File.basename(file)), body: File.open(file), public: false) - progress.puts "done".color(:green) + progress_output.puts "done".color(:green) else - progress.puts "uploading CSV to #{remote_directory} failed".color(:red) + progress_output.puts "failed".color(:red) end end def remote_directory - if @connection_params.blank? - progress.puts "Cannot upload files, make sure the `pseudonimizer.upload.connection` is set properly".color(:red) - raise RemoteStorageUnavailableError.new(@config) - end - - connect_to_remote_directory + strong_memoize(:remote_directory) { connect_to_remote_directory } end def connect_to_remote_directory - # our settings use string keys, but Fog expects symbols + if @connection_params.blank? + abort "Cannot upload files, make sure the `pseudonimizer.upload.connection` is set properly".color(:red) + end + connection = ::Fog::Storage.new(@connection_params) # We only attempt to create the directory for local backups. For AWS diff --git a/lib/tasks/gitlab/db.rake b/lib/tasks/gitlab/db.rake index afdad25b0a01..6c04dce61974 100644 --- a/lib/tasks/gitlab/db.rake +++ b/lib/tasks/gitlab/db.rake @@ -72,32 +72,19 @@ namespace :gitlab do desc 'Output pseudonymity dump of selected tables' task pseudonymizer: :environment do - unless License.feature_available? :pseudonymizer - raise "The pseudonymizer is not available with this license." - end - - abort "Pseudonymizer disabled." unless Gitlab::CurrentSettings.pseudonymizer_enabled? + abort "The pseudonymizer is not available with this license." unless License.feature_available?(:pseudonymizer) + abort "The pseudonymizer is disabled." unless Gitlab::CurrentSettings.pseudonymizer_enabled? options = Pseudonymizer::Options.new( - config: YAML.load_file(Rails.root.join(Gitlab.config.pseudonymizer.manifest)), - start_at: Time.now.utc + config: YAML.load_file(Rails.root.join(Gitlab.config.pseudonymizer.manifest)) ) dumper = Pseudonymizer::Dumper.new(options) dumper.tables_to_csv - uploader = Pseudonymizer::Uploader.new(options, progress) + uploader = Pseudonymizer::Uploader.new(options) uploader.upload uploader.cleanup end - - def progress - if ENV['CRON'] - # Do not output progress for Cron - StringIO.new - else - $stdout - end - end end end diff --git a/spec/lib/pseudonymizer/dumper_spec.rb b/spec/lib/pseudonymizer/dumper_spec.rb index 2a3627091cde..abb6723e819f 100644 --- a/spec/lib/pseudonymizer/dumper_spec.rb +++ b/spec/lib/pseudonymizer/dumper_spec.rb @@ -4,8 +4,9 @@ let!(:project) { create(:project) } let(:base_dir) { Dir.mktmpdir } let(:options) do - Pseudonymizer::Options.new(config: Gitlab.config.pseudonymizer, - start_at: Time.now.utc) + Pseudonymizer::Options.new( + config: YAML.load_file(Rails.root.join(Gitlab.config.pseudonymizer.manifest)) + ) end subject(:pseudo) { described_class.new(options) } @@ -17,7 +18,6 @@ FileUtils.rm_rf(base_dir) end - # create temp directory in before block describe 'Pseudo tables' do it 'outputs project tables to csv' do pseudo.config["tables"] = { @@ -31,7 +31,6 @@ # grab the first table it outputs. There would only be 1. project_table_file = pseudo.tables_to_csv[0] - # Ignore the `.` and `..` in the directory. expect(project_table_file.include? "projects_").to be true expect(project_table_file.include? ".csv").to be true @@ -40,9 +39,7 @@ File.foreach(project_table_file).with_index do |line, line_num| if line_num == 0 columns = line.split(",") - end - - if line_num == 1 + elsif line_num == 1 project_data = line.split(",") break end diff --git a/spec/lib/pseudonymizer/uploader_spec.rb b/spec/lib/pseudonymizer/uploader_spec.rb index 7fdbc28a88a7..a9ce9ea3c7de 100644 --- a/spec/lib/pseudonymizer/uploader_spec.rb +++ b/spec/lib/pseudonymizer/uploader_spec.rb @@ -3,10 +3,11 @@ describe Pseudonymizer::Uploader do let(:base_dir) { Dir.mktmpdir } let(:options) do - Pseudonymizer::Options.new(config: Gitlab.config.pseudonymizer, - start_at: Time.now.utc) + Pseudonymizer::Options.new( + config: YAML.load_file(Rails.root.join(Gitlab.config.pseudonymizer.manifest)) + ) end - let(:remote_directory) { subject.send(:remote_directory) } + let(:remote_directory) { described_class.remote_directory } subject { described_class.new(options) } def mock_file(file_name) @@ -15,7 +16,7 @@ def mock_file(file_name) before do allow(options).to receive(:output_dir).and_return(base_dir) - stub_object_storage_pseudonymizer(options: options) + stub_object_storage_pseudonymizer 10.times {|i| mock_file("file_#{i}.test")} mock_file("schema.yml") diff --git a/spec/support/helpers/stub_object_storage.rb b/spec/support/helpers/stub_object_storage.rb index 744bfb24c985..e757f32aecb5 100644 --- a/spec/support/helpers/stub_object_storage.rb +++ b/spec/support/helpers/stub_object_storage.rb @@ -63,8 +63,8 @@ def stub_object_storage_multipart_init(endpoint, upload_id = "upload_id") EOS end - def stub_object_storage_pseudonymizer(options:) + def stub_object_storage_pseudonymizer stub_object_storage(connection_params: Pseudonymizer::Uploader.object_store_credentials, - remote_directory: options.config.upload.remote_directory) + remote_directory: Gitlab.config.pseudonymizer.upload.remote_directory) end end -- GitLab From a4b43b899d6ec8664bb7f120a8666f37b3acd91b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mica=C3=ABl=20Bergeron?= Date: Wed, 13 Jun 2018 09:02:49 -0400 Subject: [PATCH 35/63] fix failing specs --- config/gitlab.yml.example | 1 + config/initializers/1_settings.rb | 6 +++--- spec/lib/pseudonymizer/uploader_spec.rb | 4 ++-- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/config/gitlab.yml.example b/config/gitlab.yml.example index b7425e757bca..edd2114bf4b6 100644 --- a/config/gitlab.yml.example +++ b/config/gitlab.yml.example @@ -311,6 +311,7 @@ production: &base geo_migrated_local_files_clean_up_worker: cron: "15 */6 * * *" + # Export pseudonymized data in CSV format for analysis pseudonymizer_worker: cron: "0 * * * *" diff --git a/config/initializers/1_settings.rb b/config/initializers/1_settings.rb index 55b1fbc1a70d..f9df0f4aaaa7 100644 --- a/config/initializers/1_settings.rb +++ b/config/initializers/1_settings.rb @@ -370,9 +370,9 @@ Settings.cron_jobs['gitlab_usage_ping_worker']['cron'] ||= Settings.__send__(:cron_for_usage_ping) Settings.cron_jobs['gitlab_usage_ping_worker']['job_class'] = 'GitlabUsagePingWorker' -Settings.cron_jobs['pseudonymizer'] ||= Settingslogic.new({}) -Settings.cron_jobs['pseudonymizer']['cron'] ||= '0 23 * * *' -Settings.cron_jobs['pseudonymizer']['job_class'] ||= 'PseudonymizerWorker' +Settings.cron_jobs['pseudonymizer_worker'] ||= Settingslogic.new({}) +Settings.cron_jobs['pseudonymizer_worker']['cron'] ||= '0 23 * * *' +Settings.cron_jobs['pseudonymizer_worker']['job_class'] ||= 'PseudonymizerWorker' Settings.cron_jobs['schedule_update_user_activity_worker'] ||= Settingslogic.new({}) Settings.cron_jobs['schedule_update_user_activity_worker']['cron'] ||= '30 0 * * *' diff --git a/spec/lib/pseudonymizer/uploader_spec.rb b/spec/lib/pseudonymizer/uploader_spec.rb index a9ce9ea3c7de..893556b64c34 100644 --- a/spec/lib/pseudonymizer/uploader_spec.rb +++ b/spec/lib/pseudonymizer/uploader_spec.rb @@ -7,7 +7,7 @@ config: YAML.load_file(Rails.root.join(Gitlab.config.pseudonymizer.manifest)) ) end - let(:remote_directory) { described_class.remote_directory } + let(:remote_directory) { subject.send(:remote_directory) } subject { described_class.new(options) } def mock_file(file_name) @@ -27,7 +27,7 @@ def mock_file(file_name) it "upload all file in the directory" do subject.upload - expect(remote_directory.files.count).to eq(12) + expect(remote_directory.files.all.count).to eq(12) end end -- GitLab From 9c010a97922e09d76650779d77978188c608e8bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mica=C3=ABl=20Bergeron?= Date: Mon, 18 Jun 2018 13:04:32 -0400 Subject: [PATCH 36/63] apply feedback --- app/workers/pseudonymizer_worker.rb | 2 +- lib/pseudonymizer/dumper.rb | 45 +++++++++++++++-------- lib/pseudonymizer/manifest.yml | 45 ----------------------- lib/pseudonymizer/options.rb | 8 ++-- lib/tasks/gitlab/db.rake | 3 +- spec/lib/pseudonymizer/dumper_spec.rb | 53 ++++++++++++++++++++------- 6 files changed, 76 insertions(+), 80 deletions(-) diff --git a/app/workers/pseudonymizer_worker.rb b/app/workers/pseudonymizer_worker.rb index 3cf583edeb5e..389879eaf4ce 100644 --- a/app/workers/pseudonymizer_worker.rb +++ b/app/workers/pseudonymizer_worker.rb @@ -7,7 +7,7 @@ def perform options = Pseudonymizer::Options.new( config: YAML.load_file(Rails.root.join(Gitlab.config.pseudonymizer.manifest)), - start_at: Time.now.utc + output_dir: ENV['PSEUDONYMIZER_OUTPUT_DIR'] ) dumper = Pseudonymizer::Dumper.new(options) diff --git a/lib/pseudonymizer/dumper.rb b/lib/pseudonymizer/dumper.rb index 1d3704afbd50..cc688e6cb0a0 100644 --- a/lib/pseudonymizer/dumper.rb +++ b/lib/pseudonymizer/dumper.rb @@ -4,7 +4,7 @@ require 'yaml' module Pseudonymizer - PAGE_SIZE = 10000 + PAGE_SIZE = ENV.fetch('PSEUDONYMIZER_BATCH', 100_000) class Anon def initialize(fields) @@ -38,45 +38,57 @@ def initialize(options) @output_dir = options.output_dir @start_at = options.start_at + reset! + end + + def reset! @schema = Hash.new { |h, k| h[k] = {} } @output_files = [] end def tables_to_csv - tables = config["tables"] + reset! + tables = config["tables"] FileUtils.mkdir_p(output_dir) unless File.directory?(output_dir) schema_to_yml - file_list_to_json - - tables.map do |k, v| + @output_files = tables.map do |k, v| table_to_csv(k, v['whitelist'], v['pseudo']) end + file_list_to_json + + @output_files end private - def get_and_log_file_name(ext, prefix = nil, filename = nil) - file_timestamp = filename || "#{prefix}_#{@start_at.to_i}" - file_timestamp = "#{file_timestamp}.#{ext}" - @output_files << file_timestamp + def output_filename(basename = nil, ext = "csv.gz") + file_timestamp = "#{basename}.#{ext}" File.join(output_dir, file_timestamp) end def schema_to_yml - file_path = get_and_log_file_name("yml", "schema") + file_path = output_filename("schema", "yml") File.open(file_path, 'w') { |file| file.write(@schema.to_yaml) } end def file_list_to_json - file_path = get_and_log_file_name("json", nil, "file_list") - File.open(file_path, 'w') { |file| file.write(@output_files.to_json) } + file_path = output_filename("file_list", "json") + File.open(file_path, 'w') do |file| + relative_files = @output_files.map(&File.method(:basename)) + file.write(relative_files.to_json) + end end def table_to_csv(table, whitelist_columns, pseudonymity_columns) table_to_schema(table) - write_to_csv_file(table, table_page_results(table, whitelist_columns, pseudonymity_columns)) + write_to_csv_file( + table, + table_page_results(table, + whitelist_columns, + pseudonymity_columns) + ) rescue => e Rails.logger.error("Failed to export #{table}: #{e}") end @@ -134,15 +146,16 @@ def set_schema_column_types(table, type_results) end def write_to_csv_file(table, contents) - file_path = get_and_log_file_name("csv", table) + file_path = output_filename(table, "csv.gz") Rails.logger.info "#{self.class.name} writing #{table} to #{file_path}." - CSV.open(file_path, 'w') do |csv| + Zlib::GzipWriter.open(file_path) do |io| + csv = CSV.new(io) contents.with_index do |row, i| csv << row.keys if i == 0 # header csv << row.values - csv.flush if i % PAGE_SIZE end + csv.close end file_path diff --git a/lib/pseudonymizer/manifest.yml b/lib/pseudonymizer/manifest.yml index 9ff62ab0106e..702861c826fe 100644 --- a/lib/pseudonymizer/manifest.yml +++ b/lib/pseudonymizer/manifest.yml @@ -96,15 +96,8 @@ tables: - author_id - assignee_id - iid - - cached_markdown_version - updated_by_id - last_edited_by_id - - lock_version - - start_date - - end_date - - last_edited_at - - created_at - - updated_at issue_assignees: whitelist: - user_id @@ -208,8 +201,6 @@ tables: - title - color - project_id - - created_at - - updated_at - template - type - group_id @@ -423,12 +414,10 @@ tables: - created_at - updated_at - project_id - - attachment - line_code - commit_id - noteable_id - system - - st_diff - updated_by_id - type - position @@ -436,35 +425,18 @@ tables: - resolved_at - resolved_by_id - discussion_id - - note_html - - cached_markdown_version - change_position - resolved_by_push pseudo: - id - note - - noteable_type - author_id - - created_at - - updated_at - project_id - - attachment - - line_code - commit_id - noteable_id - - system - - st_diff - updated_by_id - - type - - position - - original_position - - resolved_at - resolved_by_id - discussion_id - - note_html - - cached_markdown_version - - change_position - - resolved_by_push notification_settings: whitelist: - id @@ -492,8 +464,6 @@ tables: - source_id - source_type - level - - created_at - - updated_at - new_note - new_issue - reopen_issue @@ -526,8 +496,6 @@ tables: pseudo: - id - project_id - - created_at - - updated_at - enabled - domain project_custom_attributes: @@ -540,8 +508,6 @@ tables: - value pseudo: - id - - created_at - - updated_at - project_id - key - value @@ -565,8 +531,6 @@ tables: - wiki_access_level - snippets_access_level - builds_access_level - - created_at - - updated_at - repository_access_level project_group_links: whitelist: @@ -581,8 +545,6 @@ tables: - id - project_id - group_id - - created_at - - updated_at - group_access - expires_at project_import_data: @@ -615,8 +577,6 @@ tables: - last_update_started_at - last_update_scheduled_at - next_execution_timestamp - - created_at - - updated_at project_repository_states: whitelist: - id @@ -730,8 +690,6 @@ tables: - name - path - description - - created_at - - updated_at - creator_id - namespace_id - last_activity_at @@ -875,7 +833,6 @@ tables: pseudo: - id - email - - remember_created_at - current_sign_in_ip - last_sign_in_ip - name @@ -897,12 +854,10 @@ tables: - hide_project_limit - note - unlock_token - - otp_grace_period_started_at - external - incoming_email_token - organization - auditor - two_factor_grace_period - - ghost - rss_token - theme_id diff --git a/lib/pseudonymizer/options.rb b/lib/pseudonymizer/options.rb index 18809859977a..990d01d0946b 100644 --- a/lib/pseudonymizer/options.rb +++ b/lib/pseudonymizer/options.rb @@ -2,14 +2,14 @@ module Pseudonymizer class Options attr_reader :config attr_reader :start_at + attr_reader :output_dir - def initialize(config: {}) + def initialize(config: {}, output_dir: nil) @config = config @start_at = Time.now.utc - end - def output_dir - File.join(Dir.tmpdir, 'gitlab-pseudonymizer', start_at.iso8601) + base_dir = output_dir || File.join(Dir.tmpdir, 'gitlab-pseudonymizer') + @output_dir = File.join(base_dir, start_at.iso8601) end def upload_dir diff --git a/lib/tasks/gitlab/db.rake b/lib/tasks/gitlab/db.rake index 6c04dce61974..3c39be2afdb9 100644 --- a/lib/tasks/gitlab/db.rake +++ b/lib/tasks/gitlab/db.rake @@ -76,7 +76,8 @@ namespace :gitlab do abort "The pseudonymizer is disabled." unless Gitlab::CurrentSettings.pseudonymizer_enabled? options = Pseudonymizer::Options.new( - config: YAML.load_file(Rails.root.join(Gitlab.config.pseudonymizer.manifest)) + config: YAML.load_file(Rails.root.join(Gitlab.config.pseudonymizer.manifest)), + output_dir: ENV['PSEUDONYMIZER_OUTPUT_DIR'] ) dumper = Pseudonymizer::Dumper.new(options) diff --git a/spec/lib/pseudonymizer/dumper_spec.rb b/spec/lib/pseudonymizer/dumper_spec.rb index abb6723e819f..e6163bb49a2a 100644 --- a/spec/lib/pseudonymizer/dumper_spec.rb +++ b/spec/lib/pseudonymizer/dumper_spec.rb @@ -20,9 +20,10 @@ describe 'Pseudo tables' do it 'outputs project tables to csv' do + column_names = %w(id name path description) pseudo.config["tables"] = { "projects" => { - "whitelist" => %w(id name path description), + "whitelist" => column_names, "pseudo" => %w(id) } } @@ -31,26 +32,52 @@ # grab the first table it outputs. There would only be 1. project_table_file = pseudo.tables_to_csv[0] + expect(project_table_file).to include("projects.csv.gz") - expect(project_table_file.include? "projects_").to be true - expect(project_table_file.include? ".csv").to be true columns = [] project_data = [] - File.foreach(project_table_file).with_index do |line, line_num| - if line_num == 0 - columns = line.split(",") - elsif line_num == 1 - project_data = line.split(",") - break - end + Zlib::GzipReader.open(project_table_file) do |gz| + csv = CSV.new(gz, headers: true) + # csv.shift # read the header row + project_data = csv.gets + columns = csv.headers end + # check if CSV columns are correct - expect(columns.to_set).to eq(%W(id name path description\n).to_set) + expect(columns).to include(*column_names) # is it pseudonymous - expect(project_data[0]).not_to eq(1) # sha 256 is 64 chars in length - expect(project_data[0].length).to eq(64) + expect(project_data["id"].length).to eq(64) + end + end + + describe "manifest is valid" do + it "all tables exist" do + existing_tables = ActiveRecord::Base.connection.tables + tables = options.config['tables'].keys + + expect(existing_tables).to include(*tables) + end + + it "all whitelisted attributes exist" do + options.config['tables'].each do |table, table_def| + whitelisted = table_def['whitelist'] + existing_columns = ActiveRecord::Base.connection.columns(table.to_sym).map(&:name) + diff = whitelisted - existing_columns + + expect(diff).to be_empty, "#{table} should define columns #{whitelisted.inspect}: missing #{diff.inspect}" + end + end + + it "all pseudonymized attributes are whitelisted" do + options.config['tables'].each do |table, table_def| + whitelisted = table_def['whitelist'] + pseudonymized = table_def['pseudo'] + diff = pseudonymized - whitelisted + + expect(diff).to be_empty, "#{table} should whitelist columns #{pseudonymized.inspect}: missing #{diff.inspect}" + end end end end -- GitLab From f7bb758881e6c49b755b53e384350829fa79765a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mica=C3=ABl=20Bergeron?= Date: Mon, 18 Jun 2018 13:29:30 -0400 Subject: [PATCH 37/63] =?UTF-8?q?`rss=5Ftoken`=20=E2=86=92=20`feed=5Ftoken?= =?UTF-8?q?`?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- lib/pseudonymizer/manifest.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/pseudonymizer/manifest.yml b/lib/pseudonymizer/manifest.yml index 702861c826fe..7e208a458c4c 100644 --- a/lib/pseudonymizer/manifest.yml +++ b/lib/pseudonymizer/manifest.yml @@ -828,7 +828,7 @@ tables: - notified_of_own_activity - support_bot - preferred_language - - rss_token + - feed_token - theme_id pseudo: - id @@ -859,5 +859,5 @@ tables: - organization - auditor - two_factor_grace_period - - rss_token + - feed_token - theme_id -- GitLab From 1b121bf0ef74e398e6eed674aa61c9c1ce471331 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mica=C3=ABl=20Bergeron?= Date: Mon, 18 Jun 2018 15:04:38 -0400 Subject: [PATCH 38/63] add changelog --- ee/changelogs/unreleased/gitlab-elt.yml | 5 +++++ lib/pseudonymizer/dumper.rb | 1 + 2 files changed, 6 insertions(+) create mode 100644 ee/changelogs/unreleased/gitlab-elt.yml diff --git a/ee/changelogs/unreleased/gitlab-elt.yml b/ee/changelogs/unreleased/gitlab-elt.yml new file mode 100644 index 000000000000..3bc6dc504a02 --- /dev/null +++ b/ee/changelogs/unreleased/gitlab-elt.yml @@ -0,0 +1,5 @@ +--- +title: Pseudonymizer to safely export data for analytics. +merge_request: 5532 +author: +type: added diff --git a/lib/pseudonymizer/dumper.rb b/lib/pseudonymizer/dumper.rb index cc688e6cb0a0..afdae77de236 100644 --- a/lib/pseudonymizer/dumper.rb +++ b/lib/pseudonymizer/dumper.rb @@ -91,6 +91,7 @@ def table_to_csv(table, whitelist_columns, pseudonymity_columns) ) rescue => e Rails.logger.error("Failed to export #{table}: #{e}") + raise e end # yield every results, pagined, anonymized -- GitLab From cc732a2d7962f55c596141670b4e31c0ea428841 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mica=C3=ABl=20Bergeron?= Date: Mon, 18 Jun 2018 16:09:29 -0400 Subject: [PATCH 39/63] reworking the manifest - removing tokens - leave dates as-is --- lib/pseudonymizer/manifest.yml | 50 +--------------------------------- 1 file changed, 1 insertion(+), 49 deletions(-) diff --git a/lib/pseudonymizer/manifest.yml b/lib/pseudonymizer/manifest.yml index 7e208a458c4c..4c48e1c8ae6a 100644 --- a/lib/pseudonymizer/manifest.yml +++ b/lib/pseudonymizer/manifest.yml @@ -70,6 +70,7 @@ tables: - updated_at pseudo: - id + - epic_id epics: whitelist: - id @@ -78,7 +79,6 @@ tables: - author_id - assignee_id - iid - - cached_markdown_version - updated_by_id - last_edited_by_id - lock_version @@ -156,11 +156,9 @@ tables: - project_id - description - milestone_id - - state - updated_by_id - moved_to_id - discussion_locked - - closed_at label_links: whitelist: - id @@ -394,10 +392,6 @@ tables: - request_access_enabled - ldap_sync_status - ldap_sync_error - - ldap_sync_last_update_at - - ldap_sync_last_successful_update_at - - ldap_sync_last_sync_at - - lfs_enabled - parent_id - shared_runners_minutes_limit - repository_size_limit @@ -546,22 +540,13 @@ tables: - project_id - group_id - group_access - - expires_at project_import_data: whitelist: - id - project_id - - data - - encrypted_credentials - - encrypted_credentials_iv - - encrypted_credentials_salt pseudo: - id - project_id - - data - - encrypted_credentials - - encrypted_credentials_iv - - encrypted_credentials_salt project_mirror_data: whitelist: - id @@ -573,10 +558,6 @@ tables: pseudo: - id - project_id - - retry_count - - last_update_started_at - - last_update_scheduled_at - - next_execution_timestamp project_repository_states: whitelist: - id @@ -647,7 +628,6 @@ tables: - import_error - ci_id - shared_runners_enabled - - runners_token - build_coverage_regex - build_allow_git_fetch - build_timeout @@ -672,7 +652,6 @@ tables: - auto_cancel_pending_pipelines - service_desk_enabled - import_jid - - cached_markdown_version - delete_error - last_repository_updated_at - disable_overriding_approvers_per_merge_request @@ -681,10 +660,8 @@ tables: - remote_mirror_available_overridden - only_mirror_protected_branches - pull_mirror_available_overridden - - jobs_cache_index - mirror_overwrites_diverged_branches - external_authorization_classification_label - - external_webhook_token pseudo: - id - name @@ -692,7 +669,6 @@ tables: - description - creator_id - namespace_id - - last_activity_at - import_url - visibility_level - archived @@ -700,21 +676,17 @@ tables: - import_status - merge_requests_template - star_count - - merge_requests_rebase_enabled - import_type - import_source - approvals_before_merge - reset_approvals_on_push - - merge_requests_ff_only_enabled - issues_template - mirror - - mirror_last_update_at - mirror_last_successful_update_at - mirror_user_id - import_error - ci_id - shared_runners_enabled - - runners_token - build_coverage_regex - build_allow_git_fetch - build_timeout @@ -722,36 +694,24 @@ tables: - pending_delete - public_builds - last_repository_check_failed - - last_repository_check_at - - container_registry_enabled - only_allow_merge_if_pipeline_succeeds - - has_external_issue_tracker - repository_storage - repository_read_only - - request_access_enabled - - has_external_wiki - ci_config_path - - lfs_enabled - description_html - only_allow_merge_if_all_discussions_are_resolved - repository_size_limit - - printing_merge_request_link_enabled - auto_cancel_pending_pipelines - - service_desk_enabled - import_jid - - cached_markdown_version - delete_error - last_repository_updated_at - disable_overriding_approvers_per_merge_request - storage_version - resolve_outdated_diff_discussions - remote_mirror_available_overridden - - only_mirror_protected_branches - pull_mirror_available_overridden - - jobs_cache_index - mirror_overwrites_diverged_branches - external_authorization_classification_label - - external_webhook_token subscriptions: whitelist: - id @@ -797,7 +757,6 @@ tables: - created_by_id - last_credential_check_at - avatar - - confirmation_token - confirmed_at - confirmation_sent_at - unconfirmed_email @@ -815,10 +774,8 @@ tables: - layout - hide_project_limit - note - - unlock_token - otp_grace_period_started_at - external - - incoming_email_token - organization - auditor - require_two_factor_authentication_from_group @@ -828,7 +785,6 @@ tables: - notified_of_own_activity - support_bot - preferred_language - - feed_token - theme_id pseudo: - id @@ -843,7 +799,6 @@ tables: - username - created_by_id - avatar - - confirmation_token - unconfirmed_email - hide_no_ssh_key - website_url @@ -853,11 +808,8 @@ tables: - consumed_timestep - hide_project_limit - note - - unlock_token - external - - incoming_email_token - organization - auditor - two_factor_grace_period - - feed_token - theme_id -- GitLab From f71fbe7aeb3605038e41e07ac458e5fe75997681 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mica=C3=ABl=20Bergeron?= Date: Mon, 18 Jun 2018 16:18:42 -0400 Subject: [PATCH 40/63] fix the anonymization with non-string data --- lib/pseudonymizer/dumper.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/pseudonymizer/dumper.rb b/lib/pseudonymizer/dumper.rb index afdae77de236..0bd15a58285d 100644 --- a/lib/pseudonymizer/dumper.rb +++ b/lib/pseudonymizer/dumper.rb @@ -22,7 +22,7 @@ def anonymize(results) to_filter.each do |field| next if result[field].nil? - result[field] = OpenSSL::HMAC.hexdigest(digest, key, result[field]) + result[field] = OpenSSL::HMAC.hexdigest(digest, key, String(result[field])) end yielder << result end -- GitLab From d50c4078eb59c272f34fea030438c19ef53b44d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mica=C3=ABl=20Bergeron?= Date: Tue, 19 Jun 2018 08:37:22 -0400 Subject: [PATCH 41/63] apply feedback from review --- lib/pseudonymizer/dumper.rb | 5 ++--- lib/pseudonymizer/options.rb | 2 +- lib/pseudonymizer/uploader.rb | 5 ++++- spec/lib/pseudonymizer/uploader_spec.rb | 8 ++++---- 4 files changed, 11 insertions(+), 9 deletions(-) diff --git a/lib/pseudonymizer/dumper.rb b/lib/pseudonymizer/dumper.rb index 0bd15a58285d..6d001c14c7d8 100644 --- a/lib/pseudonymizer/dumper.rb +++ b/lib/pseudonymizer/dumper.rb @@ -64,8 +64,7 @@ def tables_to_csv private def output_filename(basename = nil, ext = "csv.gz") - file_timestamp = "#{basename}.#{ext}" - File.join(output_dir, file_timestamp) + File.join(output_dir, "#{basename}.#{ext}") end def schema_to_yml @@ -147,7 +146,7 @@ def set_schema_column_types(table, type_results) end def write_to_csv_file(table, contents) - file_path = output_filename(table, "csv.gz") + file_path = output_filename(table) Rails.logger.info "#{self.class.name} writing #{table} to #{file_path}." Zlib::GzipWriter.open(file_path) do |io| diff --git a/lib/pseudonymizer/options.rb b/lib/pseudonymizer/options.rb index 990d01d0946b..255e64be6630 100644 --- a/lib/pseudonymizer/options.rb +++ b/lib/pseudonymizer/options.rb @@ -13,7 +13,7 @@ def initialize(config: {}, output_dir: nil) end def upload_dir - File.join(start_at.iso8601) + start_at.iso8601 end end end diff --git a/lib/pseudonymizer/uploader.rb b/lib/pseudonymizer/uploader.rb index e2ea01456c88..a5937a7ed3b8 100644 --- a/lib/pseudonymizer/uploader.rb +++ b/lib/pseudonymizer/uploader.rb @@ -34,7 +34,10 @@ def cleanup return unless File.exist?(@output_dir) progress_output.print "Deleting tmp directory #{@output_dir} ... " - progress_output.puts FileUtils.rm_rf(@output_dir) ? "done".color(:green) : "failed".color(:red) + FileUtils.rm_rf(@output_dir) + progress_output.puts "done".color(:green) + rescue + progress_output.puts "failed".color(:red) end private diff --git a/spec/lib/pseudonymizer/uploader_spec.rb b/spec/lib/pseudonymizer/uploader_spec.rb index 893556b64c34..0aa668048d5c 100644 --- a/spec/lib/pseudonymizer/uploader_spec.rb +++ b/spec/lib/pseudonymizer/uploader_spec.rb @@ -23,6 +23,10 @@ def mock_file(file_name) mock_file("file_list.json") end + after do + FileUtils.rm_rf(base_dir) + end + describe "#upload" do it "upload all file in the directory" do subject.upload @@ -38,8 +42,4 @@ def mock_file(file_name) expect(Dir[File.join(base_dir, "*")].length).to eq(0) end end - - after do - FileUtils.rm_rf(base_dir) - end end -- GitLab From e4e45f430d39479684f0eb89de555d2552a2009a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mica=C3=ABl=20Bergeron?= Date: Tue, 19 Jun 2018 10:43:08 -0400 Subject: [PATCH 42/63] add documentation --- doc/administration/pseudonymizer.md | 55 +++++++++++++++++++++++++++++ doc/raketasks/pseudonymizer.md | 52 +++++++++++++++++++++++++++ 2 files changed, 107 insertions(+) create mode 100644 doc/administration/pseudonymizer.md create mode 100644 doc/raketasks/pseudonymizer.md diff --git a/doc/administration/pseudonymizer.md b/doc/administration/pseudonymizer.md new file mode 100644 index 000000000000..5c1694a2d1f2 --- /dev/null +++ b/doc/administration/pseudonymizer.md @@ -0,0 +1,55 @@ +# Pseudonymizer + +## Object Storage Settings + +**In Omnibus installations:** + +1. Edit `/etc/gitlab/gitlab.rb` and add the following lines by replacing with + the values you want: + + ```ruby + gitlab_rails['pseudonymizer_enabled'] = true + gitlab_rails['pseudonymizer_manifest'] = 'lib/pseudonymizer/manifest.yml' + gitlab_rails['pseudonymizer_upload_remote_directory'] = 'gitlab-elt' + gitlab_rails['pseudonymizer_upload_connection'] = { + 'provider' => 'AWS', + 'region' => 'eu-central-1', + 'aws_access_key_id' => 'AWS_ACCESS_KEY_ID', + 'aws_secret_access_key' => 'AWS_SECRET_ACCESS_KEY' + } + ``` + +>**Note:** +If you are using AWS IAM profiles, be sure to omit the AWS access key and secret access key/value pairs. + + ```ruby + gitlab_rails['pseudonymizer_upload_connection'] = { + 'provider' => 'AWS', + 'region' => 'eu-central-1', + 'use_iam_profile' => true + } + ``` + +1. Save the file and [reconfigure GitLab][] for the changes to take effect. + +--- + +**In installations from source:** + +1. Edit `/home/git/gitlab/config/gitlab.yml` and add or amend the following + lines: + + ```yaml + pseudonymizer: + enabled: true + manifest: lib/pseudonymizer/manifest.yml + upload: + remote_directory: 'gitlab-elt' # The bucket name + connection: + provider: AWS # Only AWS supported at the moment + aws_access_key_id: AWS_ACESS_KEY_ID + aws_secret_access_key: AWS_SECRET_ACCESS_KEY + region: eu-central-1 + ``` + +1. Save the file and [restart GitLab][] for the changes to take effect. diff --git a/doc/raketasks/pseudonymizer.md b/doc/raketasks/pseudonymizer.md new file mode 100644 index 000000000000..17b80b93477f --- /dev/null +++ b/doc/raketasks/pseudonymizer.md @@ -0,0 +1,52 @@ +# Pseudonymizer + +> [Introduced](https://gitlab.com/gitlab-org/gitlab-ee/merge_requests/5532) in [GitLab Enterprise Edition][ee] 11.1 + +## Export GitLab's data for safe analytics + +As the GitLab's database host sensitive informations, using it unfiltered for analytics implies high security requirements. To help alleviate this constraint, the Pseudonymizer service shall export GitLab's data, in a pseudonymized way. + +### Pseudonymization + +> **Note:** +> This process is not impervious: if the source data is available, it is possible for an user to correlate data to the pseudonymized version. + +The Pseudonymizer currently uses `HMAC(SHA256)` to mutate fields that should not textually exported. This should ensure that: + + - End-user of the data source cannot infer/revert the pseudonymized fields + - Referencial integrity is maintained + +### Manifest + +The manifest is a file that describe which fields should be included or pseudonymized. + +You may find this manifest at `lib/pseudonymizer/manifest.yml`. + +### Usage + +> **Note:** +> You can configure the pseudonymizer using the following environment variables: +> +> - PSEUDONYMIZER_OUTPUT_DIR: where to store the output CSV files (default: `/tmp`) +> - PSEUDONYMIZER_BATCH: the batch size when querying the DB (default: `100 000`) + +> **Note:** +> Object store is required for the pseudonymizer to work properly. + +``` +bundle exec rake gitlab:db:pseudonymizer +``` + +### Output + +> **Note:** +> The output CSV files might be very large. Make sure the `PSEUDONYMIZER_OUTPUT_DIR` has sufficient space. As a rule of thumb, at least 10% of the database size is recommended. + +After the pseudonymizer has run, the output CSV files should be uploaded to the configured object store. + +### Configuration + +See [administration]. + +[ee]: https://about.gitlab.com/products/ +[administration]: administration/pseudonymizer.md -- GitLab From a30eeb1d0576e4c18aff5b741d3da2f4a7b9815f Mon Sep 17 00:00:00 2001 From: Achilleas Pipinellis Date: Tue, 19 Jun 2018 17:26:48 +0200 Subject: [PATCH 43/63] Copyedit Pseudonymizer docs --- doc/administration/index.md | 4 ++ doc/administration/pseudonymizer.md | 62 +++++++++++++++++++++++++---- doc/raketasks/pseudonymizer.md | 52 ------------------------ 3 files changed, 59 insertions(+), 59 deletions(-) delete mode 100644 doc/raketasks/pseudonymizer.md diff --git a/doc/administration/index.md b/doc/administration/index.md index 56bf666c5254..ab00548d9438 100644 --- a/doc/administration/index.md +++ b/doc/administration/index.md @@ -167,6 +167,10 @@ created in snippets, wikis, and repos. - [Request Profiling](monitoring/performance/request_profiling.md): Get a detailed profile on slow requests. - [Performance Bar](monitoring/performance/performance_bar.md): Get performance information for the current page. +## Analytics + +- [Pseudonymizer](pseudonymizer.md): Export data from GitLab's database to CSV files in a secure way. + ## Troubleshooting - [Debugging tips](troubleshooting/debug.md): Tips to debug problems when things go wrong diff --git a/doc/administration/pseudonymizer.md b/doc/administration/pseudonymizer.md index 5c1694a2d1f2..fe21025e3c20 100644 --- a/doc/administration/pseudonymizer.md +++ b/doc/administration/pseudonymizer.md @@ -1,8 +1,30 @@ # Pseudonymizer -## Object Storage Settings +> [Introduced](https://gitlab.com/gitlab-org/gitlab-ee/merge_requests/5532) in [GitLab Ultimate][ee] 11.1. -**In Omnibus installations:** +As GitLab's database hosts sensitive information, using it unfiltered for analytics +implies high security requirements. To help alleviate this constraint, the Pseudonymizer +service is used to export GitLab's data in a pseudonymized way. + +CAUTION: **Warning:** +This process is not impervious. If the source data is available, it's possible for +a user to correlate data to the pseudonymized version. + +The Pseudonymizer currently uses `HMAC(SHA256)` to mutate fields that shouldn't +be textually exported. This ensures that: + +- the end-user of the data source cannot infer/revert the pseudonymized fields +- the referential integrity is maintained + +## Configuration + +To configure the pseudonymizer, you need to: + +- Provide a manifest file that describes which fields should be included or + pseudonymized ([example `manifest.yml` file]()). +- Use an object storage + +**For Omnibus installations:** 1. Edit `/etc/gitlab/gitlab.rb` and add the following lines by replacing with the values you want: @@ -19,8 +41,8 @@ } ``` ->**Note:** -If you are using AWS IAM profiles, be sure to omit the AWS access key and secret access key/value pairs. + NOTE: **Note:** + If you are using AWS IAM profiles, be sure to omit the AWS access key and secret access key/value pairs. ```ruby gitlab_rails['pseudonymizer_upload_connection'] = { @@ -30,11 +52,12 @@ If you are using AWS IAM profiles, be sure to omit the AWS access key and secret } ``` -1. Save the file and [reconfigure GitLab][] for the changes to take effect. +1. Save the file and [reconfigure GitLab](restart_gitlab.md#omnibus-gitlab-reconfigure) + for the changes to take effect. --- -**In installations from source:** +**For installations from source:** 1. Edit `/home/git/gitlab/config/gitlab.yml` and add or amend the following lines: @@ -52,4 +75,29 @@ If you are using AWS IAM profiles, be sure to omit the AWS access key and secret region: eu-central-1 ``` -1. Save the file and [restart GitLab][] for the changes to take effect. +1. Save the file and [restart GitLab](restart_gitlab.md#installations-from-source) + for the changes to take effect. + +## Usage + +You can optionally run the pseudonymizer using the following environment variables: + +- `PSEUDONYMIZER_OUTPUT_DIR` - where to store the output CSV files (defaults to `/tmp`) +- `PSEUDONYMIZER_BATCH` - the batch size when querying the DB (defaults to `100000`) + +```bash +## Omnibus +sudo gitlab-rake gitlab:db:pseudonymizer + +## Source +sudo -u git -H bundle exec rake gitlab:db:pseudonymizer RAILS_ENV=production +``` + +This will produce some CSV files that might be very large, so make sure the +`PSEUDONYMIZER_OUTPUT_DIR` has sufficient space. As a rule of thumb, at least +10% of the database size is recommended. + +After the pseudonymizer has run, the output CSV files should be uploaded to the +configured object storage. + +[ee]: https://about.gitlab.com/pricing/ diff --git a/doc/raketasks/pseudonymizer.md b/doc/raketasks/pseudonymizer.md deleted file mode 100644 index 17b80b93477f..000000000000 --- a/doc/raketasks/pseudonymizer.md +++ /dev/null @@ -1,52 +0,0 @@ -# Pseudonymizer - -> [Introduced](https://gitlab.com/gitlab-org/gitlab-ee/merge_requests/5532) in [GitLab Enterprise Edition][ee] 11.1 - -## Export GitLab's data for safe analytics - -As the GitLab's database host sensitive informations, using it unfiltered for analytics implies high security requirements. To help alleviate this constraint, the Pseudonymizer service shall export GitLab's data, in a pseudonymized way. - -### Pseudonymization - -> **Note:** -> This process is not impervious: if the source data is available, it is possible for an user to correlate data to the pseudonymized version. - -The Pseudonymizer currently uses `HMAC(SHA256)` to mutate fields that should not textually exported. This should ensure that: - - - End-user of the data source cannot infer/revert the pseudonymized fields - - Referencial integrity is maintained - -### Manifest - -The manifest is a file that describe which fields should be included or pseudonymized. - -You may find this manifest at `lib/pseudonymizer/manifest.yml`. - -### Usage - -> **Note:** -> You can configure the pseudonymizer using the following environment variables: -> -> - PSEUDONYMIZER_OUTPUT_DIR: where to store the output CSV files (default: `/tmp`) -> - PSEUDONYMIZER_BATCH: the batch size when querying the DB (default: `100 000`) - -> **Note:** -> Object store is required for the pseudonymizer to work properly. - -``` -bundle exec rake gitlab:db:pseudonymizer -``` - -### Output - -> **Note:** -> The output CSV files might be very large. Make sure the `PSEUDONYMIZER_OUTPUT_DIR` has sufficient space. As a rule of thumb, at least 10% of the database size is recommended. - -After the pseudonymizer has run, the output CSV files should be uploaded to the configured object store. - -### Configuration - -See [administration]. - -[ee]: https://about.gitlab.com/products/ -[administration]: administration/pseudonymizer.md -- GitLab From ed3449b54e99a8fa94cb8a35ab7726f94ef3746c Mon Sep 17 00:00:00 2001 From: Achilleas Pipinellis Date: Tue, 19 Jun 2018 17:39:21 +0200 Subject: [PATCH 44/63] Add that files are deleted from local disk after they're uploaded --- doc/administration/pseudonymizer.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/administration/pseudonymizer.md b/doc/administration/pseudonymizer.md index fe21025e3c20..8e767ba4ffb6 100644 --- a/doc/administration/pseudonymizer.md +++ b/doc/administration/pseudonymizer.md @@ -98,6 +98,6 @@ This will produce some CSV files that might be very large, so make sure the 10% of the database size is recommended. After the pseudonymizer has run, the output CSV files should be uploaded to the -configured object storage. +configured object storage and deleted from the local disk. [ee]: https://about.gitlab.com/pricing/ -- GitLab From 679240a1e42f1344d9ba1ce1597618e108880f2d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mica=C3=ABl=20Bergeron?= Date: Tue, 19 Jun 2018 11:57:23 -0400 Subject: [PATCH 45/63] apply feedback --- lib/pseudonymizer/dumper.rb | 39 +++++++++++---------- spec/lib/pseudonymizer/dumper_spec.rb | 8 ++--- spec/support/helpers/stub_object_storage.rb | 2 +- 3 files changed, 26 insertions(+), 23 deletions(-) diff --git a/lib/pseudonymizer/dumper.rb b/lib/pseudonymizer/dumper.rb index 6d001c14c7d8..8f9625cf6658 100644 --- a/lib/pseudonymizer/dumper.rb +++ b/lib/pseudonymizer/dumper.rb @@ -34,7 +34,7 @@ class Dumper attr_accessor :config, :output_dir def initialize(options) - @config = options.config + @config = options.config.deep_symbolize_keys @output_dir = options.output_dir @start_at = options.start_at @@ -49,15 +49,15 @@ def reset! def tables_to_csv reset! - tables = config["tables"] + tables = config[:tables] FileUtils.mkdir_p(output_dir) unless File.directory?(output_dir) schema_to_yml @output_files = tables.map do |k, v| - table_to_csv(k, v['whitelist'], v['pseudo']) - end - file_list_to_json + table_to_csv(k, v[:whitelist], v[:pseudo]) + end.compact + file_list_to_json @output_files end @@ -69,15 +69,13 @@ def output_filename(basename = nil, ext = "csv.gz") def schema_to_yml file_path = output_filename("schema", "yml") - File.open(file_path, 'w') { |file| file.write(@schema.to_yaml) } + File.write(file_path, @schema.to_yaml) end def file_list_to_json file_path = output_filename("file_list", "json") - File.open(file_path, 'w') do |file| - relative_files = @output_files.map(&File.method(:basename)) - file.write(relative_files.to_json) - end + relative_files = @output_files.map(&File.method(:basename)) + File.write(file_path, relative_files.to_json) end def table_to_csv(table, whitelist_columns, pseudonymity_columns) @@ -120,20 +118,24 @@ def table_page_results(table, whitelist_columns, pseudonymity_columns) end def table_to_schema(table) + whitelisted = ->(table) { @config.dig(:tables, table, :whitelist) } + pseudonymized = ->(table) { @config.dig(:tables, table, :pseudo) } + type_results = ActiveRecord::Base.connection.columns(table) type_results = type_results.select do |c| - @config["tables"][table]["whitelist"].include?(c.name) + whitelisted[table].include?(c.name) end type_results = type_results.map do |c| data_type = c.sql_type - if @config["tables"][table]["pseudo"].include?(c.name) + if pseudonymized[table].include?(c.name) data_type = "character varying" end { name: c.name, data_type: data_type } end + set_schema_column_types(table, type_results) end @@ -141,24 +143,25 @@ def set_schema_column_types(table, type_results) type_results.each do |type_result| @schema[table][type_result[:name]] = type_result[:data_type] end + # hard coded because all mapping keys in GL are id @schema[table]["gl_mapping_key"] = "id" end def write_to_csv_file(table, contents) file_path = output_filename(table) + headers = contents.peek.keys Rails.logger.info "#{self.class.name} writing #{table} to #{file_path}." Zlib::GzipWriter.open(file_path) do |io| - csv = CSV.new(io) - contents.with_index do |row, i| - csv << row.keys if i == 0 # header - csv << row.values - end - csv.close + csv = CSV.new(io, headers: headers, write_headers: true) + contents.each { |row| csv << row.values } end file_path + rescue StopIteration + Rails.logger.info "#{self.class.name} table #{table} is empty." + nil end end end diff --git a/spec/lib/pseudonymizer/dumper_spec.rb b/spec/lib/pseudonymizer/dumper_spec.rb index e6163bb49a2a..b847b9d2f18f 100644 --- a/spec/lib/pseudonymizer/dumper_spec.rb +++ b/spec/lib/pseudonymizer/dumper_spec.rb @@ -21,10 +21,10 @@ describe 'Pseudo tables' do it 'outputs project tables to csv' do column_names = %w(id name path description) - pseudo.config["tables"] = { - "projects" => { - "whitelist" => column_names, - "pseudo" => %w(id) + pseudo.config[:tables] = { + projects: { + whitelist: column_names, + pseudo: %w(id) } } diff --git a/spec/support/helpers/stub_object_storage.rb b/spec/support/helpers/stub_object_storage.rb index e757f32aecb5..b1e2b5365b73 100644 --- a/spec/support/helpers/stub_object_storage.rb +++ b/spec/support/helpers/stub_object_storage.rb @@ -65,6 +65,6 @@ def stub_object_storage_multipart_init(endpoint, upload_id = "upload_id") def stub_object_storage_pseudonymizer stub_object_storage(connection_params: Pseudonymizer::Uploader.object_store_credentials, - remote_directory: Gitlab.config.pseudonymizer.upload.remote_directory) + remote_directory: Pseudonymizer::Uploader.remote_directory) end end -- GitLab From 184f612d02bf4988e8eab91ff233502673d55d83 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mica=C3=ABl=20Bergeron?= Date: Tue, 19 Jun 2018 17:47:41 -0400 Subject: [PATCH 46/63] apply feedback --- app/workers/pseudonymizer_worker.rb | 2 +- config/initializers/1_settings.rb | 2 +- db/schema.rb | 2 +- doc/administration/pseudonymizer.md | 2 +- ee/app/models/license.rb | 2 +- ...nymizer_enabled_to_application_settings.rb | 2 +- lib/pseudonymizer/dumper.rb | 84 +++++++++++++++---- lib/pseudonymizer/manifest.yml | 28 ------- lib/tasks/gitlab/db.rake | 6 +- spec/lib/pseudonymizer/dumper_spec.rb | 2 +- spec/lib/pseudonymizer/uploader_spec.rb | 2 +- 11 files changed, 78 insertions(+), 56 deletions(-) diff --git a/app/workers/pseudonymizer_worker.rb b/app/workers/pseudonymizer_worker.rb index 389879eaf4ce..3030fc8c7cbf 100644 --- a/app/workers/pseudonymizer_worker.rb +++ b/app/workers/pseudonymizer_worker.rb @@ -6,7 +6,7 @@ def perform return unless Gitlab::CurrentSettings.pseudonymizer_enabled? options = Pseudonymizer::Options.new( - config: YAML.load_file(Rails.root.join(Gitlab.config.pseudonymizer.manifest)), + config: YAML.load_file(Gitlab.config.pseudonymizer.manifest), output_dir: ENV['PSEUDONYMIZER_OUTPUT_DIR'] ) diff --git a/config/initializers/1_settings.rb b/config/initializers/1_settings.rb index f9df0f4aaaa7..a307d1e49229 100644 --- a/config/initializers/1_settings.rb +++ b/config/initializers/1_settings.rb @@ -479,7 +479,7 @@ # Settings['pseudonymizer'] ||= Settingslogic.new({}) Settings.pseudonymizer['enabled'] = false if Settings.pseudonymizer['enabled'].nil? -Settings.pseudonymizer['manifest'] = Settings.pseudonymizer['manifest'] || "lib/pseudonymizer/manifest.yml" +Settings.pseudonymizer['manifest'] = Settings.absolute(Settings.pseudonymizer['manifest'] || Rails.root.join("lib/pseudonymizer/manifest.yml")) Settings.pseudonymizer['upload'] ||= Settingslogic.new({ 'remote_directory' => nil, 'connection' => nil }) # Settings.pseudonymizer['upload']['multipart_chunk_size'] ||= 104857600 diff --git a/db/schema.rb b/db/schema.rb index 224e1be0bcb2..b153f7baad52 100644 --- a/db/schema.rb +++ b/db/schema.rb @@ -206,7 +206,7 @@ t.string "encrypted_external_auth_client_key_pass_iv" t.string "email_additional_text" t.boolean "enforce_terms", default: false - t.boolean "pseudonymizer_enabled" + t.boolean "pseudonymizer_enabled", default: false, null: false end create_table "approvals", force: :cascade do |t| diff --git a/doc/administration/pseudonymizer.md b/doc/administration/pseudonymizer.md index 8e767ba4ffb6..a7522849e55e 100644 --- a/doc/administration/pseudonymizer.md +++ b/doc/administration/pseudonymizer.md @@ -70,7 +70,7 @@ To configure the pseudonymizer, you need to: remote_directory: 'gitlab-elt' # The bucket name connection: provider: AWS # Only AWS supported at the moment - aws_access_key_id: AWS_ACESS_KEY_ID + aws_access_key_id: AWS_ACCESS_KEY_ID aws_secret_access_key: AWS_SECRET_ACCESS_KEY region: eu-central-1 ``` diff --git a/ee/app/models/license.rb b/ee/app/models/license.rb index 5e327b373c31..8eb960960405 100644 --- a/ee/app/models/license.rb +++ b/ee/app/models/license.rb @@ -31,7 +31,6 @@ class License < ActiveRecord::Base repository_mirrors repository_size_limit scoped_issue_board - pseudonymizer ].freeze EEP_FEATURES = EES_FEATURES + %i[ @@ -74,6 +73,7 @@ class License < ActiveRecord::Base ide chatops pod_logs + pseudonymizer ].freeze # List all features available for early adopters, diff --git a/ee/db/migrate/20180531221734_add_pseudonymizer_enabled_to_application_settings.rb b/ee/db/migrate/20180531221734_add_pseudonymizer_enabled_to_application_settings.rb index eba6930ba0ea..7517e78a6187 100644 --- a/ee/db/migrate/20180531221734_add_pseudonymizer_enabled_to_application_settings.rb +++ b/ee/db/migrate/20180531221734_add_pseudonymizer_enabled_to_application_settings.rb @@ -26,6 +26,6 @@ class AddPseudonymizerEnabledToApplicationSettings < ActiveRecord::Migration # disable_ddl_transaction! def change - add_column :application_settings, :pseudonymizer_enabled, :boolean + add_column :application_settings, :pseudonymizer_enabled, :boolean, null: false, default: false end end diff --git a/lib/pseudonymizer/dumper.rb b/lib/pseudonymizer/dumper.rb index 8f9625cf6658..6babfda1a58f 100644 --- a/lib/pseudonymizer/dumper.rb +++ b/lib/pseudonymizer/dumper.rb @@ -4,7 +4,69 @@ require 'yaml' module Pseudonymizer - PAGE_SIZE = ENV.fetch('PSEUDONYMIZER_BATCH', 100_000) + class Pager + PAGE_SIZE = ENV.fetch('PSEUDONYMIZER_BATCH', 100_000) + + def initialize(table, columns) + @table = table + @columns = columns + end + + def pages(&block) + if @columns.include?("id") + # optimize the pagination using WHERE id > ? + pages_per_id(&block) + else + # fallback to `LIMIT ? OFFSET ?` when "id" is unavailable + pages_per_offset(&block) + end + end + + def pages_per_id(&block) + id_offset = 0 + + loop do + # a page of results + results = ActiveRecord::Base.connection.exec_query(<<-SQL.squish) + SELECT #{@columns.join(",")} + FROM #{@table} + WHERE id > #{id_offset} + ORDER BY id + LIMIT #{PAGE_SIZE} + SQL + Rails.logger.debug("#{self.class.name} fetch ids [#{id_offset}, +#{PAGE_SIZE}[") + break if results.empty? + + id_offset = results.last["id"].to_i + yield results + + break if results.count < PAGE_SIZE + end + end + + def pages_per_offset(&block) + page = 0 + + loop do + offset = page * PAGE_SIZE + + # a page of results + results = ActiveRecord::Base.connection.exec_query(<<-SQL.squish) + SELECT #{@columns.join(",")} + FROM #{@table} + ORDER BY #{@columns.join(",")} + LIMIT #{PAGE_SIZE} OFFSET #{offset} + SQL + Rails.logger.debug("#{self.class.name} fetching offset [#{offset}, #{offset + PAGE_SIZE}[") + break if results.empty? + + page += 1 + yield results + + break if results.count < PAGE_SIZE + end + end + end class Anon def initialize(fields) @@ -47,7 +109,7 @@ def reset! end def tables_to_csv - reset! + return @output_files if @output_files tables = config[:tables] FileUtils.mkdir_p(output_dir) unless File.directory?(output_dir) @@ -94,25 +156,13 @@ def table_to_csv(table, whitelist_columns, pseudonymity_columns) # yield every results, pagined, anonymized def table_page_results(table, whitelist_columns, pseudonymity_columns) anonymizer = Anon.new(pseudonymity_columns) - page = 0 + pager = Pager.new(table, whitelist_columns) Enumerator.new do |yielder| - loop do - offset = page * PAGE_SIZE - has_more = false - - sql = "SELECT #{whitelist_columns.join(",")} FROM #{table} LIMIT #{PAGE_SIZE} OFFSET #{offset}" - - # a page of results - results = ActiveRecord::Base.connection.exec_query(sql) - anonymizer.anonymize(results).each do |result| - has_more = true + pager.pages do |page| + anonymizer.anonymize(page).each do |result| yielder << result end - - raise StopIteration unless has_more - - page += 1 end end.lazy end diff --git a/lib/pseudonymizer/manifest.yml b/lib/pseudonymizer/manifest.yml index 4c48e1c8ae6a..43a09115ed65 100644 --- a/lib/pseudonymizer/manifest.yml +++ b/lib/pseudonymizer/manifest.yml @@ -209,34 +209,6 @@ tables: - updated_at pseudo: - id - merge_request_diff_commits: - whitelist: - - authored_date - - committed_date - - merge_request_diff_id - - relative_order - - author_name - - author_email - - committer_name - - committer_email - pseudo: - - merge_request_diff_id - - author_name - - author_email - - committer_name - - committer_email - merge_request_diff_files: - whitelist: - - merge_request_diff_id - - relative_order - - new_file - - renamed_file - - deleted_file - - too_large - - a_mode - - b_mode - pseudo: - - merge_request_diff_id merge_request_diffs: whitelist: - id diff --git a/lib/tasks/gitlab/db.rake b/lib/tasks/gitlab/db.rake index 3c39be2afdb9..e5c049458061 100644 --- a/lib/tasks/gitlab/db.rake +++ b/lib/tasks/gitlab/db.rake @@ -72,11 +72,11 @@ namespace :gitlab do desc 'Output pseudonymity dump of selected tables' task pseudonymizer: :environment do - abort "The pseudonymizer is not available with this license." unless License.feature_available?(:pseudonymizer) - abort "The pseudonymizer is disabled." unless Gitlab::CurrentSettings.pseudonymizer_enabled? + # abort "The pseudonymizer is not available with this license." unless License.feature_available?(:pseudonymizer) + # abort "The pseudonymizer is disabled." unless Gitlab::CurrentSettings.pseudonymizer_enabled? options = Pseudonymizer::Options.new( - config: YAML.load_file(Rails.root.join(Gitlab.config.pseudonymizer.manifest)), + config: YAML.load_file(Gitlab.config.pseudonymizer.manifest), output_dir: ENV['PSEUDONYMIZER_OUTPUT_DIR'] ) diff --git a/spec/lib/pseudonymizer/dumper_spec.rb b/spec/lib/pseudonymizer/dumper_spec.rb index b847b9d2f18f..cdd8dad06158 100644 --- a/spec/lib/pseudonymizer/dumper_spec.rb +++ b/spec/lib/pseudonymizer/dumper_spec.rb @@ -5,7 +5,7 @@ let(:base_dir) { Dir.mktmpdir } let(:options) do Pseudonymizer::Options.new( - config: YAML.load_file(Rails.root.join(Gitlab.config.pseudonymizer.manifest)) + config: YAML.load_file(Gitlab.config.pseudonymizer.manifest) ) end subject(:pseudo) { described_class.new(options) } diff --git a/spec/lib/pseudonymizer/uploader_spec.rb b/spec/lib/pseudonymizer/uploader_spec.rb index 0aa668048d5c..6da78267868c 100644 --- a/spec/lib/pseudonymizer/uploader_spec.rb +++ b/spec/lib/pseudonymizer/uploader_spec.rb @@ -4,7 +4,7 @@ let(:base_dir) { Dir.mktmpdir } let(:options) do Pseudonymizer::Options.new( - config: YAML.load_file(Rails.root.join(Gitlab.config.pseudonymizer.manifest)) + config: YAML.load_file(Gitlab.config.pseudonymizer.manifest) ) end let(:remote_directory) { subject.send(:remote_directory) } -- GitLab From 1c16cdec094fffccffe808326bb6149ea2f9bb53 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mica=C3=ABl=20Bergeron?= Date: Tue, 19 Jun 2018 17:59:21 -0400 Subject: [PATCH 47/63] fix a problem with table_to_csv caching --- .../views/admin/application_settings/_pseudonymizer.html.haml | 4 +--- lib/pseudonymizer/dumper.rb | 2 +- lib/tasks/gitlab/db.rake | 4 ++-- 3 files changed, 4 insertions(+), 6 deletions(-) rename {app => ee/app}/views/admin/application_settings/_pseudonymizer.html.haml (81%) diff --git a/app/views/admin/application_settings/_pseudonymizer.html.haml b/ee/app/views/admin/application_settings/_pseudonymizer.html.haml similarity index 81% rename from app/views/admin/application_settings/_pseudonymizer.html.haml rename to ee/app/views/admin/application_settings/_pseudonymizer.html.haml index 429ee0e5eb75..3f732b190252 100644 --- a/app/views/admin/application_settings/_pseudonymizer.html.haml +++ b/ee/app/views/admin/application_settings/_pseudonymizer.html.haml @@ -5,10 +5,9 @@ .form-group.row .offset-sm-2.col-sm-10 - is_enabled = @application_setting.pseudonymizer_enabled? - - is_available = @application_setting.pseudonymizer_available? .form-check = f.label :pseudonymizer_enabled do - = f.check_box :pseudonymizer_enabled, disabled: !is_available + = f.check_box :pseudonymizer_enabled Enable Pseudonymizer Cron Job .form-text.text-muted - if is_enabled @@ -17,4 +16,3 @@ = pseudonymizer_disabled_description_text = f.submit 'Save changes', class: "btn btn-success" - diff --git a/lib/pseudonymizer/dumper.rb b/lib/pseudonymizer/dumper.rb index 6babfda1a58f..849006ffb0f5 100644 --- a/lib/pseudonymizer/dumper.rb +++ b/lib/pseudonymizer/dumper.rb @@ -109,7 +109,7 @@ def reset! end def tables_to_csv - return @output_files if @output_files + return @output_files unless @output_files.empty? tables = config[:tables] FileUtils.mkdir_p(output_dir) unless File.directory?(output_dir) diff --git a/lib/tasks/gitlab/db.rake b/lib/tasks/gitlab/db.rake index e5c049458061..30360b344373 100644 --- a/lib/tasks/gitlab/db.rake +++ b/lib/tasks/gitlab/db.rake @@ -72,8 +72,8 @@ namespace :gitlab do desc 'Output pseudonymity dump of selected tables' task pseudonymizer: :environment do - # abort "The pseudonymizer is not available with this license." unless License.feature_available?(:pseudonymizer) - # abort "The pseudonymizer is disabled." unless Gitlab::CurrentSettings.pseudonymizer_enabled? + abort "The pseudonymizer is not available with this license." unless License.feature_available?(:pseudonymizer) + abort "The pseudonymizer is disabled." unless Gitlab::CurrentSettings.pseudonymizer_enabled? options = Pseudonymizer::Options.new( config: YAML.load_file(Gitlab.config.pseudonymizer.manifest), -- GitLab From 7525a79f50c509d95e953355cecfcf79a0761282 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mica=C3=ABl=20Bergeron?= Date: Wed, 20 Jun 2018 11:00:01 -0400 Subject: [PATCH 48/63] apply feedback --- config/gitlab.yml.example | 4 +-- config/initializers/1_settings.rb | 2 +- .../manifest.yml => config/pseudonymizer.yml | 4 +-- doc/administration/pseudonymizer.md | 6 ++-- .../helpers/ee/application_settings_helper.rb | 6 +++- .../_pseudonymizer.html.haml | 2 +- lib/pseudonymizer/dumper.rb | 29 ++++++++++++------- lib/pseudonymizer/options.rb | 8 ++++- spec/lib/pseudonymizer/dumper_spec.rb | 16 +++++++++- 9 files changed, 55 insertions(+), 22 deletions(-) rename lib/pseudonymizer/manifest.yml => config/pseudonymizer.yml (99%) diff --git a/config/gitlab.yml.example b/config/gitlab.yml.example index edd2114bf4b6..052f3908658e 100644 --- a/config/gitlab.yml.example +++ b/config/gitlab.yml.example @@ -734,7 +734,7 @@ production: &base pseudonymizer: enabled: false # Tables manifest that specifies the fields to extract and pseudonymize. - manifest: lib/pseudonymizer/manifest.yml + manifest: config/pseudonymizer.yml upload: # remote_directory: 'gitlab-elt' # Fog storage connection settings, see http://fog.io/storage/ . @@ -897,7 +897,7 @@ test: path: tmp/tests/backups pseudonymizer: enabled: false - manifest: lib/pseudonymizer/manifest.yml + manifest: config/pseudonymizer.yml upload: # The remote 'directory' to store the CSV files. For S3, this would be the bucket name. remote_directory: gitlab-elt.test diff --git a/config/initializers/1_settings.rb b/config/initializers/1_settings.rb index a307d1e49229..a99a6c14e9cf 100644 --- a/config/initializers/1_settings.rb +++ b/config/initializers/1_settings.rb @@ -479,7 +479,7 @@ # Settings['pseudonymizer'] ||= Settingslogic.new({}) Settings.pseudonymizer['enabled'] = false if Settings.pseudonymizer['enabled'].nil? -Settings.pseudonymizer['manifest'] = Settings.absolute(Settings.pseudonymizer['manifest'] || Rails.root.join("lib/pseudonymizer/manifest.yml")) +Settings.pseudonymizer['manifest'] = Settings.absolute(Settings.pseudonymizer['manifest'] || Rails.root.join("config/pseudonymizer.yml")) Settings.pseudonymizer['upload'] ||= Settingslogic.new({ 'remote_directory' => nil, 'connection' => nil }) # Settings.pseudonymizer['upload']['multipart_chunk_size'] ||= 104857600 diff --git a/lib/pseudonymizer/manifest.yml b/config/pseudonymizer.yml similarity index 99% rename from lib/pseudonymizer/manifest.yml rename to config/pseudonymizer.yml index 43a09115ed65..56458f5c790b 100644 --- a/lib/pseudonymizer/manifest.yml +++ b/config/pseudonymizer.yml @@ -98,6 +98,8 @@ tables: - iid - updated_by_id - last_edited_by_id + - title + - description issue_assignees: whitelist: - user_id @@ -617,7 +619,6 @@ tables: - has_external_wiki - ci_config_path - lfs_enabled - - description_html - only_allow_merge_if_all_discussions_are_resolved - repository_size_limit - printing_merge_request_link_enabled @@ -670,7 +671,6 @@ tables: - repository_storage - repository_read_only - ci_config_path - - description_html - only_allow_merge_if_all_discussions_are_resolved - repository_size_limit - auto_cancel_pending_pipelines diff --git a/doc/administration/pseudonymizer.md b/doc/administration/pseudonymizer.md index a7522849e55e..8555e09df93f 100644 --- a/doc/administration/pseudonymizer.md +++ b/doc/administration/pseudonymizer.md @@ -21,7 +21,7 @@ be textually exported. This ensures that: To configure the pseudonymizer, you need to: - Provide a manifest file that describes which fields should be included or - pseudonymized ([example `manifest.yml` file]()). + pseudonymized ([example `manifest.yml` file](https://gitlab.com/gitlab-org/gitlab-ee/tree/master/config/pseudonymizer.yml)). - Use an object storage **For Omnibus installations:** @@ -31,7 +31,7 @@ To configure the pseudonymizer, you need to: ```ruby gitlab_rails['pseudonymizer_enabled'] = true - gitlab_rails['pseudonymizer_manifest'] = 'lib/pseudonymizer/manifest.yml' + gitlab_rails['pseudonymizer_manifest'] = 'config/pseudonymizer.yml' gitlab_rails['pseudonymizer_upload_remote_directory'] = 'gitlab-elt' gitlab_rails['pseudonymizer_upload_connection'] = { 'provider' => 'AWS', @@ -65,7 +65,7 @@ To configure the pseudonymizer, you need to: ```yaml pseudonymizer: enabled: true - manifest: lib/pseudonymizer/manifest.yml + manifest: config/pseudonymizer.yml upload: remote_directory: 'gitlab-elt' # The bucket name connection: diff --git a/ee/app/helpers/ee/application_settings_helper.rb b/ee/app/helpers/ee/application_settings_helper.rb index 211271323ccc..c464c7ca095b 100644 --- a/ee/app/helpers/ee/application_settings_helper.rb +++ b/ee/app/helpers/ee/application_settings_helper.rb @@ -35,8 +35,12 @@ def external_authorization_client_pass_help_text "and the value is encrypted at rest.") end + def pseudonymizer_enabled_help_text + _("Enable Pseudonymizer data export") + end + def pseudonymizer_description_text - _("GitLab will run the pseudonymizer cron job which will send pseudoanonymized data to be processed and analyzed.") + _("GitLab will run the pseudonymizer cron job which will output pseudoanonymized data to be processed and analyzed.") end def pseudonymizer_disabled_description_text diff --git a/ee/app/views/admin/application_settings/_pseudonymizer.html.haml b/ee/app/views/admin/application_settings/_pseudonymizer.html.haml index 3f732b190252..483ecb7707cd 100644 --- a/ee/app/views/admin/application_settings/_pseudonymizer.html.haml +++ b/ee/app/views/admin/application_settings/_pseudonymizer.html.haml @@ -8,7 +8,7 @@ .form-check = f.label :pseudonymizer_enabled do = f.check_box :pseudonymizer_enabled - Enable Pseudonymizer Cron Job + = pseudonymizer_enabled_help_text .form-text.text-muted - if is_enabled = pseudonymizer_description_text diff --git a/lib/pseudonymizer/dumper.rb b/lib/pseudonymizer/dumper.rb index 849006ffb0f5..b1bba435935f 100644 --- a/lib/pseudonymizer/dumper.rb +++ b/lib/pseudonymizer/dumper.rb @@ -69,19 +69,18 @@ def pages_per_offset(&block) end class Anon - def initialize(fields) - @anon_fields = fields + def initialize(table, whitelisted_fields, pseudonymized_fields) + @table = table + @pseudo_fields = pseudo_fields(whitelisted_fields, pseudonymized_fields) end def anonymize(results) - columns = results.columns # Assume they all have the same table - to_filter = @anon_fields & columns key = Rails.application.secrets[:secret_key_base] digest = OpenSSL::Digest.new('sha256') Enumerator.new do |yielder| results.each do |result| - to_filter.each do |field| + @pseudo_fields.each do |field| next if result[field].nil? result[field] = OpenSSL::HMAC.hexdigest(digest, key, String(result[field])) @@ -90,6 +89,17 @@ def anonymize(results) end end end + + private + + def pseudo_fields(whitelisted, pseudonymized) + pseudo_extra_fields = pseudonymized - whitelisted + pseudo_extra_fields.each do |field| + Rails.logger.warn("#{self.class.name} extraneous pseudo: #{@table}.#{field} is not whitelisted and will be ignored.") + end + + pseudonymized & whitelisted + end end class Dumper @@ -155,7 +165,7 @@ def table_to_csv(table, whitelist_columns, pseudonymity_columns) # yield every results, pagined, anonymized def table_page_results(table, whitelist_columns, pseudonymity_columns) - anonymizer = Anon.new(pseudonymity_columns) + anonymizer = Anon.new(table, whitelist_columns, pseudonymity_columns) pager = Pager.new(table, whitelist_columns) Enumerator.new do |yielder| @@ -168,18 +178,17 @@ def table_page_results(table, whitelist_columns, pseudonymity_columns) end def table_to_schema(table) - whitelisted = ->(table) { @config.dig(:tables, table, :whitelist) } - pseudonymized = ->(table) { @config.dig(:tables, table, :pseudo) } + table_config = @config.dig(:tables, table) type_results = ActiveRecord::Base.connection.columns(table) type_results = type_results.select do |c| - whitelisted[table].include?(c.name) + table_config[:whitelist].include?(c.name) end type_results = type_results.map do |c| data_type = c.sql_type - if pseudonymized[table].include?(c.name) + if table_config[:pseudo].include?(c.name) data_type = "character varying" end diff --git a/lib/pseudonymizer/options.rb b/lib/pseudonymizer/options.rb index 255e64be6630..e3cbc1b8fbf6 100644 --- a/lib/pseudonymizer/options.rb +++ b/lib/pseudonymizer/options.rb @@ -9,10 +9,16 @@ def initialize(config: {}, output_dir: nil) @start_at = Time.now.utc base_dir = output_dir || File.join(Dir.tmpdir, 'gitlab-pseudonymizer') - @output_dir = File.join(base_dir, start_at.iso8601) + @output_dir = File.join(base_dir, batch_dir) end def upload_dir + batch_dir + end + + private + + def batch_dir start_at.iso8601 end end diff --git a/spec/lib/pseudonymizer/dumper_spec.rb b/spec/lib/pseudonymizer/dumper_spec.rb index cdd8dad06158..b3d54208d678 100644 --- a/spec/lib/pseudonymizer/dumper_spec.rb +++ b/spec/lib/pseudonymizer/dumper_spec.rb @@ -32,7 +32,7 @@ # grab the first table it outputs. There would only be 1. project_table_file = pseudo.tables_to_csv[0] - expect(project_table_file).to include("projects.csv.gz") + expect(project_table_file).to end_with("projects.csv.gz") columns = [] project_data = [] @@ -50,6 +50,20 @@ # sha 256 is 64 chars in length expect(project_data["id"].length).to eq(64) end + + it "warns when pseudonymized fields are extraneous" do + column_names = %w(id name path description) + pseudo.config[:tables] = { + projects: { + whitelist: column_names, + pseudo: %w(id extraneous) + } + } + + expect(Rails.logger).to receive(:warn).with(/extraneous/) + + pseudo.tables_to_csv + end end describe "manifest is valid" do -- GitLab From d7584f3263f32af677cc58f5b2375e23d893e1ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mica=C3=ABl=20Bergeron?= Date: Wed, 20 Jun 2018 11:04:09 -0400 Subject: [PATCH 49/63] ensure the cleanup is run if the upload fails --- app/workers/pseudonymizer_worker.rb | 12 ++++++++---- lib/tasks/gitlab/db.rake | 11 +++++++---- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/app/workers/pseudonymizer_worker.rb b/app/workers/pseudonymizer_worker.rb index 3030fc8c7cbf..cf1fbf8961ce 100644 --- a/app/workers/pseudonymizer_worker.rb +++ b/app/workers/pseudonymizer_worker.rb @@ -3,6 +3,7 @@ class PseudonymizerWorker include CronjobQueue def perform + abort "The pseudonymizer is not available with this license." unless License.feature_available?(:pseudonymizer) return unless Gitlab::CurrentSettings.pseudonymizer_enabled? options = Pseudonymizer::Options.new( @@ -11,10 +12,13 @@ def perform ) dumper = Pseudonymizer::Dumper.new(options) - dumper.tables_to_csv - uploader = Pseudonymizer::Uploader.new(options, progress_output: File.open(File::NULL, "w")) - uploader.upload - uploader.cleanup + + begin + dumper.tables_to_csv + uploader.upload + ensure + uploader.cleanup + end end end diff --git a/lib/tasks/gitlab/db.rake b/lib/tasks/gitlab/db.rake index 30360b344373..e0d7e8bc892d 100644 --- a/lib/tasks/gitlab/db.rake +++ b/lib/tasks/gitlab/db.rake @@ -81,11 +81,14 @@ namespace :gitlab do ) dumper = Pseudonymizer::Dumper.new(options) - dumper.tables_to_csv - uploader = Pseudonymizer::Uploader.new(options) - uploader.upload - uploader.cleanup + + begin + dumper.tables_to_csv + uploader.upload + ensure + uploader.cleanup + end end end end -- GitLab From dc700967e656614e2eaeb9dc86e226dc7c9087cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mica=C3=ABl=20Bergeron?= Date: Wed, 20 Jun 2018 11:09:40 -0400 Subject: [PATCH 50/63] move the application settings to the `EE-Specific` region --- .../admin/application_settings/show.html.haml | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/app/views/admin/application_settings/show.html.haml b/app/views/admin/application_settings/show.html.haml index 05e02e70a947..fef287f88862 100644 --- a/app/views/admin/application_settings/show.html.haml +++ b/app/views/admin/application_settings/show.html.haml @@ -237,18 +237,6 @@ .settings-content = render 'usage' -- if Gitlab::CurrentSettings.pseudonymizer_can_be_configured? - %section.settings.as-pseudonymizer.no-animate#js-pseudonymizer-settings{ class: ('expanded' if expanded) } - .settings-header - %h4 - = _('Pseudonymizer Cron Job') - %button.btn.btn-default.js-settings-toggle{ type: 'button' } - = expanded ? _('Collapse') : _('Expand') - %p - = _('Enable or disable the Pseudonymizer Cron Job.') - .settings-content - = render 'pseudonymizer' - %section.settings.as-email.no-animate#js-email-settings{ class: ('expanded' if expanded) } .settings-header %h4 @@ -374,6 +362,18 @@ .settings-content = render partial: 'elasticsearch_form' +- if Gitlab::CurrentSettings.pseudonymizer_can_be_configured? + %section.settings.as-pseudonymizer.no-animate#js-pseudonymizer-settings{ class: ('expanded' if expanded) } + .settings-header + %h4 + = _('Pseudonymizer data collection') + %button.btn.btn-default.js-settings-toggle{ type: 'button' } + = expanded ? _('Collapse') : _('Expand') + %p + = _('Enable or disable the Pseudonymizer data collection.') + .settings-content + = render 'pseudonymizer' + - if Gitlab.com? || Rails.env.development? %section.settings.as-slack.no-animate#js-slack-settings{ class: ('expanded' if expanded) } .settings-header -- GitLab From 879c65865ded92aa2e0dca9718eca0014c131f8e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mica=C3=ABl=20Bergeron?= Date: Wed, 20 Jun 2018 12:14:09 -0400 Subject: [PATCH 51/63] add specs for `Pseudonymizer::Pager` --- lib/pseudonymizer/dumper.rb | 107 +-------------------------- lib/pseudonymizer/filter.rb | 38 ++++++++++ lib/pseudonymizer/pager.rb | 63 ++++++++++++++++ spec/lib/pseudonymizer/pager_spec.rb | 62 ++++++++++++++++ 4 files changed, 165 insertions(+), 105 deletions(-) create mode 100644 lib/pseudonymizer/filter.rb create mode 100644 lib/pseudonymizer/pager.rb create mode 100644 spec/lib/pseudonymizer/pager_spec.rb diff --git a/lib/pseudonymizer/dumper.rb b/lib/pseudonymizer/dumper.rb index b1bba435935f..59593119aaa3 100644 --- a/lib/pseudonymizer/dumper.rb +++ b/lib/pseudonymizer/dumper.rb @@ -1,107 +1,4 @@ -require 'openssl' -require 'digest' -require 'csv' -require 'yaml' - module Pseudonymizer - class Pager - PAGE_SIZE = ENV.fetch('PSEUDONYMIZER_BATCH', 100_000) - - def initialize(table, columns) - @table = table - @columns = columns - end - - def pages(&block) - if @columns.include?("id") - # optimize the pagination using WHERE id > ? - pages_per_id(&block) - else - # fallback to `LIMIT ? OFFSET ?` when "id" is unavailable - pages_per_offset(&block) - end - end - - def pages_per_id(&block) - id_offset = 0 - - loop do - # a page of results - results = ActiveRecord::Base.connection.exec_query(<<-SQL.squish) - SELECT #{@columns.join(",")} - FROM #{@table} - WHERE id > #{id_offset} - ORDER BY id - LIMIT #{PAGE_SIZE} - SQL - Rails.logger.debug("#{self.class.name} fetch ids [#{id_offset}, +#{PAGE_SIZE}[") - break if results.empty? - - id_offset = results.last["id"].to_i - yield results - - break if results.count < PAGE_SIZE - end - end - - def pages_per_offset(&block) - page = 0 - - loop do - offset = page * PAGE_SIZE - - # a page of results - results = ActiveRecord::Base.connection.exec_query(<<-SQL.squish) - SELECT #{@columns.join(",")} - FROM #{@table} - ORDER BY #{@columns.join(",")} - LIMIT #{PAGE_SIZE} OFFSET #{offset} - SQL - Rails.logger.debug("#{self.class.name} fetching offset [#{offset}, #{offset + PAGE_SIZE}[") - break if results.empty? - - page += 1 - yield results - - break if results.count < PAGE_SIZE - end - end - end - - class Anon - def initialize(table, whitelisted_fields, pseudonymized_fields) - @table = table - @pseudo_fields = pseudo_fields(whitelisted_fields, pseudonymized_fields) - end - - def anonymize(results) - key = Rails.application.secrets[:secret_key_base] - digest = OpenSSL::Digest.new('sha256') - - Enumerator.new do |yielder| - results.each do |result| - @pseudo_fields.each do |field| - next if result[field].nil? - - result[field] = OpenSSL::HMAC.hexdigest(digest, key, String(result[field])) - end - yielder << result - end - end - end - - private - - def pseudo_fields(whitelisted, pseudonymized) - pseudo_extra_fields = pseudonymized - whitelisted - pseudo_extra_fields.each do |field| - Rails.logger.warn("#{self.class.name} extraneous pseudo: #{@table}.#{field} is not whitelisted and will be ignored.") - end - - pseudonymized & whitelisted - end - end - class Dumper attr_accessor :config, :output_dir @@ -165,12 +62,12 @@ def table_to_csv(table, whitelist_columns, pseudonymity_columns) # yield every results, pagined, anonymized def table_page_results(table, whitelist_columns, pseudonymity_columns) - anonymizer = Anon.new(table, whitelist_columns, pseudonymity_columns) + filter = Filter.new(table, whitelist_columns, pseudonymity_columns) pager = Pager.new(table, whitelist_columns) Enumerator.new do |yielder| pager.pages do |page| - anonymizer.anonymize(page).each do |result| + filter.anonymize(page).each do |result| yielder << result end end diff --git a/lib/pseudonymizer/filter.rb b/lib/pseudonymizer/filter.rb new file mode 100644 index 000000000000..5c41b3413b91 --- /dev/null +++ b/lib/pseudonymizer/filter.rb @@ -0,0 +1,38 @@ +require 'openssl' +require 'digest' + +module Pseudonymizer + class Filter + def initialize(table, whitelisted_fields, pseudonymized_fields) + @table = table + @pseudo_fields = pseudo_fields(whitelisted_fields, pseudonymized_fields) + end + + def anonymize(results) + key = Rails.application.secrets[:secret_key_base] + digest = OpenSSL::Digest.new('sha256') + + Enumerator.new do |yielder| + results.each do |result| + @pseudo_fields.each do |field| + next if result[field].nil? + + result[field] = OpenSSL::HMAC.hexdigest(digest, key, String(result[field])) + end + yielder << result + end + end + end + + private + + def pseudo_fields(whitelisted, pseudonymized) + pseudo_extra_fields = pseudonymized - whitelisted + pseudo_extra_fields.each do |field| + Rails.logger.warn("#{self.class.name} extraneous pseudo: #{@table}.#{field} is not whitelisted and will be ignored.") + end + + pseudonymized & whitelisted + end + end +end diff --git a/lib/pseudonymizer/pager.rb b/lib/pseudonymizer/pager.rb new file mode 100644 index 000000000000..ab0e066c621b --- /dev/null +++ b/lib/pseudonymizer/pager.rb @@ -0,0 +1,63 @@ +module Pseudonymizer + class Pager + PAGE_SIZE = ENV.fetch('PSEUDONYMIZER_BATCH', 100_000) + + def initialize(table, columns) + @table = table + @columns = columns + end + + def pages(&block) + if @columns.include?("id") + # optimize the pagination using WHERE id > ? + pages_per_id(&block) + else + # fallback to `LIMIT ? OFFSET ?` when "id" is unavailable + pages_per_offset(&block) + end + end + + def pages_per_id(&block) + id_offset = 0 + + loop do + # a page of results + results = ActiveRecord::Base.connection.exec_query(<<-SQL.squish) + SELECT #{@columns.join(",")} + FROM #{@table} + WHERE id > #{id_offset} + ORDER BY id + LIMIT #{PAGE_SIZE} + SQL + Rails.logger.debug("#{self.class.name} fetch ids [#{id_offset}, +#{PAGE_SIZE}[") + break if results.empty? + + id_offset = results.last["id"].to_i + yield results + + break if results.count < PAGE_SIZE + end + end + + def pages_per_offset(&block) + offset = 0 + + loop do + # a page of results + results = ActiveRecord::Base.connection.exec_query(<<-SQL.squish) + SELECT #{@columns.join(",")} + FROM #{@table} + ORDER BY #{@columns.join(",")} + LIMIT #{PAGE_SIZE} OFFSET #{offset} + SQL + Rails.logger.debug("#{self.class.name} fetching offset [#{offset}, #{offset + PAGE_SIZE}[") + break if results.empty? + + offset += PAGE_SIZE + yield results + + break if results.count < PAGE_SIZE + end + end + end +end diff --git a/spec/lib/pseudonymizer/pager_spec.rb b/spec/lib/pseudonymizer/pager_spec.rb new file mode 100644 index 000000000000..22b4e3725fa2 --- /dev/null +++ b/spec/lib/pseudonymizer/pager_spec.rb @@ -0,0 +1,62 @@ +require 'spec_helper' + +describe Pseudonymizer::Pager do + class Counter + @count = 0 + + def increment(*args) + self.count += 1 + end + end + + let(:page_size) { 1 } + let!(:projects) { create_list(:project, 10) } + subject { described_class.new("projects", whitelisted_columns) } + + before do + stub_const("Pseudonymizer::Pager::PAGE_SIZE", page_size) + end + + shared_examples "yield results in page" do + it do + page_count = 0 + result_count = 0 + + subject.pages do |page| + result_count += page.count + page_count += 1 + end + + expect(result_count).to eq(projects.count) + expect(page_count).to eq(projects.count / page_size) + end + end + + context "`id` column is present" do + let(:whitelisted_columns) { %w(id name) } + + describe "#pages" do + it "delegates to #pages_per_id" do + expect(subject).to receive(:pages_per_id) + + subject.pages {|page| nil} + end + + include_examples "yield results in page" + end + end + + context "`id` column is missing" do + let(:whitelisted_columns) { %w(name) } + + describe "#pages" do + it "delegates to #pages_per_offset" do + expect(subject).to receive(:pages_per_offset) + + subject.pages {|page| nil} + end + + include_examples "yield results in page" + end + end +end -- GitLab From a3c4ffa3f6bb5c1966b17d2b4b1ccf8dd6dc0381 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mica=C3=ABl=20Bergeron?= Date: Thu, 21 Jun 2018 10:03:06 -0400 Subject: [PATCH 52/63] make pseudonymizer_enable application setting true by default --- db/schema.rb | 2 +- ee/app/models/ee/application_setting.rb | 2 +- ...1221734_add_pseudonymizer_enabled_to_application_settings.rb | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/db/schema.rb b/db/schema.rb index b153f7baad52..817b6283a6a6 100644 --- a/db/schema.rb +++ b/db/schema.rb @@ -206,7 +206,7 @@ t.string "encrypted_external_auth_client_key_pass_iv" t.string "email_additional_text" t.boolean "enforce_terms", default: false - t.boolean "pseudonymizer_enabled", default: false, null: false + t.boolean "pseudonymizer_enabled", default: true, null: false end create_table "approvals", force: :cascade do |t| diff --git a/ee/app/models/ee/application_setting.rb b/ee/app/models/ee/application_setting.rb index b18fb0694662..78fcd7544d19 100644 --- a/ee/app/models/ee/application_setting.rb +++ b/ee/app/models/ee/application_setting.rb @@ -101,7 +101,7 @@ def defaults slack_app_id: nil, slack_app_secret: nil, slack_app_verification_token: nil, - pseudonymizer_enabled: Settings.pseudonymizer['enabled'] + pseudonymizer_enabled: true ) end end diff --git a/ee/db/migrate/20180531221734_add_pseudonymizer_enabled_to_application_settings.rb b/ee/db/migrate/20180531221734_add_pseudonymizer_enabled_to_application_settings.rb index 7517e78a6187..9d85a66bc431 100644 --- a/ee/db/migrate/20180531221734_add_pseudonymizer_enabled_to_application_settings.rb +++ b/ee/db/migrate/20180531221734_add_pseudonymizer_enabled_to_application_settings.rb @@ -26,6 +26,6 @@ class AddPseudonymizerEnabledToApplicationSettings < ActiveRecord::Migration # disable_ddl_transaction! def change - add_column :application_settings, :pseudonymizer_enabled, :boolean, null: false, default: false + add_column :application_settings, :pseudonymizer_enabled, :boolean, null: false, default: true end end -- GitLab From fbfb71381d8b44dfdcb015f3ab038a3bb8efdd5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mica=C3=ABl=20Bergeron?= Date: Thu, 21 Jun 2018 10:03:43 -0400 Subject: [PATCH 53/63] polish the error handling when object storage is unavailable --- app/workers/pseudonymizer_worker.rb | 4 +++- lib/pseudonymizer/uploader.rb | 13 ++++++++++++- lib/tasks/gitlab/db.rake | 2 ++ spec/lib/pseudonymizer/pager_spec.rb | 8 -------- 4 files changed, 17 insertions(+), 10 deletions(-) diff --git a/app/workers/pseudonymizer_worker.rb b/app/workers/pseudonymizer_worker.rb index cf1fbf8961ce..51e4e7b9af9b 100644 --- a/app/workers/pseudonymizer_worker.rb +++ b/app/workers/pseudonymizer_worker.rb @@ -4,7 +4,7 @@ class PseudonymizerWorker def perform abort "The pseudonymizer is not available with this license." unless License.feature_available?(:pseudonymizer) - return unless Gitlab::CurrentSettings.pseudonymizer_enabled? + abort "The pseudonymizer is disabled." unless Gitlab::CurrentSettings.pseudonymizer_enabled? options = Pseudonymizer::Options.new( config: YAML.load_file(Gitlab.config.pseudonymizer.manifest), @@ -14,6 +14,8 @@ def perform dumper = Pseudonymizer::Dumper.new(options) uploader = Pseudonymizer::Uploader.new(options, progress_output: File.open(File::NULL, "w")) + abort "The pseudonymizer object storage must be configured." unless uploader.available? + begin dumper.tables_to_csv uploader.upload diff --git a/lib/pseudonymizer/uploader.rb b/lib/pseudonymizer/uploader.rb index a5937a7ed3b8..bdbdb666674b 100644 --- a/lib/pseudonymizer/uploader.rb +++ b/lib/pseudonymizer/uploader.rb @@ -1,4 +1,6 @@ module Pseudonymizer + ObjectStorageUnavailableError = Class.new(StandardError) + class Uploader include Gitlab::Utils::StrongMemoize @@ -22,12 +24,20 @@ def initialize(options, progress_output: nil) @connection_params = self.class.object_store_credentials end + def available? + !connect_to_remote_directory.nil? + rescue ObjectStorageUnavailableError + false + end + def upload progress_output.puts "Uploading output files to remote storage #{remote_directory}:" file_list.each do |file| upload_file(file, remote_directory) end + rescue ObjectStorageUnavailableError + abort "Cannot upload files, make sure the `pseudonimizer.upload.connection` is set properly".color(:red) end def cleanup @@ -62,7 +72,8 @@ def remote_directory def connect_to_remote_directory if @connection_params.blank? - abort "Cannot upload files, make sure the `pseudonimizer.upload.connection` is set properly".color(:red) + raise ObjectStorageUnavailableError + end connection = ::Fog::Storage.new(@connection_params) diff --git a/lib/tasks/gitlab/db.rake b/lib/tasks/gitlab/db.rake index e0d7e8bc892d..47c9660643ef 100644 --- a/lib/tasks/gitlab/db.rake +++ b/lib/tasks/gitlab/db.rake @@ -83,6 +83,8 @@ namespace :gitlab do dumper = Pseudonymizer::Dumper.new(options) uploader = Pseudonymizer::Uploader.new(options) + abort "There is an error in the pseudonymizer object store configuration." unless uploader.available? + begin dumper.tables_to_csv uploader.upload diff --git a/spec/lib/pseudonymizer/pager_spec.rb b/spec/lib/pseudonymizer/pager_spec.rb index 22b4e3725fa2..8b20a381f923 100644 --- a/spec/lib/pseudonymizer/pager_spec.rb +++ b/spec/lib/pseudonymizer/pager_spec.rb @@ -1,14 +1,6 @@ require 'spec_helper' describe Pseudonymizer::Pager do - class Counter - @count = 0 - - def increment(*args) - self.count += 1 - end - end - let(:page_size) { 1 } let!(:projects) { create_list(:project, 10) } subject { described_class.new("projects", whitelisted_columns) } -- GitLab From eb586c7f0f08290a066c609ef84eb2a94afcca28 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mica=C3=ABl=20Bergeron?= Date: Thu, 21 Jun 2018 14:53:14 -0400 Subject: [PATCH 54/63] remove mapping keys for join tables --- lib/pseudonymizer/dumper.rb | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/lib/pseudonymizer/dumper.rb b/lib/pseudonymizer/dumper.rb index 59593119aaa3..62f44c0a5a40 100644 --- a/lib/pseudonymizer/dumper.rb +++ b/lib/pseudonymizer/dumper.rb @@ -21,12 +21,12 @@ def tables_to_csv tables = config[:tables] FileUtils.mkdir_p(output_dir) unless File.directory?(output_dir) - schema_to_yml @output_files = tables.map do |k, v| table_to_csv(k, v[:whitelist], v[:pseudo]) end.compact - + schema_to_yml file_list_to_json + @output_files end @@ -96,12 +96,16 @@ def table_to_schema(table) end def set_schema_column_types(table, type_results) + has_id = type_results.any? {|c| c[:name] == "id" } + type_results.each do |type_result| - @schema[table][type_result[:name]] = type_result[:data_type] + @schema[table.to_s][type_result[:name]] = type_result[:data_type] end - # hard coded because all mapping keys in GL are id - @schema[table]["gl_mapping_key"] = "id" + if has_id + # if there is an ID, it is the mapping_key + @schema[table.to_s]["gl_mapping_key"] = "id" + end end def write_to_csv_file(table, contents) -- GitLab From 376fed3c7e946c70b25c76b661e90f5afa2bff29 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mica=C3=ABl=20Bergeron?= Date: Fri, 22 Jun 2018 08:33:19 -0400 Subject: [PATCH 55/63] make sure the feature naming is consistent --- ee/app/helpers/ee/application_settings_helper.rb | 6 +++--- {app => ee/app}/workers/pseudonymizer_worker.rb | 0 2 files changed, 3 insertions(+), 3 deletions(-) rename {app => ee/app}/workers/pseudonymizer_worker.rb (100%) diff --git a/ee/app/helpers/ee/application_settings_helper.rb b/ee/app/helpers/ee/application_settings_helper.rb index c464c7ca095b..c85d85efcfbb 100644 --- a/ee/app/helpers/ee/application_settings_helper.rb +++ b/ee/app/helpers/ee/application_settings_helper.rb @@ -36,15 +36,15 @@ def external_authorization_client_pass_help_text end def pseudonymizer_enabled_help_text - _("Enable Pseudonymizer data export") + _("Enable Pseudonymizer data collection") end def pseudonymizer_description_text - _("GitLab will run the pseudonymizer cron job which will output pseudoanonymized data to be processed and analyzed.") + _("GitLab will run the pseudonymizer data collection which will output pseudonymized data to be processed and analyzed.") end def pseudonymizer_disabled_description_text - _("The pseudonymizer database cron job is disabled. When enabled the cron job will send pseudoanonymized data to be processed and analyzed.") + _("The pseudonymizer data collection is disabled. When enabled, it will send pseudonymized data to be processed and analyzed.") end override :visible_attributes diff --git a/app/workers/pseudonymizer_worker.rb b/ee/app/workers/pseudonymizer_worker.rb similarity index 100% rename from app/workers/pseudonymizer_worker.rb rename to ee/app/workers/pseudonymizer_worker.rb -- GitLab From 90e0d25e638673b6d63318aec35f21abbdd65633 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mica=C3=ABl=20Bergeron?= Date: Fri, 22 Jun 2018 08:33:43 -0400 Subject: [PATCH 56/63] make the application settings EE-CE friendly --- .../admin/application_settings/show.html.haml | 14 ++------------ .../_pseudonymizer_settings.html.haml | 11 +++++++++++ 2 files changed, 13 insertions(+), 12 deletions(-) create mode 100644 ee/app/views/admin/application_settings/_pseudonymizer_settings.html.haml diff --git a/app/views/admin/application_settings/show.html.haml b/app/views/admin/application_settings/show.html.haml index fef287f88862..09eb5e14b02c 100644 --- a/app/views/admin/application_settings/show.html.haml +++ b/app/views/admin/application_settings/show.html.haml @@ -362,18 +362,6 @@ .settings-content = render partial: 'elasticsearch_form' -- if Gitlab::CurrentSettings.pseudonymizer_can_be_configured? - %section.settings.as-pseudonymizer.no-animate#js-pseudonymizer-settings{ class: ('expanded' if expanded) } - .settings-header - %h4 - = _('Pseudonymizer data collection') - %button.btn.btn-default.js-settings-toggle{ type: 'button' } - = expanded ? _('Collapse') : _('Expand') - %p - = _('Enable or disable the Pseudonymizer data collection.') - .settings-content - = render 'pseudonymizer' - - if Gitlab.com? || Rails.env.development? %section.settings.as-slack.no-animate#js-slack-settings{ class: ('expanded' if expanded) } .settings-header @@ -385,3 +373,5 @@ = _('Geo allows you to replicate your GitLab instance to other geographical locations.') .settings-content = render partial: 'slack' + += render_if_exists 'pseudonymizer_settings' diff --git a/ee/app/views/admin/application_settings/_pseudonymizer_settings.html.haml b/ee/app/views/admin/application_settings/_pseudonymizer_settings.html.haml new file mode 100644 index 000000000000..70aeabbf9fed --- /dev/null +++ b/ee/app/views/admin/application_settings/_pseudonymizer_settings.html.haml @@ -0,0 +1,11 @@ +- if Gitlab::CurrentSettings.pseudonymizer_can_be_configured? + %section.settings.as-pseudonymizer.no-animate#js-pseudonymizer-settings{ class: ('expanded' if expanded) } + .settings-header + %h4 + = _('Pseudonymizer data collection') + %button.btn.btn-default.js-settings-toggle{ type: 'button' } + = expanded ? _('Collapse') : _('Expand') + %p + = _('Enable or disable the Pseudonymizer data collection.') + .settings-content + = render 'pseudonymizer' -- GitLab From 52668210ea2f5f310a5f5ae28988c556322c3ab5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mica=C3=ABl=20Bergeron?= Date: Fri, 22 Jun 2018 09:42:42 -0400 Subject: [PATCH 57/63] move pseudonymizer files to the correct ee/ location --- {lib => ee/lib}/pseudonymizer/dumper.rb | 0 {lib => ee/lib}/pseudonymizer/filter.rb | 0 {lib => ee/lib}/pseudonymizer/options.rb | 0 {lib => ee/lib}/pseudonymizer/pager.rb | 0 {lib => ee/lib}/pseudonymizer/uploader.rb | 0 ee/lib/tasks/gitlab/db.rake | 26 +++++++++++++++++++ .../spec}/lib/pseudonymizer/dumper_spec.rb | 0 .../spec}/lib/pseudonymizer/pager_spec.rb | 0 .../spec}/lib/pseudonymizer/uploader_spec.rb | 0 lib/tasks/gitlab/db.rake | 23 ---------------- 10 files changed, 26 insertions(+), 23 deletions(-) rename {lib => ee/lib}/pseudonymizer/dumper.rb (100%) rename {lib => ee/lib}/pseudonymizer/filter.rb (100%) rename {lib => ee/lib}/pseudonymizer/options.rb (100%) rename {lib => ee/lib}/pseudonymizer/pager.rb (100%) rename {lib => ee/lib}/pseudonymizer/uploader.rb (100%) create mode 100644 ee/lib/tasks/gitlab/db.rake rename {spec => ee/spec}/lib/pseudonymizer/dumper_spec.rb (100%) rename {spec => ee/spec}/lib/pseudonymizer/pager_spec.rb (100%) rename {spec => ee/spec}/lib/pseudonymizer/uploader_spec.rb (100%) diff --git a/lib/pseudonymizer/dumper.rb b/ee/lib/pseudonymizer/dumper.rb similarity index 100% rename from lib/pseudonymizer/dumper.rb rename to ee/lib/pseudonymizer/dumper.rb diff --git a/lib/pseudonymizer/filter.rb b/ee/lib/pseudonymizer/filter.rb similarity index 100% rename from lib/pseudonymizer/filter.rb rename to ee/lib/pseudonymizer/filter.rb diff --git a/lib/pseudonymizer/options.rb b/ee/lib/pseudonymizer/options.rb similarity index 100% rename from lib/pseudonymizer/options.rb rename to ee/lib/pseudonymizer/options.rb diff --git a/lib/pseudonymizer/pager.rb b/ee/lib/pseudonymizer/pager.rb similarity index 100% rename from lib/pseudonymizer/pager.rb rename to ee/lib/pseudonymizer/pager.rb diff --git a/lib/pseudonymizer/uploader.rb b/ee/lib/pseudonymizer/uploader.rb similarity index 100% rename from lib/pseudonymizer/uploader.rb rename to ee/lib/pseudonymizer/uploader.rb diff --git a/ee/lib/tasks/gitlab/db.rake b/ee/lib/tasks/gitlab/db.rake new file mode 100644 index 000000000000..f45821211527 --- /dev/null +++ b/ee/lib/tasks/gitlab/db.rake @@ -0,0 +1,26 @@ +namespace :gitlab do + namespace :db do + desc 'Output pseudonymity dump of selected tables' + task pseudonymizer: :environment do + abort "The pseudonymizer is not available with this license." unless License.feature_available?(:pseudonymizer) + abort "The pseudonymizer is disabled." unless Gitlab::CurrentSettings.pseudonymizer_enabled? + + options = Pseudonymizer::Options.new( + config: YAML.load_file(Gitlab.config.pseudonymizer.manifest), + output_dir: ENV['PSEUDONYMIZER_OUTPUT_DIR'] + ) + + dumper = Pseudonymizer::Dumper.new(options) + uploader = Pseudonymizer::Uploader.new(options) + + abort "There is an error in the pseudonymizer object store configuration." unless uploader.available? + + begin + dumper.tables_to_csv + uploader.upload + ensure + uploader.cleanup + end + end + end +end diff --git a/spec/lib/pseudonymizer/dumper_spec.rb b/ee/spec/lib/pseudonymizer/dumper_spec.rb similarity index 100% rename from spec/lib/pseudonymizer/dumper_spec.rb rename to ee/spec/lib/pseudonymizer/dumper_spec.rb diff --git a/spec/lib/pseudonymizer/pager_spec.rb b/ee/spec/lib/pseudonymizer/pager_spec.rb similarity index 100% rename from spec/lib/pseudonymizer/pager_spec.rb rename to ee/spec/lib/pseudonymizer/pager_spec.rb diff --git a/spec/lib/pseudonymizer/uploader_spec.rb b/ee/spec/lib/pseudonymizer/uploader_spec.rb similarity index 100% rename from spec/lib/pseudonymizer/uploader_spec.rb rename to ee/spec/lib/pseudonymizer/uploader_spec.rb diff --git a/lib/tasks/gitlab/db.rake b/lib/tasks/gitlab/db.rake index 47c9660643ef..139ab70e1259 100644 --- a/lib/tasks/gitlab/db.rake +++ b/lib/tasks/gitlab/db.rake @@ -69,28 +69,5 @@ namespace :gitlab do Gitlab::DowntimeCheck.new.check_and_print(migrations) end - - desc 'Output pseudonymity dump of selected tables' - task pseudonymizer: :environment do - abort "The pseudonymizer is not available with this license." unless License.feature_available?(:pseudonymizer) - abort "The pseudonymizer is disabled." unless Gitlab::CurrentSettings.pseudonymizer_enabled? - - options = Pseudonymizer::Options.new( - config: YAML.load_file(Gitlab.config.pseudonymizer.manifest), - output_dir: ENV['PSEUDONYMIZER_OUTPUT_DIR'] - ) - - dumper = Pseudonymizer::Dumper.new(options) - uploader = Pseudonymizer::Uploader.new(options) - - abort "There is an error in the pseudonymizer object store configuration." unless uploader.available? - - begin - dumper.tables_to_csv - uploader.upload - ensure - uploader.cleanup - end - end end end -- GitLab From e7415e3e8f449e5b0c99ba561e9d9e5e4450e172 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mica=C3=ABl=20Bergeron?= Date: Fri, 22 Jun 2018 13:30:14 -0400 Subject: [PATCH 58/63] remove the Gitlab.config.pseudonymizer.enabled configuration --- config/gitlab.yml.example | 2 -- config/initializers/1_settings.rb | 1 - db/schema.rb | 2 +- doc/administration/pseudonymizer.md | 2 -- ee/app/models/ee/application_setting.rb | 8 ++------ .../_pseudonymizer_settings.html.haml | 2 +- ...4_add_pseudonymizer_enabled_to_application_settings.rb | 2 +- 7 files changed, 5 insertions(+), 14 deletions(-) diff --git a/config/gitlab.yml.example b/config/gitlab.yml.example index 052f3908658e..4f7fa4207f98 100644 --- a/config/gitlab.yml.example +++ b/config/gitlab.yml.example @@ -732,7 +732,6 @@ production: &base ## Pseudonymizer exporter pseudonymizer: - enabled: false # Tables manifest that specifies the fields to extract and pseudonymize. manifest: config/pseudonymizer.yml upload: @@ -896,7 +895,6 @@ test: backup: path: tmp/tests/backups pseudonymizer: - enabled: false manifest: config/pseudonymizer.yml upload: # The remote 'directory' to store the CSV files. For S3, this would be the bucket name. diff --git a/config/initializers/1_settings.rb b/config/initializers/1_settings.rb index a99a6c14e9cf..7f230c892410 100644 --- a/config/initializers/1_settings.rb +++ b/config/initializers/1_settings.rb @@ -478,7 +478,6 @@ # Pseudonymizer # Settings['pseudonymizer'] ||= Settingslogic.new({}) -Settings.pseudonymizer['enabled'] = false if Settings.pseudonymizer['enabled'].nil? Settings.pseudonymizer['manifest'] = Settings.absolute(Settings.pseudonymizer['manifest'] || Rails.root.join("config/pseudonymizer.yml")) Settings.pseudonymizer['upload'] ||= Settingslogic.new({ 'remote_directory' => nil, 'connection' => nil }) # Settings.pseudonymizer['upload']['multipart_chunk_size'] ||= 104857600 diff --git a/db/schema.rb b/db/schema.rb index 817b6283a6a6..b153f7baad52 100644 --- a/db/schema.rb +++ b/db/schema.rb @@ -206,7 +206,7 @@ t.string "encrypted_external_auth_client_key_pass_iv" t.string "email_additional_text" t.boolean "enforce_terms", default: false - t.boolean "pseudonymizer_enabled", default: true, null: false + t.boolean "pseudonymizer_enabled", default: false, null: false end create_table "approvals", force: :cascade do |t| diff --git a/doc/administration/pseudonymizer.md b/doc/administration/pseudonymizer.md index 8555e09df93f..b27e1afa7419 100644 --- a/doc/administration/pseudonymizer.md +++ b/doc/administration/pseudonymizer.md @@ -30,7 +30,6 @@ To configure the pseudonymizer, you need to: the values you want: ```ruby - gitlab_rails['pseudonymizer_enabled'] = true gitlab_rails['pseudonymizer_manifest'] = 'config/pseudonymizer.yml' gitlab_rails['pseudonymizer_upload_remote_directory'] = 'gitlab-elt' gitlab_rails['pseudonymizer_upload_connection'] = { @@ -64,7 +63,6 @@ To configure the pseudonymizer, you need to: ```yaml pseudonymizer: - enabled: true manifest: config/pseudonymizer.yml upload: remote_directory: 'gitlab-elt' # The bucket name diff --git a/ee/app/models/ee/application_setting.rb b/ee/app/models/ee/application_setting.rb index 78fcd7544d19..7a3cc80a5f24 100644 --- a/ee/app/models/ee/application_setting.rb +++ b/ee/app/models/ee/application_setting.rb @@ -101,7 +101,7 @@ def defaults slack_app_id: nil, slack_app_secret: nil, slack_app_verification_token: nil, - pseudonymizer_enabled: true + pseudonymizer_enabled: false ) end end @@ -110,12 +110,8 @@ def pseudonymizer_available? License.feature_available?(:pseudonymizer) end - def pseudonymizer_can_be_configured? - Settings.pseudonymizer.enabled && pseudonymizer_available? - end - def pseudonymizer_enabled? - pseudonymizer_can_be_configured? && super + pseudonymizer_available? && super end def should_check_namespace_plan? diff --git a/ee/app/views/admin/application_settings/_pseudonymizer_settings.html.haml b/ee/app/views/admin/application_settings/_pseudonymizer_settings.html.haml index 70aeabbf9fed..f65acf10ff86 100644 --- a/ee/app/views/admin/application_settings/_pseudonymizer_settings.html.haml +++ b/ee/app/views/admin/application_settings/_pseudonymizer_settings.html.haml @@ -1,4 +1,4 @@ -- if Gitlab::CurrentSettings.pseudonymizer_can_be_configured? +- if Gitlab::CurrentSettings.pseudonymizer_available? %section.settings.as-pseudonymizer.no-animate#js-pseudonymizer-settings{ class: ('expanded' if expanded) } .settings-header %h4 diff --git a/ee/db/migrate/20180531221734_add_pseudonymizer_enabled_to_application_settings.rb b/ee/db/migrate/20180531221734_add_pseudonymizer_enabled_to_application_settings.rb index 9d85a66bc431..7517e78a6187 100644 --- a/ee/db/migrate/20180531221734_add_pseudonymizer_enabled_to_application_settings.rb +++ b/ee/db/migrate/20180531221734_add_pseudonymizer_enabled_to_application_settings.rb @@ -26,6 +26,6 @@ class AddPseudonymizerEnabledToApplicationSettings < ActiveRecord::Migration # disable_ddl_transaction! def change - add_column :application_settings, :pseudonymizer_enabled, :boolean, null: false, default: true + add_column :application_settings, :pseudonymizer_enabled, :boolean, null: false, default: false end end -- GitLab From 3735e9fad9f1d152209f980c9ba9939471dee462 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mica=C3=ABl=20Bergeron?= Date: Fri, 22 Jun 2018 15:09:27 -0400 Subject: [PATCH 59/63] apply feedback --- doc/administration/pseudonymizer.md | 4 +++- ee/app/workers/pseudonymizer_worker.rb | 16 +++++++++++++--- ee/lib/pseudonymizer/dumper.rb | 1 + 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/doc/administration/pseudonymizer.md b/doc/administration/pseudonymizer.md index b27e1afa7419..df08a9453582 100644 --- a/doc/administration/pseudonymizer.md +++ b/doc/administration/pseudonymizer.md @@ -22,7 +22,9 @@ To configure the pseudonymizer, you need to: - Provide a manifest file that describes which fields should be included or pseudonymized ([example `manifest.yml` file](https://gitlab.com/gitlab-org/gitlab-ee/tree/master/config/pseudonymizer.yml)). -- Use an object storage + A default manifest is provided with the GitLab installation. Using a relative file path will be resolved from the Rails root. + Alternatively, you can use an absolute file path. +- Use an object storage and specify the connection parameters in the `pseudonymizer.upload.connection` configuration option. **For Omnibus installations:** diff --git a/ee/app/workers/pseudonymizer_worker.rb b/ee/app/workers/pseudonymizer_worker.rb index 51e4e7b9af9b..829226d5f850 100644 --- a/ee/app/workers/pseudonymizer_worker.rb +++ b/ee/app/workers/pseudonymizer_worker.rb @@ -3,8 +3,15 @@ class PseudonymizerWorker include CronjobQueue def perform - abort "The pseudonymizer is not available with this license." unless License.feature_available?(:pseudonymizer) - abort "The pseudonymizer is disabled." unless Gitlab::CurrentSettings.pseudonymizer_enabled? + unless License.feature_available?(:pseudonymizer) + Rails.logger.warn("The pseudonymizer is not available with this license.") + return + end + + unless unless Gitlab::CurrentSettings.pseudonymizer_enabled? + Rails.logger.info("The pseudonymizer is disabled.") + return + end options = Pseudonymizer::Options.new( config: YAML.load_file(Gitlab.config.pseudonymizer.manifest), @@ -14,7 +21,10 @@ def perform dumper = Pseudonymizer::Dumper.new(options) uploader = Pseudonymizer::Uploader.new(options, progress_output: File.open(File::NULL, "w")) - abort "The pseudonymizer object storage must be configured." unless uploader.available? + unless uploader.available? + Rails.logger.error("The pseudonymizer object storage must be configured.") + return + end begin dumper.tables_to_csv diff --git a/ee/lib/pseudonymizer/dumper.rb b/ee/lib/pseudonymizer/dumper.rb index 62f44c0a5a40..fa69281b4485 100644 --- a/ee/lib/pseudonymizer/dumper.rb +++ b/ee/lib/pseudonymizer/dumper.rb @@ -24,6 +24,7 @@ def tables_to_csv @output_files = tables.map do |k, v| table_to_csv(k, v[:whitelist], v[:pseudo]) end.compact + schema_to_yml file_list_to_json -- GitLab From 464bccf5af9db7104229f22b73890fe23714aa2e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mica=C3=ABl=20Bergeron?= Date: Fri, 22 Jun 2018 15:18:34 -0400 Subject: [PATCH 60/63] remove dupplicate unless --- ee/app/workers/pseudonymizer_worker.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ee/app/workers/pseudonymizer_worker.rb b/ee/app/workers/pseudonymizer_worker.rb index 829226d5f850..a0a4e11983c3 100644 --- a/ee/app/workers/pseudonymizer_worker.rb +++ b/ee/app/workers/pseudonymizer_worker.rb @@ -8,7 +8,7 @@ def perform return end - unless unless Gitlab::CurrentSettings.pseudonymizer_enabled? + unless Gitlab::CurrentSettings.pseudonymizer_enabled? Rails.logger.info("The pseudonymizer is disabled.") return end -- GitLab From 521e3a7a07132d25084f9fe97fe7975e90216a2f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mica=C3=ABl=20Bergeron?= Date: Fri, 22 Jun 2018 15:34:32 -0400 Subject: [PATCH 61/63] remove unused whitespace --- config/gitlab.yml.example | 1 - 1 file changed, 1 deletion(-) diff --git a/config/gitlab.yml.example b/config/gitlab.yml.example index 4f7fa4207f98..dbc521c3d401 100644 --- a/config/gitlab.yml.example +++ b/config/gitlab.yml.example @@ -263,7 +263,6 @@ production: &base # Remove outdated repository archives repository_archive_cache_worker: cron: "0 * * * *" - # Verify custom GitLab Pages domains pages_domain_verification_cron_worker: cron: "*/15 * * * *" -- GitLab From 56672d4aa06e6aa2d9bca588b7beac4b0be30b9e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mica=C3=ABl=20Bergeron?= Date: Fri, 22 Jun 2018 16:43:50 -0400 Subject: [PATCH 62/63] apply feedback --- app/views/admin/application_settings/show.html.haml | 2 +- ee/app/helpers/ee/application_settings_helper.rb | 4 ++-- ee/app/workers/pseudonymizer_worker.rb | 10 +--------- ee/lib/pseudonymizer/pager.rb | 4 ++-- ee/lib/pseudonymizer/uploader.rb | 10 +++++----- 5 files changed, 11 insertions(+), 19 deletions(-) diff --git a/app/views/admin/application_settings/show.html.haml b/app/views/admin/application_settings/show.html.haml index 09eb5e14b02c..f2e2cd3e2fab 100644 --- a/app/views/admin/application_settings/show.html.haml +++ b/app/views/admin/application_settings/show.html.haml @@ -374,4 +374,4 @@ .settings-content = render partial: 'slack' -= render_if_exists 'pseudonymizer_settings' += render_if_exists 'admin/application_settings/pseudonymizer_settings', expanded: expanded diff --git a/ee/app/helpers/ee/application_settings_helper.rb b/ee/app/helpers/ee/application_settings_helper.rb index c85d85efcfbb..006e3c738d39 100644 --- a/ee/app/helpers/ee/application_settings_helper.rb +++ b/ee/app/helpers/ee/application_settings_helper.rb @@ -40,11 +40,11 @@ def pseudonymizer_enabled_help_text end def pseudonymizer_description_text - _("GitLab will run the pseudonymizer data collection which will output pseudonymized data to be processed and analyzed.") + _("GitLab will run a background job that will produce pseudonymized CSVs of the GitLab database that will be uploaded to your configured object storage directory.") end def pseudonymizer_disabled_description_text - _("The pseudonymizer data collection is disabled. When enabled, it will send pseudonymized data to be processed and analyzed.") + _("The pseudonymizer data collection is disabled. When enabled, GitLab will run a background job that will produce pseudonymized CSVs of the GitLab database that will be uploaded to your configured object storage directory.") end override :visible_attributes diff --git a/ee/app/workers/pseudonymizer_worker.rb b/ee/app/workers/pseudonymizer_worker.rb index a0a4e11983c3..ce268c97a34e 100644 --- a/ee/app/workers/pseudonymizer_worker.rb +++ b/ee/app/workers/pseudonymizer_worker.rb @@ -3,15 +3,7 @@ class PseudonymizerWorker include CronjobQueue def perform - unless License.feature_available?(:pseudonymizer) - Rails.logger.warn("The pseudonymizer is not available with this license.") - return - end - - unless Gitlab::CurrentSettings.pseudonymizer_enabled? - Rails.logger.info("The pseudonymizer is disabled.") - return - end + return unless Gitlab::CurrentSettings.pseudonymizer_enabled? options = Pseudonymizer::Options.new( config: YAML.load_file(Gitlab.config.pseudonymizer.manifest), diff --git a/ee/lib/pseudonymizer/pager.rb b/ee/lib/pseudonymizer/pager.rb index ab0e066c621b..4c1b9f2b1e46 100644 --- a/ee/lib/pseudonymizer/pager.rb +++ b/ee/lib/pseudonymizer/pager.rb @@ -29,7 +29,7 @@ def pages_per_id(&block) ORDER BY id LIMIT #{PAGE_SIZE} SQL - Rails.logger.debug("#{self.class.name} fetch ids [#{id_offset}, +#{PAGE_SIZE}[") + Rails.logger.debug("#{self.class.name} fetch ids [#{id_offset}..+#{PAGE_SIZE}]") break if results.empty? id_offset = results.last["id"].to_i @@ -50,7 +50,7 @@ def pages_per_offset(&block) ORDER BY #{@columns.join(",")} LIMIT #{PAGE_SIZE} OFFSET #{offset} SQL - Rails.logger.debug("#{self.class.name} fetching offset [#{offset}, #{offset + PAGE_SIZE}[") + Rails.logger.debug("#{self.class.name} fetching offset [#{offset}..#{offset + PAGE_SIZE}]") break if results.empty? offset += PAGE_SIZE diff --git a/ee/lib/pseudonymizer/uploader.rb b/ee/lib/pseudonymizer/uploader.rb index bdbdb666674b..65c69ade374a 100644 --- a/ee/lib/pseudonymizer/uploader.rb +++ b/ee/lib/pseudonymizer/uploader.rb @@ -37,7 +37,7 @@ def upload upload_file(file, remote_directory) end rescue ObjectStorageUnavailableError - abort "Cannot upload files, make sure the `pseudonimizer.upload.connection` is set properly".color(:red) + abort "Cannot upload files, make sure the `pseudonimizer.upload.connection` is set properly" end def cleanup @@ -45,9 +45,9 @@ def cleanup progress_output.print "Deleting tmp directory #{@output_dir} ... " FileUtils.rm_rf(@output_dir) - progress_output.puts "done".color(:green) + progress_output.puts "done" rescue - progress_output.puts "failed".color(:red) + progress_output.puts "failed" end private @@ -60,9 +60,9 @@ def upload_file(file, directory) if directory.files.create(key: File.join(@upload_dir, File.basename(file)), body: File.open(file), public: false) - progress_output.puts "done".color(:green) + progress_output.puts "done" else - progress_output.puts "failed".color(:red) + progress_output.puts "failed" end end -- GitLab From 4b90a8f8b753103080ee8f178d53868a3613fdea Mon Sep 17 00:00:00 2001 From: Jacob Schatz Date: Mon, 25 Jun 2018 09:43:55 -0400 Subject: [PATCH 63/63] Add line back in. --- config/gitlab.yml.example | 1 + 1 file changed, 1 insertion(+) diff --git a/config/gitlab.yml.example b/config/gitlab.yml.example index dbc521c3d401..c61bb52c694f 100644 --- a/config/gitlab.yml.example +++ b/config/gitlab.yml.example @@ -263,6 +263,7 @@ production: &base # Remove outdated repository archives repository_archive_cache_worker: cron: "0 * * * *" + # Verify custom GitLab Pages domains pages_domain_verification_cron_worker: cron: "*/15 * * * *" -- GitLab