# This script downloads all the recordings from a Zoom account and
# uploads them to a directory on Google Drive. The first attempt
# to upload to Google Drive will attempt to authenticate and
# create a local config.json with the credentials.
#
# Prerequisite: the mediainfo tool must be installed
# (apt-get install mediainfo, or apk add mediainfo).
require 'date'
require 'json'
require 'net/http'
require 'open-uri'
require 'ostruct'
require 'pp'
require 'stringio'
require 'yaml'

require 'google_drive'
require 'httparty'
require 'jwt'
require 'mediainfo'

module GoogleDrive
  # Monkey patches google-drive-ruby to support shared drives: the root
  # collection is resolved from the id in the GOOGLE_DRIVE_ID environment
  # variable.
  class Session
    # Returns the collection found via file_by_id for GOOGLE_DRIVE_ID. The
    # lookup result is cached in @root_collection and reused on later calls.
    def root_collection
      return @root_collection if @root_collection

      @root_collection = file_by_id(ENV['GOOGLE_DRIVE_ID'])
    end
  end
end

# Client to access Zoom.us recordings and paginate records
class ZoomClient
  include HTTParty

  # Value sent as the `page_size` query parameter on every paginated request.
  MAX_PAGE_SIZE = 300
  # The literal USER_ID is substituted with the actual user id per request.
  RECORDINGS_URL = 'https://api.zoom.us/v2/users/USER_ID/recordings'.freeze
  USERS_URL = 'https://api.zoom.us/v2/users'.freeze

  attr_reader :api_key, :api_secret

  headers 'Accept' => 'application/json'
  headers 'Content-Type' => 'application/json'

  # @param config [Hash] must provide the 'api_key' and 'api_secret' entries
  def initialize(config)
    @api_key = config['api_key']
    @api_secret = config['api_secret']
  end

  # @return [Enumerator] lazily yields every entry of the 'users' list,
  #   fetching additional pages on demand
  def users
    paginated_get(USERS_URL, 'users')
  end

  # Since we no longer delete recordings, we need to time-bound the
  # recording date to avoid having to process an inordinate number of
  # recordings.
  #
  # @param user_id [String] substituted into RECORDINGS_URL
  # @param from [#to_s] start of the range (defaults to yesterday)
  # @param to [#to_s] end of the range (defaults to today)
  # @return [Enumerator] lazily yields every entry of the 'meetings' list
  def recordings(user_id:, from: Date.today.prev_day, to: Date.today)
    paginated_get(RECORDINGS_URL.sub('USER_ID', user_id),
                  'meetings',
                  { 'from' => from.to_s, 'to' => to.to_s })
  end

  # Downloads +url+ (following redirects) into the local file +filename+.
  #
  # @return [Integer] number of bytes written to +filename+
  def download_file(url, filename)
    bytes_written = 0

    File.open(filename, 'wb') do |file|
      # stream_body keeps HTTParty from also accumulating the whole body in
      # memory; each fragment is written to disk as it arrives.
      self.class.get(url, follow_redirects: true, stream_body: true) do |fragment|
        bytes_written += file.write(fragment)
      end
    end

    bytes_written
  end

  # Builds a fresh HS256-signed JWT with +api_key+ as issuer, signed with
  # +api_secret+, expiring 24 hours (86_400 seconds) from now.
  #
  # @return [String] the encoded token
  def access_token
    JWT.encode({ iss: api_key, exp: Time.now.to_i + 86_400 }, api_secret, 'HS256', { typ: 'JWT' })
  end

  private

  # Headers sent with every paginated API request, including a bearer token.
  def request_headers
    {
      'Accept' => 'application/json',
      'Content-Type' => 'application/json',
      'Authorization' => "Bearer #{access_token}"
    }
  end

  # Returns an Enumerator over the +item_name+ list of every page of +url+.
  #
  # Pagination state lives inside the Enumerator block so that the result can
  # be enumerated more than once, and the caller's +options+ hash is left
  # untouched. A non-200 response prints an error and terminates the process
  # with a non-zero exit status.
  #
  # @param url [String] endpoint to query
  # @param item_name [String] key of the list inside each response body
  # @param options [Hash] extra query parameters
  # @return [Enumerator]
  def paginated_get(url, item_name, options = {})
    Enumerator.new do |yielder|
      query = options.merge('page_size' => MAX_PAGE_SIZE)
      page_number = 1

      loop do
        query['page_number'] = page_number
        res = self.class.get(url, query: query, headers: request_headers)

        if res.code != 200
          puts "Error retrieving #{url}: #{res.code}"
          exit(1)
        end

        body = JSON.parse(res.body)
        total_pages = body['page_count']

        # No page count in the response: nothing to paginate over.
        break unless total_pages

        (body[item_name] || []).each { |element| yielder << element }

        break if page_number >= total_pages

        page_number += 1
      end
    end
  end
end

# Client to access Google Drive
class GoogleDriveClient
  # Google Drive treats anything < 50K as 0 bytes, so the size of such
  # uploads cannot be verified.
  SMALL_FILE_THRESHOLD_BYTES = 51_200

  attr_reader :session, :gdrive_id

  # @param config service account key, passed straight to
  #   GoogleDrive::Session.from_service_account_key
  # @param gdrive_id [String] stored and exposed via the gdrive_id reader
  def initialize(config, gdrive_id)
    @session = GoogleDrive::Session.from_service_account_key(config)
    @gdrive_id = gdrive_id
  end

  # The collection all folder paths are resolved against.
  def base_collection
    @session.root_collection
  end

  # If you want to store videos in a folder, change this
  def base_folder
    []
  end

  # Checks whether +filename+ already exists in the folder path +folders+
  # (relative to base_folder) with exactly +file_size+ bytes.
  #
  # @param folders [Array<String>] folder path below base_folder
  # @param filename [String]
  # @param file_size [Integer] expected size in bytes
  # @return [Boolean] false when the folder or file is missing or the size
  #   differs
  def valid_file?(folders, filename, file_size)
    # Resolve relative to base_folder, the same way mkdir creates folders.
    dest_folder = base_collection.file_by_title(base_folder + Array(folders))

    return false unless dest_folder

    dest_file = dest_folder.file_by_title(filename)
    puts "Checking for existence #{filename}..."

    return false unless dest_file

    dest_file_size = dest_file.size.to_i

    if dest_file_size != file_size
      puts "Google Drive has #{filename} with #{dest_file_size} bytes, expecting #{file_size}"
    end

    dest_file_size == file_size
  end

  # Ensures every folder of +sub_directories+ exists below base_folder,
  # creating the missing ones level by level.
  #
  # @param sub_directories [Array<String>]
  # @return the innermost folder, or nil when +sub_directories+ is empty
  def mkdir(sub_directories)
    current_path = base_folder
    dest_folder = nil

    sub_directories.each do |dir|
      parent_folder = base_collection.file_by_title(current_path)
      current_path += [dir]
      dest_folder = base_collection.file_by_title(current_path)

      next if dest_folder

      puts "Creating destination folder #{current_path}"
      dest_folder = parent_folder.create_subcollection(dir)
    end

    dest_folder
  end

  # Uploads the local file +filename+ into the folder path
  # +sub_directories+, replacing any existing file of the same title.
  #
  # @param sub_directories [Array<String>]
  # @param filename [String] local path, also used as the title lookup
  # @param file_size [Integer] expected size in bytes
  # @return [Boolean] true when the upload is believed to have succeeded
  def upload_video(sub_directories, filename, file_size)
    dest_folder = mkdir(sub_directories)

    puts "Uploading #{filename} to #{sub_directories}"
    existing_file = dest_folder.file_by_title(filename)

    if existing_file
      puts "File #{filename} already exists, removing"
      existing_file.delete
    end

    begin
      # Google Drive will convert a .txt to a Google Docs file and strip
      # the extension, which makes it hard to check for duplicates.
      uploaded = dest_folder.upload_from_file(filename, nil, convert: false)
    rescue Google::Apis::ServerError => e
      puts "Google API error: #{e}"
      return false
    end

    # Nothing came back, so there is nothing to verify against.
    return false unless uploaded

    # Small files report 0 bytes, so we have to assume the upload worked
    # if we got something back.
    return true if file_size < SMALL_FILE_THRESHOLD_BYTES

    uploaded.size.to_i == file_size
  end
end

# Iterates through all Zoom.us recordings and transfers them to a folder
# in Google Drive

# rubocop:disable Metrics/ClassLength
class ZoomSyncher
  attr_reader :config, :client, :gdrive_client

  MAX_FILE_SIZE_BYTES = 2_000_000_000 # 2 GB
  # Matches a "[REC]" marker in a meeting topic (case-insensitive, optional
  # whitespace inside the brackets).
  REC_REGEX = /\[\s*REC\s*\]/i.freeze

  # Loads the Zoom configuration and builds the Zoom and Google Drive clients.
  #
  # @raise [RuntimeError] when GOOGLE_DRIVE_ID is not set
  def initialize
    @config = load_zoom_config
    @client = ZoomClient.new(@config)
    @gdrive_client = GoogleDriveClient.new(load_gdrive_config, load_gdrive_id)
  end

  # Scans the recordings of every Zoom user and syncs the selected ones.
  #
  # Environment variables:
  #   START_DATE / END_DATE - date range (defaults: yesterday / today)
  #   EMAIL                 - when set, only this user's recordings are scanned
  def sync
    start_date = ENV['START_DATE'] || Date.today.prev_day
    end_date = ENV['END_DATE'] || Date.today
    # Restrict scan to certain users
    user_email = ENV['EMAIL']

    puts "Searching for recordings from #{start_date} to #{end_date}"

    client.users.each do |user|
      next if user_email && user['email'] != user_email

      puts "Scanning recordings for #{user['email']}"

      client.recordings(user_id: user['id'], from: start_date, to: end_date).each do |recording|
        sync_recording(recording, user['email'])
      end

      # Zoom rate limits API requests, so pause between users
      # to honor their limits
      sleep 1
    end
  end

  # Decides whether a meeting should be synced: either its topic carries the
  # [REC] marker, or an 'allow_list' config entry matches both the host
  # +email+ and (via its 'regex') the topic.
  #
  # @param item [Hash] meeting record; reads 'topic'
  # @param email [String] email of the meeting's host
  # @return [Boolean]
  def sync?(item, email)
    topic = item['topic']
    return true if topic.match(REC_REGEX)

    # The allow list is optional (e.g. when credentials come from ENV only),
    # so treat a missing list as empty instead of crashing on nil.
    Array(config['allow_list']).any? do |entry|
      email == entry['email'] && entry['regex'] && topic.match(entry['regex'])
    end
  end

  # Syncs every recording file of one meeting, if the meeting is selected by
  # sync?.
  #
  # @param item [Hash] meeting record; reads 'topic', 'uuid' and
  #   'recording_files'
  # @param email [String] email of the meeting's host
  def sync_recording(item, email)
    unless sync?(item, email)
      puts "Skipping '#{item['topic']}' as it doesn't include [REC]"
      return
    end

    topic = item['topic'].gsub(REC_REGEX, '').strip
    recording_files = item['recording_files']

    return unless recording_files

    puts "Found #{recording_files.count} recordings"

    # There can be multiple files with the same UUID. Include
    # the index to ensure that we download all recordings.
    recording_files.each_with_index do |file, index|
      sync_recording_file(item['uuid'], topic, email, file, index)
    end
  end

  private

  # Handles a single recording file: skips it when it is still being
  # processed, too large, or already present on Google Drive with the right
  # size; otherwise downloads and uploads it.
  def sync_recording_file(uuid, topic, email, file, index)
    file_type = file['file_type']

    puts "Found recording: #{topic} with type #{file_type}"

    unless file_type
      puts "Skipping #{topic} since no file type is available yet; recording is likely still being processed"
      return
    end

    event_date = Date.rfc3339(file['recording_start']).to_s
    folder = subfolder(email, topic, event_date, file_type)
    basename = sanitize_filename("#{topic}-#{event_date}-#{uuid}-#{index}")
    filename = "#{basename}.#{file_extension(file_type)}"
    file_size = file['file_size']

    if file_size.to_i > MAX_FILE_SIZE_BYTES
      puts "Skipping #{filename}, file size is #{file_size}, limit is #{MAX_FILE_SIZE_BYTES}"
    elsif gdrive_client.valid_file?(folder, filename, file_size)
      puts "Skipping #{filename}, already exists"
    else
      video_url = "#{file['download_url']}?access_token=#{client.access_token}"
      transfer_file(video_url, folder, filename, file_size, file_type)
    end
  end

  # Downloads one file to the working directory and uploads it to Google
  # Drive. The local copy is always removed afterwards, including partial
  # files left behind by a failed download.
  def transfer_file(video_url, folder, filename, file_size, file_type)
    unless download_video(video_url, filename, file_size)
      puts "Failed to download video #{filename}, skipping"
      return
    end

    if should_upload?(filename, file_type)
      success = gdrive_client.upload_video(folder, filename, file_size)

      puts "Failed to upload video #{filename}, skipping" unless success
    else
      puts "Skipping #{filename} because file does not have an audio track > 30 s"
    end
  ensure
    File.delete(filename) if File.exist?(filename)
  end

  # Builds the Zoom configuration from zoom_sync.yml (when present); the
  # ZOOM_API_KEY / ZOOM_API_SECRET environment variables, when both are set,
  # override the credentials from the file.
  #
  # @return [Hash]
  def load_zoom_config
    config = YAML.load_file('zoom_sync.yml') if File.exist?('zoom_sync.yml')
    config ||= {}

    if ENV['ZOOM_API_KEY'] && ENV['ZOOM_API_SECRET']
      config['api_key'] = ENV['ZOOM_API_KEY']
      config['api_secret'] = ENV['ZOOM_API_SECRET']
    end

    config
  end

  # Returns the Google service account key wrapped in a StringIO. The key is
  # taken from GOOGLE_DRIVE_SERVICE_ACCOUNT_KEY, falling back to the contents
  # of config.json.
  #
  # @return [StringIO]
  def load_gdrive_config
    StringIO.new(ENV['GOOGLE_DRIVE_SERVICE_ACCOUNT_KEY'] || File.read('config.json'))
  end

  # @return [String] the value of GOOGLE_DRIVE_ID
  # @raise [RuntimeError] when GOOGLE_DRIVE_ID is not set
  def load_gdrive_id
    ENV['GOOGLE_DRIVE_ID'] || raise('GOOGLE_DRIVE_ID not set')
  end

  # Discard any audio or videos that aren't longer than 30 seconds.
  # Chat and transcript files are always uploaded; timeline files never are.
  def should_upload?(filename, file_type)
    file_type = file_type.downcase

    return true if %w[chat transcript].include?(file_type)
    return false if file_type == 'timeline'

    duration = MediaInfo.from(filename)&.audio&.duration

    duration.to_i > 30
  end

  # Downloads +url+ into +filename+, making up to three attempts.
  #
  # @return [Boolean] true once an attempt wrote exactly +file_size+ bytes,
  #   false after three failed attempts
  def download_video(url, filename, file_size)
    warn "Downloading #{filename}"

    3.times do
      begin
        bytes = client.download_file(url, filename)

        return true if bytes == file_size

        puts "Mismatch in file size: downloaded #{bytes}, expected #{file_size}, retrying..."
      # The download goes through HTTParty, which does not raise
      # OpenURI::HTTPError, so transport failures are listed explicitly.
      rescue OpenURI::HTTPError, HTTParty::Error, Timeout::Error,
             SocketError, Errno::ECONNRESET, Errno::ECONNREFUSED => e
        puts "Error downloading file: #{e}, retrying..."
      end
    end

    false
  end

  # Folder path for a recording: "<prefix>-<topic>" / event date, with an
  # extra 'audio' level for M4A files.
  #
  # @return [Array<String>]
  def subfolder(prefix, topic, event_date, file_type)
    base_name = "#{prefix}-#{topic}"
    return [base_name, event_date, 'audio'] if file_type.casecmp('m4a').zero?

    [base_name, event_date]
  end

  # Maps a Zoom file type to a file extension: chat -> txt,
  # transcript -> transcript.txt, anything else -> the lowercased type.
  def file_extension(file_type)
    case (normalized = file_type.downcase)
    when 'chat' then 'txt'
    when 'transcript' then 'transcript.txt'
    else normalized
    end
  end

  # Lowercases and strips +filename+, turns spaces into dashes and drops
  # every character that is neither a word character nor a dash.
  def sanitize_filename(filename)
    filename.downcase.strip.gsub(' ', '-').gsub(/[^\w-]/, '')
  end
end
# rubocop:enable Metrics/ClassLength
Lukas 'Eipi' Eipert's avatar
Lukas 'Eipi' Eipert committed
394

Alex Hanselka's avatar
Alex Hanselka committed
395 396 397 398
if $PROGRAM_NAME == __FILE__
  syncher = ZoomSyncher.new
  syncher.sync
end