sync.py 5.73 KB
Newer Older
1 2 3 4
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
from scipy.signal import medfilt
Rémi Bèges's avatar
Rémi Bèges committed
5 6
from scipy.ndimage.interpolation import shift
from scipy.ndimage import zoom
7 8
from sklearn.metrics import log_loss
import scipy
Rémi Bèges's avatar
Rémi Bèges committed
9
import warnings
10

Rémi Bèges's avatar
Rémi Bèges committed
11 12
from autosync.dataset import *
from autosync.utils import *
13

14 15 16 17 18 19 20 21 22
def _predict_speech(audio_slices, model_dir):
    '''
    Runs the saved speech-detection model on the extracted audio.

    Attributes:
        audio_slices    Value fed to the model's 'audio_mfcc' input tensor
        model_dir       Directory of the TensorFlow SavedModel to load

    Returns the evaluated 'has_speech' output of the serving signature.
    '''
    graph = tf.Graph()
    with graph.as_default():
        with tf.Session() as sess:
            meta_graph_def = tf.saved_model.loader.load(
                sess,
                [tf.saved_model.tag_constants.SERVING],
                model_dir,
            )
            signature = meta_graph_def.signature_def
            signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
            print('Model signature:', signature)
            serving = signature[signature_key]

            # Resolve the signature's tensor names to the tensors of the loaded graph
            x = sess.graph.get_tensor_by_name(serving.inputs['audio_mfcc'].name)
            y = sess.graph.get_tensor_by_name(serving.outputs['has_speech'].name)

            return sess.run(y, feed_dict={
                x: audio_slices,
            })


def _make_alignment_loss(speech, sub):
    '''
    Builds the objective function minimized to align `sub` with `speech`.

    Attributes:
        speech      1-D array of predictions scored against the subtitle mask
        sub         1-D 0/1 array of the same length built from the subtitle

    Returns a function taking (delay, fps) -- a shift in blocks and a stretch
    ratio -- and returning the log loss between the transformed subtitle mask
    and `speech`.
    '''
    def loss(params):
        delay, fps = params
        delay = int(delay)

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            # Hypothesis is 1st order performs better than default 3rd order
            stretched = zoom(sub, fps, order=1)
            # Rounding avoids truncation when converting to uint8 below
            # and results in a more accurate result
            stretched = np.round(stretched)

        # Zoom changes the array size: pad the end with 0s or crop it so the
        # mask keeps the length of the original sequence
        target_len = sub.shape[0]
        if stretched.shape[0] < target_len:
            missing = target_len - stretched.shape[0]
            stretched = np.pad(stretched, (0, missing), 'constant')
        else:
            stretched = stretched[:target_len]

        # order=0 keeps the mask strictly 0/1: the delay is an integer, and
        # the default cubic spline can return values such as 0.999... that
        # the uint8 conversion would truncate to 0
        shifted = shift(stretched, delay, order=0).astype(np.uint8)

        # Explicit labels keep log_loss defined when the shifted mask
        # happens to contain a single class
        value = log_loss(shifted, speech, labels=[0, 1])
        print('Loss: {:.2f} | Delay: {} | FPS ratio: {:.2f}'.format(value, delay, fps))
        return value
    return loss


def sync(movie, sub, save_to=None):
    '''
    Synchronizes subtitle with movie's audio

    This is a generator: it yields a progress message (str) before each step
    and a final summary message once the synced subtitle has been saved. The
    work only happens while the generator is being iterated.

    Attributes:
        movie       Filename of the media to extract audio from
        sub         Filename of the subtitle to synchronize
        save_to     Export filename for the synced subtitle. Defaults to the
                    movie filename with its extension replaced by '_synced.srt'
    '''
    yield 'Extracting audio...'
    test_audio = fetch_dataset_slices(movie, test_percent=0, store_wav_file=False, globbing=False)

    yield 'Analyzing audio...'
    folder = os.path.dirname(os.path.dirname(__file__))
    model_dir = os.path.join(folder, 'saved_models', 'm17_dataclean_300k_balanced')
    prediction = _predict_speech(test_audio, model_dir)

    # One prediction covers one block of `block` seconds of audio
    block = 0.032
    block_ms = block * 1000
    framerate = 16000
    block_len = int(block_ms * framerate / 1000)

    yield 'Parsing subtitle...'
    speech_prob = prediction[:, 0]
    # Build the subtitle sequence at the audio framerate, then fold it into
    # one row per block so it lines up with the predictions
    sub_prob = sub_to_sequence(sub, fps=framerate, total_length=speech_prob.shape[0] * block_len)
    sub_prob = sub_prob.reshape((sub_prob.shape[0] // block_len, block_len))
    sub_prob = contains_speech(sub_prob, block_ms)
    sub_prob = sub_prob[:, 0]

    # Discretize sub
    sub_prob = (sub_prob > 0).astype(np.float64)
    speech_prob = speech_prob.astype(np.float64)
    loss_fn = _make_alignment_loss(speech_prob, sub_prob)

    yield 'Synchronizing...'

    # Imported explicitly: a bare `import scipy` does not guarantee that the
    # `optimize` submodule is loaded
    from scipy import optimize

    # Brute force match to search for a global minimum. Earlier attempts
    # (kept here as a note): fmin_l_bfgs_b got stuck in local minima, and
    # basinhopping gave random, sometimes inaccurate results.
    # We search +/- 3 minutes with a 3s step
    # (3 minutes in case there is a TV show summary that is not in the subs)
    span_s = 3 * 60
    span = span_s / block
    res = 3.0 / block
    rrange = (slice(-span, span, res), slice(0.9, 1.1, 0.05))
    print('Range: {}'.format(rrange))
    best = optimize.brute(loss_fn, rrange)
    delay, ratio = best

    print('Optimization result (blocks | ratio): {}'.format(best))

    # Convert the delay from blocks to seconds
    delay = delay * block

    print('Sub is off by {}s'.format(delay))
    # NOTE(review): the offset is truncated to whole seconds here -- confirm
    # whether resync_sub accepts a fractional offset
    subs = resync_sub(sub, offset=int(delay), ratio=ratio)

    if save_to is None:
        save_to = os.path.splitext(movie)[0] + '_synced.srt'

    subs.save(save_to, encoding='utf-8')

    msg = (
        "Subtitle synced (delay: {:.2f}s | fps: {:.2f}) and saved.\n"
        "Just open VLC, select it in the sub list and you're all set.\n"
        "File: {}"
    ).format(delay, ratio, save_to)

    yield msg
148

149

Rémi Bèges's avatar
Rémi Bèges committed
150 151
def main(argv=None):
    '''
    Command-line entry point: syncs a subtitle and prints each progress step.

    Attributes:
        argv        Optional list of command-line arguments (without the
                    program name). When None, argparse reads sys.argv[1:].

    Both positional arguments are optional; when omitted, the original
    hard-coded development paths are used, so running without arguments
    behaves as before.
    '''
    import argparse

    parser = argparse.ArgumentParser(
        description="Synchronize a subtitle file with a movie's audio track.")
    parser.add_argument(
        'movie', nargs='?',
        default="C:/Users/remib/Downloads/Game.of.Thrones.Season.6.720p.BluRay.x264.ShAaNiG/Game.of.Thrones.S06E04.720p.BluRay.x264.ShAaNiG.mkv",
        help='media file to extract audio from')
    parser.add_argument(
        'sub', nargs='?',
        default="C:/Users/remib/Downloads/Game.of.Thrones.Season.6.720p.BluRay.x264.ShAaNiG/Game of Thrones - 06x04 - Book of the Stranger.AVS.TRANS.French.HI.C.updated.Addic7ed.com.srt",
        help='subtitle file to synchronize')
    parser.add_argument(
        '--save-to', default=None,
        help='export filename for the synced subtitle')
    args = parser.parse_args(argv)

    for step in sync(args.movie, args.sub, save_to=args.save_to):
        print('Step: ', step)
Rémi Bèges's avatar
Rémi Bèges committed
155 156 157

# Run the synchronization only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()