sync.py 5.73 KB
Newer Older
1
2
3
4
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
from scipy.signal import medfilt
Rémi Bèges's avatar
Rémi Bèges committed
5
6
from scipy.ndimage.interpolation import shift
from scipy.ndimage import zoom
7
8
from sklearn.metrics import log_loss
import scipy
Rémi Bèges's avatar
Rémi Bèges committed
9
import warnings
10

Rémi Bèges's avatar
Rémi Bèges committed
11
12
from autosync.dataset import *
from autosync.utils import *
13

14
15
16
17
18
19
20
21
22
def sync(movie, sub, save_to=None):
    '''
    Synchronizes subtitle with movie's audio.

    This is a generator: it yields a short status string before each stage
    (audio extraction, model inference, subtitle parsing, optimization) and
    finally a summary message once the synced subtitle has been saved.

    Attributes:
        movie       Filename of the media to extract audio from
        sub         Filename of the subtitle to synchronize
        save_to     Export filename for the synced subtitle. When None, the
                    result is saved next to `movie`, using the movie filename
                    without its extension plus '_synced.srt'.

    Yields:
        Progress messages (str); the last one is the final summary.
    '''
    yield 'Extracting audio...'
    test_audio = fetch_dataset_slices(movie, test_percent=0, store_wav_file=False, globbing=False)

    yield 'Analyzing audio...'
    # The saved model is looked up in the parent of this file's directory.
    folder = os.path.dirname(os.path.dirname(__file__))
    graph = tf.Graph()
    with graph.as_default():
        with tf.Session() as sess:
            meta_graph_def = tf.saved_model.loader.load(
                sess,
                [tf.saved_model.tag_constants.SERVING],
                os.path.join(folder, 'saved_models', 'm17_dataclean_300k_balanced'),
            )
            signature = meta_graph_def.signature_def
            signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
            print('Model signature:', signature)
            x_tensor_name = signature[signature_key].inputs['audio_mfcc'].name
            y_tensor_name = signature[signature_key].outputs['has_speech'].name

            x = sess.graph.get_tensor_by_name(x_tensor_name)
            y = sess.graph.get_tensor_by_name(y_tensor_name)

            result = sess.run(y, feed_dict={
                x: test_audio,
            })

    # One model output row covers `block` seconds of audio, i.e. `block_len`
    # samples at `framerate` Hz.
    block = 0.032
    block_ms = block * 1000
    framerate = 16000
    block_len = int(block_ms * framerate / 1000)

    yield 'Parsing subtitle...'
    speech_prob = result[:, 0]
    # Build the subtitle sequence with the same total length as the audio
    # prediction, then regroup it into rows of `block_len` samples so it lines
    # up one-to-one with `speech_prob`.
    sub_prob = sub_to_sequence(sub, fps=framerate, total_length=speech_prob.shape[0] * block_len)
    sub_prob = sub_prob.reshape((sub_prob.shape[0] // block_len, block_len))
    sub_prob = contains_speech(sub_prob, block_ms)
    sub_prob = sub_prob[:, 0]

    # Discretize sub: anything strictly positive becomes 1, the rest 0
    sub_prob[sub_prob > 0] = 1
    sub_prob[sub_prob < 1] = 0
    sub_prob = sub_prob.astype(np.float64)
    speech_prob = speech_prob.astype(np.float64)

    def loss_fn_factory(speech, sub):
        '''
        Builds the objective minimized by the optimizer.

        The returned function takes `(delay, fps)` where `delay` is a shift in
        blocks and `fps` a stretch ratio, applies both to `sub` and returns the
        log loss between the transformed subtitle labels and `speech`.
        '''
        def loss(params):
            delay, fps = params
            # scipy passes floats; the shift is applied in whole blocks
            delay = int(delay)

            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                # Hypothesis is 1st order performs better than default 3rd order
                sub_delayed = zoom(sub, fps, order=1)
                # Rounding avoids truncation when converting to uint8 below
                # and results in a more accurate result
                sub_delayed = np.round(sub_delayed)

            # Zoom has reduced array size. Pad with 0s at the end
            if sub_delayed.shape[0] < sub.shape[0]:
                diff = sub.shape[0] - sub_delayed.shape[0]
                sub_delayed = np.pad(sub_delayed, (0, diff), 'constant')
            # Zoom has extended array size. Crop the end
            elif sub_delayed.shape[0] > sub.shape[0]:
                sub_delayed = sub_delayed[:sub.shape[0]]

            sub_delayed = shift(sub_delayed, delay)
            # shift() interpolates with splines by default, so a label of 1 can
            # come back as 0.9999...; casting that straight to uint8 would
            # truncate it to 0. Round first, and clip so that only the labels
            # 0 and 1 can ever reach log_loss.
            sub_delayed = np.clip(np.round(sub_delayed), 0, 1).astype(np.uint8)

            # Passing the labels explicitly keeps log_loss from raising when a
            # candidate alignment leaves a single class in `sub_delayed`.
            loss_value = log_loss(sub_delayed, speech, labels=[0, 1])
            print('Loss: {:.2f} | Delay: {} | FPS ratio: {:.2f}'.format(loss_value, delay, fps))
            return loss_value
        return loss

    loss_fn = loss_fn_factory(speech_prob, sub_prob)

    yield 'Synchronizing...'

    # Brute force match to search for a global minimum.
    # Alternatives that were tried and discarded:
    #   - fmin_l_bfgs_b can be stuck in local minima, which makes the quality
    #     of the result highly dependent on the initial conditions
    #   - basinhopping gives random, sometimes inaccurate results
    #
    # The delay is searched over +/- 3 minutes with a 3s step
    # (3 minutes in case there is a TV show summary that is not in the subs).
    # Both values are converted from seconds to blocks. The FPS ratio is
    # searched between 0.9 and 1.1 with a 0.05 step.
    span_s = 3 * 60
    span = span_s / block
    res = 3.0 / block
    rrange = (slice(-span, span, res), slice(0.9, 1.1, 0.05))
    print('Range: {}'.format(rrange))
    result = scipy.optimize.brute(loss_fn, rrange)
    delay, ratio = result

    print('Optimization result (blocks | ratio): {}'.format(result))

    # Convert the delay from blocks to seconds
    delay *= block

    print('Sub is off by {}s'.format(delay))
    # NOTE(review): int() drops the fractional part of the delay; presumably
    # resync_sub expects whole seconds - confirm against autosync.utils.
    subs = resync_sub(sub, offset=int(delay), ratio=ratio)

    if save_to is None:
        filename, _ = os.path.splitext(movie)
        save_to = filename + '_synced.srt'

    subs.save(save_to, encoding='utf-8')

    msg = (
        "Subtitle synced (delay: {:.2f}s | fps: {:.2f}) and saved.\n"
        "Just open VLC, select it in the sub list and you're all set.\n"
        "File: {}"
    ).format(delay, ratio, save_to)

    yield msg
148

149

Rémi Bèges's avatar
Rémi Bèges committed
150
151
def main():
    '''
    Runs `sync` on a hard-coded movie/subtitle pair and prints every
    progress message it yields.
    '''
    movie_path = "C:/Users/remib/Downloads/Game.of.Thrones.Season.6.720p.BluRay.x264.ShAaNiG/Game.of.Thrones.S06E04.720p.BluRay.x264.ShAaNiG.mkv"
    subtitle_path = "C:/Users/remib/Downloads/Game.of.Thrones.Season.6.720p.BluRay.x264.ShAaNiG/Game of Thrones - 06x04 - Book of the Stranger.AVS.TRANS.French.HI.C.updated.Addic7ed.com.srt"

    # sync() is a generator: iterating over it is what drives the whole process
    progress = sync(movie_path, subtitle_path)
    for message in progress:
        print('Step: ', message)
Rémi Bèges's avatar
Rémi Bèges committed
155
156
157

if __name__ == '__main__':
    main()