Module: SpeechToText::Util

Included in:
AmazonS2T, GoogleS2T, IbmWatsonS2T, MozillaDeepspeechS2T, SpeechmaticsS2T
Defined in:
lib/speech_to_text/util.rb

Overview

rubocop:disable Style/Documentation

Class Method Summary collapse

Class Method Details

.captions_json(file_path:, file_name:, localeName:, locale:) ⇒ Object

rubocop:disable Naming/UncommunicativeMethodParamName



96
97
98
99
100
101
102
103
104
105
106
107
# File 'lib/speech_to_text/util.rb', line 96

def self.captions_json(file_path:,
                       file_name:,
                       # rubocop:disable Naming/VariableName
                       localeName:,
                       # rubocop:enable Naming/VariableName
                       locale:)
  captions_file_name = "#{file_path}/#{file_name}"
  captions_file = File.open(captions_file_name, 'w')
  line = "[{\"localeName\": \"#{localeName}\", \"locale\": \"#{locale}\"}]"
  captions_file.puts line
  captions_file.close
end

.recording_json(file_path:, record_id:, timestamp:, language:) ⇒ Object

rubocop:disable Metrics/MethodLength



111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# File 'lib/speech_to_text/util.rb', line 111

def self.recording_json(file_path:,
                        record_id:,
                        timestamp:,
                        language:)
  filename = "#{file_path}/#{record_id}-#{timestamp}-track.json"
  file = File.open(filename, 'w')
  file.puts '{'
  file.puts "\"record_id\": \"#{record_id}\","
  file.puts '"kind": "subtitles",'
  file.puts "\"lang\": \"#{language}\","
  file.puts '"label": "English",'
  file.puts "\"original_filename\": \"caption_#{language}.vtt\","
  file.puts "\"temp_filename\": \"#{record_id}-#{timestamp}-track.txt\""
  file.puts '}'
  file.close
end

.seconds_to_timestamp(number) ⇒ Object

function to convert the time to a timestamp rubocop:disable Metrics/MethodLength



17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
# File 'lib/speech_to_text/util.rb', line 17

def self.seconds_to_timestamp(number) # rubocop:disable Metrics/AbcSize
  hh = (number / 3600).floor
  number = number % 3600
  mm = (number / 60).floor
  ss = (number % 60).round(3)
  ss = "0#{ss}" if ss < 10
  parts = ss.to_s.split('.')
  if parts.length > 1
    1.upto(3 - parts[1].length) { parts[1] = parts[1].concat('0') }
    ss = "#{parts[0]}.#{parts[1]}"
  else
    ss = parts[0].concat('.000')
  end
  mm = "0#{mm}" if mm < 10
  hh = "0#{hh}" if hh < 10
  "#{hh}:#{mm}:#{ss}"
end

.video_to_audio(video_file_path:, video_name:, video_content_type:, audio_file_path:, audio_name:, audio_content_type:, **duration) ⇒ Object

rubocop:enable Metrics/MethodLength def video_to_audio rubocop:disable Metrics/ParameterLists



131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
# File 'lib/speech_to_text/util.rb', line 131

def self.video_to_audio(video_file_path:,
                        video_name:,
                        video_content_type:,
                        audio_file_path:,
                        audio_name:,
                        audio_content_type:,
                        **duration)
  # rubocop:enable Metrics/ParameterLists
  video_to_audio_command = ''
  if duration.empty?
    video_to_audio_command = "ffmpeg -y -i #{video_file_path}/#{video_name}.#{video_content_type} -ac 1 -ar 16000 #{audio_file_path}/#{audio_name}.#{audio_content_type}"
  elsif duration[:start_time].nil? && duration[:end_time] != nil
    video_to_audio_command = "ffmpeg -y -ss #{0.to_i} -i #{video_file_path}/#{video_name}.#{video_content_type} -t #{duration[:end_time]} -ac 1 -ar 16000 #{audio_file_path}/#{audio_name}.#{audio_content_type}"
  elsif duration[:end_time].nil? && duration[:start_time] != nil
    video_to_audio_command = "ffmpeg -y -ss #{duration[:start_time]} -i #{video_file_path}/#{video_name}.#{video_content_type} -ac 1 -ar 16000 #{audio_file_path}/#{audio_name}.#{audio_content_type}"
  else
    video_to_audio_command = "ffmpeg -y -t #{duration[:end_time]} -i #{video_file_path}/#{video_name}.#{video_content_type} -ss #{duration[:start_time]} -ac 1 -ar 16000 #{audio_file_path}/#{audio_name}.#{audio_content_type}"
  end  

    Open3.popen2e(video_to_audio_command) do |stdin, stdout_err, wait_thr|
      while line = stdout_err.gets
        puts "#{line}"
      end

      exit_status = wait_thr.value
      unless exit_status.success?
        puts '---------------------'
        puts "FAILED to execute --> #{video_to_audio_command}"
        puts '---------------------'
      end
    end

    #Open3.popen3(video_to_audio_command.to_s) do |stdin, stdout, stderr, wait_thr|
    #  puts "stdout is:" + stdout.read
    #  puts "stderr is:" + stderr.read
    #end
end

.write_to_webvtt(vtt_file_path:, vtt_file_name:, text_array:, start_time:) ⇒ Object

rubocop:enable Metrics/MethodLength create and write the webvtt file rubocop:disable Metrics/MethodLength



37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# File 'lib/speech_to_text/util.rb', line 37

def self.write_to_webvtt(vtt_file_path:, # rubocop:disable Metrics/AbcSize
                         vtt_file_name:,
                         text_array:,
                         start_time:)
  # Array format 
  # text_array = [start_timestamp, end_timestamp, word, start_time, end_time, word, ...]
  
  # if we cut first few minutes from the audio then 
  # start time will be replaced instead of 0
  start_time = start_time.to_i 

  filename = "#{vtt_file_path}/#{vtt_file_name}"
  file = File.open(filename, 'w')
  file.print "WEBVTT"

  i = block_number = 0
  
  #all the words are at position [2,5,8,11...]
  word_index = 2  

  # one block will give total 10 words on screen at a time
  # which contains total 30 index 
  # each word has 3 indexes in text_array [start_timestamp, end_timestamp, word,...]
  block_size = 30

  # each block contains 10 words index range o to 29
  # last end time will be at index = 28
  end_timestamp = 28

  # we need new lines after every 5 words so 6th word will be at index = 17 (6*3 - 1) 
  line_space_index = 17 

  while i < text_array.length
   
    if i%3 == word_index  #if index has word then print word
      if i%block_size == line_space_index # if this is 6th word then print new line
        file.puts
      end
      file.print "#{text_array[i]} "
    elsif i%block_size == 0  #if index is 0,30,60... means starting a new block
      block_number += 1
      file.puts "\n\n"
      file.puts block_number  #print block number 
      file.print "#{seconds_to_timestamp(text_array[i] + start_time)} "  #print start timestamps
      if i + end_timestamp < text_array.length  # End timestamp will be at 28th index in block of 30 indexes (10 words)
        file.puts "--> #{seconds_to_timestamp(text_array[i+end_timestamp] + start_time)}"
      else  # For last block, there will not be total 30 indexes, so end timestamp will be second last index
        file.puts "--> #{seconds_to_timestamp(text_array[text_array.length - 2] + start_time)}"
      end
    else          
    end
    i += 1
  end

  file.close
end