Class: SmTranscript::SrtReader

Inherits:
Object
  • Object
show all
Defined in:
lib/sm_transcript/srt_reader.rb

Constant Summary collapse

BASE_DATE =

These constants help calculate timed-text start, end, and duration times. The choice of BASE_DATE is completely arbitrary; it simply allows us to remove the day, month, and year from the calculation.

"1/1/1970"
BASE_TIME =
Time.parse("#{BASE_DATE} 0:0:0")

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(src_file) ⇒ SrtReader

Returns a new instance of SrtReader.



28
29
30
31
32
33
# File 'lib/sm_transcript/srt_reader.rb', line 28

# Builds a reader and parses +src_file+ straight away.
#
# @metadata starts as an empty Hash and @words as an empty Array; the
# parse_metadata and parse_words steps then run in that order, so the
# instance is fully populated by the time the constructor returns.
#
# src_file - the line source handed unchanged to parse_words.
def initialize(src_file)
  @metadata = {}
  @words = []
  parse_metadata
  parse_words(src_file)
end

Instance Attribute Details

#metadata ⇒ Object (readonly)

Returns the value of attribute metadata.



15
16
17
# File 'lib/sm_transcript/srt_reader.rb', line 15

# Read-only accessor for the metadata collected by the reader.
#
# Returns the @metadata instance variable, which the constructor
# initialises to an empty Hash.
def metadata
  @metadata
end

#words ⇒ Object (readonly)

Returns the value of attribute words.



16
17
18
# File 'lib/sm_transcript/srt_reader.rb', line 16

# Read-only accessor for the parsed transcript entries.
#
# Returns the @words instance variable, which the constructor initialises
# to an empty Array and parse_words appends to.
def words
  @words
end

Class Method Details

.from_file(file_name) ⇒ Object



24
25
26
# File 'lib/sm_transcript/srt_reader.rb', line 24

# Builds a reader from the file at +file_name+.
#
# The file is opened with the block form of File.open so the handle is
# closed as soon as construction finishes (or raises).  That is safe
# because the constructor parses its source immediately and keeps only the
# parsed results.
#
# file_name - path of the .srt file to read.
#
# Returns the new reader instance.
def self.from_file(file_name)
  File.open(file_name) { |src_file| new(src_file) }
end

Instance Method Details

#get_millisecs(time_val) ⇒ Object

Convert an RFC 2822 formatted string representing hours, minutes, and seconds into an integer representing the equivalent number of milliseconds on an undefined date. Time.parse() returns seconds.



97
98
99
100
101
102
103
104
105
106
107
108
109
110
# File 'lib/sm_transcript/srt_reader.rb', line 97

# Converts a timecode into whole milliseconds since midnight.
#
# time_val - anything whose #to_s yields "hh:mm:ss", "hh:mm:ss,fff",
#            "hh:mm:ss.fff" or the same without the hours field.
#
# The value is parsed as a clock time on BASE_DATE and BASE_TIME (midnight
# of that date) is subtracted, which leaves the elapsed seconds as a Float.
#
# Returns an Integer number of milliseconds.
def get_millisecs(time_val)
  t = time_val.to_s
  # Without an hours field Time.parse would read "mm:ss" as "hh:mm", so a
  # zero hour is prepended to anything that is not already hh:mm:ss.
  t = "00:#{t}" if t !~ /(\d\d:\d\d:\d\d([,\.]\d{1,3})?)/
  elapsed_seconds = Time.parse("#{BASE_DATE} #{t}") - BASE_TIME
  # Round rather than truncate: 1.001 * 1000 is 1000.9999999999999 in
  # floating point, and to_i would report it as 1000 ms.
  (elapsed_seconds * 1000).round
end

#parse_metadata ⇒ Object



35
36
37
# File 'lib/sm_transcript/srt_reader.rb', line 35

# Metadata step of the reader.  Intentionally empty: there is no metadata
# in .srt files, so @metadata is left exactly as the constructor set it.
#
# Returns nil.
def parse_metadata
  # there is no metadata in .srt files
end

#parse_words(src_file) ⇒ Object



39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
# File 'lib/sm_transcript/srt_reader.rb', line 39

# Reads SubRip (.srt) cues from +src_file+ and appends one
# SmTranscript::Word per cue to @words.
#
# A cue is expected to look like this:
#
#   12                               <- cue index (digits only)
#   00:00:01,000 --> 00:00:02,500    <- start and end timecodes
#   zero or more lines of text
#                                    <- a blank line ends the cue
#
# Only the first timecode on the timecode line is kept.  Text lines are
# joined with single spaces, and on each text line only the leading run of
# characters accepted by the text pattern is used.  A cue is stored only
# when it has both a start time and some text.
#
# src_file - any object whose #each yields the lines of the file as
#            Strings (for example a File or an Array of Strings).  Both LF
#            and CRLF line endings are accepted.
def parse_words(src_file)
  start_time = ''
  phrase = ''

  src_file.each do |raw_line|
    # Strip the terminator first; otherwise the "\r" of a CRLF file stops
    # the anchored patterns below from matching and cues never separate.
    ln = raw_line.chomp

    case ln
    when /^\d{0,3}$/
      # Blank separator or a cue index of up to three digits: close the
      # pending cue and start afresh.
      store_srt_cue(start_time, phrase)
      phrase = ''
      start_time = ''
    when /^\d+$/
      # Four or more digits.  With nothing pending this is the index of cue
      # 1000 or later and must not leak into the caption text; inside a cue
      # it is kept as ordinary text.
      next if start_time.empty? && phrase.empty?

      phrase = phrase.empty? ? ln : "#{phrase} #{ln}"
    when /((\d\d:)?\d\d:\d\d([,\.]\d{1,3})?)( --> .*)?/
      start_time = Regexp.last_match(1)
    when /^([\w0-9',.:\?\(\)\^]+ ?[\w',.:-\?\(\)\^ ]*)/mu
      text = Regexp.last_match(1)
      phrase = phrase.empty? ? text : "#{phrase} #{text}"
    end
  end

  # The final cue is normally not followed by a separator line.
  store_srt_cue(start_time, phrase)
end

# Internal helper for parse_words: appends a Word for the pending cue unless
# either the start time or the text is missing.  The second and third
# constructor arguments are always 0 and '' here (presumably duration and
# speaker -- confirm against SmTranscript::Word).
def store_srt_cue(start_time, phrase)
  return if start_time.empty? || phrase.empty?

  @words << SmTranscript::Word.new(get_millisecs(start_time), 0, '', phrase)
end