Class: SmTranscript::SrtReader

Inherits:

Object

Object
SmTranscript::SrtReader

show all

Defined in:: lib/sm_transcript/srt_reader.rb

Constant Summary collapse

BASE_DATE = These constants help calculate timed-text start, end, and duration times. The choice of BASE_DATE is completely arbitrary; it simply allows us to remove the day, month, and year from the calculation.

"1/1/1970"

BASE_TIME =

Time.parse("#{BASE_DATE} 0:0:0")

Instance Attribute Summary collapse

#metadata ⇒ Object readonly

Returns the value of attribute metadata.
#words ⇒ Object readonly

Returns the value of attribute words.

Class Method Summary collapse

.from_file(file_name) ⇒ Object

Instance Method Summary collapse

#get_millisecs(time_val) ⇒ Object

Convert an RFC 2822-formatted string representing hours, minutes, and seconds into an integer representing the equivalent number of milliseconds on an undefined date.
#initialize(src_file) ⇒ SrtReader constructor

A new instance of SrtReader.
#parse_metadata ⇒ Object
#parse_words(src_file) ⇒ Object

Constructor Details

#initialize(src_file) ⇒ `SrtReader`

Returns a new instance of SrtReader.

# File 'lib/sm_transcript/srt_reader.rb', line 28

# Builds a reader and immediately parses the given source.
#
# @param src_file [#each] line-oriented source (e.g. an open File) holding
#   the .srt transcript; it is handed straight to #parse_words.
def initialize(src_file)
  @metadata, @words = {}, []
  parse_metadata
  parse_words(src_file)
end

Instance Attribute Details

#metadata ⇒ `Object` (readonly)

Returns the value of attribute metadata.



15
16
17

# File 'lib/sm_transcript/srt_reader.rb', line 15

# Reader for the metadata collected while parsing.
#
# @return [Object] the current value of @metadata
def metadata; @metadata; end

#words ⇒ `Object` (readonly)

Returns the value of attribute words.



16
17
18

# File 'lib/sm_transcript/srt_reader.rb', line 16

# Reader for the words collected while parsing.
#
# @return [Object] the current value of @words
def words; @words; end

Class Method Details

.from_file(file_name) ⇒ `Object`



24
25
26

# File 'lib/sm_transcript/srt_reader.rb', line 24

# Builds a reader from the .srt file at the given path.
#
# The file is opened with the block form of File.open so the handle is
# closed as soon as the constructor has finished reading it, instead of
# being left open until garbage collection.
#
# @param file_name [String] path of the .srt file to read
# @return [Object] whatever +new+ returns for the opened file
def self.from_file(file_name)
  File.open(file_name) { |file| new(file) }
end

Instance Method Details

#get_millisecs(time_val) ⇒ `Object`

Convert an RFC 2822-formatted string representing hours, minutes, and seconds into an integer representing the equivalent number of milliseconds on an undefined date. Time.parse() returns a Time; subtracting two Time values yields seconds.

# File 'lib/sm_transcript/srt_reader.rb', line 97

# Converts a timecode string into a whole number of milliseconds.
#
# The value is expected to contain "HH:MM:SS", optionally followed by "," or
# "." and one to three fractional digits. Anything that does not contain
# three colon-separated two-digit fields (e.g. "MM:SS") is prefixed with
# "00:" so that it is read as zero hours. BASE_DATE only anchors the value
# to a day so Time.parse can read it; subtracting BASE_TIME leaves the
# offset into that day.
#
# @param time_val [#to_s] the timecode to convert
# @return [Integer] the equivalent number of milliseconds
def get_millisecs(time_val)
  t = time_val.to_s
  t = "00:#{t}" unless t =~ /(\d\d:\d\d:\d\d([,\.]\d{1,3})?)/
  # Time#- yields a Float number of seconds, so scaling can land just below
  # the intended value (1.001 * 1000 => 1000.9999999999999). Round instead
  # of truncating so that millisecond is not lost.
  ((Time.parse("#{BASE_DATE} #{t}") - BASE_TIME) * 1000).round
end

#parse_metadata ⇒ `Object`



35
36
37

# File 'lib/sm_transcript/srt_reader.rb', line 35

# Intentionally does nothing: .srt files carry no metadata.
#
# @return [nil]
def parse_metadata
  nil
end

#parse_words(src_file) ⇒ `Object`

# File 'lib/sm_transcript/srt_reader.rb', line 39

# Reads timecoded text blocks from +src_file+ and appends one
# SmTranscript::Word per block to @words.
#
# Each block is made up of two or more lines: a line holding a single
# integer (the block index), a line holding two timecodes separated by
# "-->", then zero or more lines of transcript text. Blocks are separated
# by a blank line.
#
# Only the starting timecode is used. Every Word is created with the start
# time in milliseconds, 0 and '' for the second and third arguments
# (NOTE(review): presumably end time/duration and speaker - confirm against
# SmTranscript::Word), and the block's text lines joined with single spaces.
# A block with no timecode or no recognised text produces no Word.
#
# @param src_file [#each] yields the source one line at a time
def parse_words(src_file)
  start_time = ''
  phrase = ''

  # Appends the pending block to @words; incomplete blocks are skipped.
  flush_block = lambda do
    next if start_time.empty? || phrase.empty?

    @words << SmTranscript::Word.new(get_millisecs(start_time), 0, '', phrase)
  end

  src_file.each do |ln|
    # Drop the line terminator first so "\r\n" files classify the same way
    # as "\n" files.
    line = ln.chomp

    case line
    when /\A\d*\z/
      # A blank line or a block index ends the current block. The index may
      # have any number of digits, so files with 1000+ blocks do not leak
      # the index into the text.
      flush_block.call
      phrase = ''
      start_time = ''
    when /((\d\d:)?\d\d:\d\d([,\.]\d{1,3})?)( --> .*)?/
      # Timecode line: keep only the first (starting) timecode.
      start_time = Regexp.last_match(1)
    when /^([\w0-9',.:\?\(\)\^]+ ?[\w',.:-\?\(\)\^ ]*)/mu
      # Text line: accumulate the recognised leading run of characters.
      text = Regexp.last_match(1)
      phrase = phrase.empty? ? text : "#{phrase} #{text}"
    end
  end

  # The last block is not followed by a separator, so flush it explicitly.
  flush_block.call
end

Class: SmTranscript::SrtReader

Constant Summary collapse

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(src_file) ⇒ SrtReader

Instance Attribute Details

#metadata ⇒ Object (readonly)

#words ⇒ Object (readonly)

Class Method Details

.from_file(file_name) ⇒ Object