Class: SmTranscript::TtmlReader

Inherits:

Object

Object
SmTranscript::TtmlReader

show all

Defined in:: lib/sm_transcript/ttml_reader.rb

Constant Summary collapse

BASE_DATE = These constants help calculate timed-text start, end, and duration times. The choice of BASE_DATE is completely arbitrary; it simply allows us to remove the day, month, and year from the calculation.

"1/1/1970"

BASE_TIME =

Time.parse("#{BASE_DATE} 0:0:0")

Instance Attribute Summary collapse

#metadata ⇒ Object readonly

Returns the value of attribute metadata.
#words ⇒ Object readonly

Returns the value of attribute words.

Class Method Summary collapse

.from_file(file_name) ⇒ Object

Instance Method Summary collapse

#get_millisecs(time_val) ⇒ Object

Converts an RFC 2822 formatted string representing hours, minutes, and seconds into an integer representing the equivalent number of milliseconds on an undefined date.
#initialize(src_file) ⇒ TtmlReader constructor

A new instance of TtmlReader.
#parse_metadata(src_file) ⇒ Object
#parse_words(src_file) ⇒ Object

Constructor Details

#initialize(src_file) ⇒ `TtmlReader`

# File 'lib/sm_transcript/ttml_reader.rb', line 26

# Builds a reader and immediately parses the supplied TTML source.
#
# @param src_file [#path] source handed on to parse_metadata and parse_words
def initialize(src_file)
  # Start from empty containers; the parse steps below fill them in.
  @metadata, @words = {}, []

  parse_metadata(src_file)
  parse_words(src_file)
end

Instance Attribute Details

#metadata ⇒ `Object` (readonly)

Returns the value of attribute metadata.



12
13
14

# File 'lib/sm_transcript/ttml_reader.rb', line 12

# Read-only accessor for @metadata.
#
# The constructor sets @metadata to an empty Hash, and parse_metadata does not
# populate it yet.
#
# @return [Object] the current value of @metadata
def metadata
  @metadata
end

#words ⇒ `Object` (readonly)

Returns the value of attribute words.



13
14
15

# File 'lib/sm_transcript/ttml_reader.rb', line 13

# Read-only accessor for @words.
#
# The constructor sets @words to an empty Array, and parse_words appends one
# SmTranscript::Word per phrase (or phrase segment) found in the document.
#
# @return [Object] the current value of @words
def words
  @words
end

Class Method Details

.from_file(file_name) ⇒ `Object`

# File 'lib/sm_transcript/ttml_reader.rb', line 21

# Builds a reader from the TTML file located at file_name.
#
# @param file_name [String] path of the TTML file to read
# @return [Object] whatever +new+ returns for that file (a TtmlReader)
def self.from_file(file_name)
  # Use the block form so the handle is closed as soon as the reader has been
  # constructed; a bare File.open here would leave the descriptor open until
  # garbage collection.
  File.open(file_name) { |src| new(src) }
end

Instance Method Details

#get_millisecs(time_val) ⇒ `Object`

Converts an RFC 2822 formatted string representing hours, minutes, and seconds into an integer representing the equivalent number of milliseconds on an undefined date. Subtracting two Time values yields seconds, so the difference is multiplied by 1000.

# File 'lib/sm_transcript/ttml_reader.rb', line 108

# Converts a timed-text time value into a whole number of milliseconds.
#
# Values that contain a "<digits>ms" offset are read directly as an integer.
# Anything else is parsed as a clock time on BASE_DATE and measured against
# BASE_TIME, so only the hours/minutes/seconds part contributes.
#
# @param time_val [#to_s] attribute value such as "00:01:02.500" or "1500ms";
#   nil is accepted and yields 0
# @return [Integer] number of milliseconds
def get_millisecs(time_val)
  t = time_val.to_s
  return t.to_i if t =~ /\d+ms/

  # Time#- yields a Float number of seconds. Round rather than truncate:
  # 1.001 * 1000 is 1000.9999999999999 in floating point, which to_i would
  # turn into 1000 instead of 1001.
  ((Time.parse("#{BASE_DATE} #{t}") - BASE_TIME) * 1000).round
end

#parse_metadata(src_file) ⇒ `Object`



33
34
35

# File 'lib/sm_transcript/ttml_reader.rb', line 33

# Placeholder for metadata extraction. The body is empty, so @metadata is left
# exactly as the caller set it and the method returns nil.
#
# @param src_file [Object] accepted but not used
# @return [nil]
def parse_metadata(src_file)
  # not currently parsing metadata from ttml files
end

#parse_words(src_file) ⇒ `Object`

# File 'lib/sm_transcript/ttml_reader.rb', line 37

# Reads the TTML document at src_file.path and appends one SmTranscript::Word
# to @words for every phrase found in a <p> element.
#
# Timing comes from the element's "begin", "dur" and "end" attributes (via
# get_millisecs). Whichever of duration / end time is missing is derived from
# the other; when both are missing the phrase gets zero duration.
#
# A <p> whose content is broken up by newlines or <br/> elements is split into
# one Word per line, with the phrase's time span shared out evenly.
#
# @param src_file [#path] object whose #path names the TTML file
# @return [Object] the collection returned by doc.css('p')
def parse_words(src_file)
  # File.open with a block closes the handle once Nokogiri has read it, and,
  # unlike Kernel#open, never interprets a path starting with "|" as a command.
  doc = File.open(src_file.path) { |io| Nokogiri::XML(io) }

  # each word/phrase is the content of a <p> element
  doc.css('p').each do |e|
    start_time = get_millisecs(e.attr("begin"))
    duration   = get_millisecs(e.attr("dur"))
    end_time   = get_millisecs(e.attr("end"))

    if duration > 0
      end_time = start_time + duration unless end_time > 0
    elsif end_time > 0
      duration = end_time - start_time
    else
      # Neither "dur" nor "end" is usable: treat the phrase as instantaneous
      # instead of deriving a negative duration from an end time of 0.
      duration = 0
      end_time = start_time
    end

    segments = phrase_segments(e.inner_html)
    next if segments.empty?

    if segments.length == 1
      # no break point so pass it through
      @words << SmTranscript::Word.new(start_time, end_time, duration, segments.first)
    else
      # We don't know the actual duration of each line, so share the span out
      # evenly. Boundaries are computed from the phrase start (not accumulated)
      # so the final segment ends exactly at start_time + duration even when
      # the duration is not divisible by the number of segments.
      count = segments.length
      segments.each_with_index do |seg, i|
        seg_start = start_time + (duration * i) / count
        seg_end   = start_time + (duration * (i + 1)) / count
        @words << SmTranscript::Word.new(seg_start, seg_end, seg_end - seg_start, seg)
      end
    end
  end
end

# Splits the inner markup of a <p> element into display lines.
#
# Some ttml files have phrases too long for useful karaoke, but they are broken
# up by newlines and <br/> elements, so those are used as split points. Tabs
# become spaces, each line is trimmed, and blank lines are dropped. Newline is
# used as the delimiter so a literal "%" in the caption text is left intact.
#
# @param markup [String] inner HTML of the element
# @return [Array<String>] non-empty, trimmed lines in document order
def phrase_segments(markup)
  text = markup.gsub(/\t/, " ").gsub(/(<\/?br ?\/?>)+/, "\n")
  text.split(/\n/).map { |line| line.strip }.reject { |line| line.empty? }
end
private :phrase_segments

Class: SmTranscript::TtmlReader

Constant Summary collapse

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(src_file) ⇒ TtmlReader

Instance Attribute Details

#metadata ⇒ Object (readonly)

#words ⇒ Object (readonly)

Class Method Details

.from_file(file_name) ⇒ Object