Class: SmTranscript::TtmlReader

Inherits:
Object
  • Object
show all
Defined in:
lib/sm_transcript/ttml_reader.rb

Constant Summary collapse

BASE_DATE =

These constants help calculate timed-text start, end, and duration times. The choice of BASE_DATE is completely arbitrary; it simply allows us to remove the day, month, and year from the calculation.

"1/1/1970"
BASE_TIME =
Time.parse("#{BASE_DATE} 0:0:0")

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(src_file) ⇒ TtmlReader

Returns a new instance of TtmlReader.



28
29
30
31
32
33
# File 'lib/sm_transcript/ttml_reader.rb', line 28

# Builds a reader and immediately parses +src_file+.
#
# Starts with an empty metadata hash and an empty word list, then runs the
# metadata pass followed by the word pass over the same source.
#
# @param src_file [#path] handle for the TTML source; it is passed unchanged
#   to parse_metadata and parse_words
def initialize(src_file)
  @metadata = {}
  @words = []
  parse_metadata(src_file)
  parse_words(src_file)
end

Instance Attribute Details

#metadata ⇒ Object (readonly)

Returns the value of attribute metadata.



14
15
16
# File 'lib/sm_transcript/ttml_reader.rb', line 14

# Returns the value of attribute metadata.
#
# @return [Object] whatever is currently stored in @metadata
def metadata
  @metadata
end

#words ⇒ Object (readonly)

Returns the value of attribute words.



15
16
17
# File 'lib/sm_transcript/ttml_reader.rb', line 15

# Returns the value of attribute words.
#
# @return [Object] whatever is currently stored in @words
def words
  @words
end

Class Method Details

.from_file(file_name) ⇒ Object



23
24
25
26
# File 'lib/sm_transcript/ttml_reader.rb', line 23

# Builds a reader from the file at +file_name+.
#
# The file is opened with the block form of File.open so the handle is
# closed as soon as the reader has been constructed, instead of being left
# open until garbage collection.
#
# @param file_name [String] path of the TTML file to read
# @return [Object] the value returned by +new+
def self.from_file(file_name)
  File.open(file_name) { |src_file| new(src_file) }
end

Instance Method Details

#get_millisecs(time_val) ⇒ Object

Convert an RFC 2822-formatted string representing hours, minutes, and seconds into an integer representing the equivalent number of milliseconds on an undefined date. Time.parse() returns a Time; subtracting two Time values yields seconds, hence the multiplication by 1000.



110
111
112
113
114
115
116
# File 'lib/sm_transcript/ttml_reader.rb', line 110

# Converts a timed-text time expression into a whole number of milliseconds.
#
# Two families of input are understood:
# * offset times made of a number and a unit ("500ms", "2.5s", "3m", "1h");
# * clock times ("hh:mm:ss" with optional fraction), which are parsed on
#   BASE_DATE and measured against BASE_TIME.
#
# Results are rounded to the nearest millisecond. The original code
# truncated, so float error turned "0:0:1.001" into 1000 instead of 1001.
#
# @param time_val [#to_s] the attribute value; nil is treated as ""
# @return [Integer] milliseconds
def get_millisecs(time_val)
  t = time_val.to_s.strip

  # Offset time: exact rational arithmetic avoids float error entirely.
  offset = t.match(/\A(\d+(?:\.\d+)?)(ms|s|m|h)\z/)
  if offset
    per_unit = { "ms" => 1, "s" => 1_000, "m" => 60_000, "h" => 3_600_000 }
    return (Rational(offset[1]) * per_unit[offset[2]]).round
  end

  # Any other value that mentions "<digits>ms" keeps the historical
  # behaviour of taking its leading integer.
  return t.to_i unless t.match(/\d+ms/).nil?

  # Time - Time yields Float seconds; round rather than truncate.
  ((Time.parse("#{BASE_DATE} #{t}") - BASE_TIME) * 1000).round
end

#parse_metadata(src_file) ⇒ Object



35
36
37
# File 'lib/sm_transcript/ttml_reader.rb', line 35

# Placeholder for metadata extraction; it currently does nothing and
# returns nil.
#
# @param src_file [Object] accepted for symmetry with parse_words; unused
def parse_metadata(src_file)
  # not currently parsing metadata from ttml files
end

#parse_words(src_file) ⇒ Object



39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# File 'lib/sm_transcript/ttml_reader.rb', line 39

# Parses the TTML document at +src_file.path+ and appends one
# SmTranscript::Word to @words for every phrase found in its <p> elements.
#
# Timing comes from the element's "begin", "dur" and "end" attributes (via
# get_millisecs); whichever of duration / end time is not positive is
# derived from the other two values.
#
# @param src_file [#path] object whose #path names the TTML file
def parse_words(src_file)
  # File.open with a block closes the handle once Nokogiri has read the
  # document and, unlike Kernel#open, never treats a path that starts with
  # "|" as a command to run.
  doc = File.open(src_file.path) { |io| Nokogiri::XML(io) }

  # each word/phrase is the content of a <p> element
  doc.css('p').each do |e|
    start_time = get_millisecs(e.attr("begin"))
    duration   = get_millisecs(e.attr("dur"))
    end_time   = get_millisecs(e.attr("end"))

    duration = end_time - start_time unless duration.to_i > 0
    end_time = start_time + duration unless end_time.to_i > 0

    # replace tab characters with space
    phrase = e.inner_html.gsub(/\t/, " ")

    # replace <br></br> or <br/> with %
    phrase = phrase.gsub(/(<\/?br ?\/?>)+/, "%")

    # replace newline characters with %
    phrase = phrase.gsub(/\n/, "%")

    # replace series of % characters with single %
    phrase = phrase.gsub(/%+/, "%")

    # remove leading % character
    phrase = phrase.gsub(/^\s*%\s*/, "")

    # remove trailing % characters
    phrase = phrase.gsub(/\s*%\s*$/, "")

    # remove more trailing % characters
    phrase = phrase.gsub(/%\s*$/, "")

    # Split phrase at delimiter to create smaller phrases.  Why? Some
    # ttml files have phrases too long for useful karaoke.  Fortunately
    # they're broken by newlines and <br/> elements.  So we take advantage
    # of this to split the phrases into smaller chunks.
    arr = phrase.split(/%/)

    # nothing left after cleanup, so there is no word to record
    next if arr.empty?

    if arr.length == 1
      # no break point so pass it through
      @words << SmTranscript::Word.new(start_time, end_time, duration, phrase)
    else
      # we don't know the actual duration, so make each equal.
      # Integer division: any remainder of the total duration is dropped.
      phrase_duration = duration / arr.length
      seg_start_time = start_time
      arr.each do |seg|
        seg_end_time = seg_start_time + phrase_duration
        @words << SmTranscript::Word.new(seg_start_time, seg_end_time, phrase_duration, seg)
        seg_start_time = seg_end_time
      end
    end
  end
end