Class: SmTranscript::TtmlReader
- Inherits:
-
Object
- Object
- SmTranscript::TtmlReader
- Defined in:
- lib/sm_transcript/ttml_reader.rb
Constant Summary collapse
- BASE_DATE =
These constants help calculate timed-text start, end, and duration times. The choice of BASE_DATE is completely arbitrary; it simply allows us to remove the day, month, and year from the calculation.
"1/1/1970"
- BASE_TIME =
Time.parse("#{BASE_DATE} 0:0:0")
Instance Attribute Summary collapse
-
#metadata ⇒ Object
readonly
Returns the value of attribute metadata.
-
#words ⇒ Object
readonly
Returns the value of attribute words.
Class Method Summary collapse
Instance Method Summary collapse
-
#get_millisecs(time_val) ⇒ Object
Converts an RFC 2822 formatted string representing hours, minutes, and seconds into an integer representing the equivalent number of milliseconds on an undefined date.
-
#initialize(src_file) ⇒ TtmlReader
constructor
A new instance of TtmlReader.
- #parse_metadata(src_file) ⇒ Object
- #parse_words(src_file) ⇒ Object
Constructor Details
#initialize(src_file) ⇒ TtmlReader
26 27 28 29 30 31 |
# File 'lib/sm_transcript/ttml_reader.rb', line 26

# Creates a reader and immediately parses the given TTML source.
#
# Both collections start out empty; parse_metadata and parse_words are then
# run against the same source object.
#
# @param src_file [#path] source file object; parse_words reads it through
#   its +path+
def initialize(src_file)
  @metadata = {}
  @words = []
  parse_metadata(src_file)
  parse_words(src_file)
end
Instance Attribute Details
#metadata ⇒ Object (readonly)
Returns the value of attribute metadata.
12 13 14 |
# File 'lib/sm_transcript/ttml_reader.rb', line 12

# Read-only accessor for the metadata collection.
#
# @return [Object] the value of the +@metadata+ instance variable (the
#   constructor initializes it to an empty Hash)
def metadata
  @metadata
end
#words ⇒ Object (readonly)
Returns the value of attribute words.
13 14 15 |
# File 'lib/sm_transcript/ttml_reader.rb', line 13

# Read-only accessor for the parsed words.
#
# @return [Object] the value of the +@words+ instance variable (the
#   constructor initializes it to an empty Array that parse_words appends to)
def words
  @words
end
Class Method Details
.from_file(file_name) ⇒ Object
21 22 23 24 |
# File 'lib/sm_transcript/ttml_reader.rb', line 21

# Builds a reader from the TTML file at +file_name+.
#
# The file is opened with the block form of File.open so the handle is closed
# as soon as the reader has been constructed, instead of staying open until it
# is garbage collected. All parsing happens inside the constructor, so nothing
# reads from the handle after the block returns.
#
# @param file_name [String] path of the TTML file to read
# @return [Object] the newly constructed reader (the value returned by +new+)
def self.from_file(file_name)
  File.open(file_name) { |src_file| new(src_file) }
end
Instance Method Details
#get_millisecs(time_val) ⇒ Object
Converts an RFC 2822 formatted string representing hours, minutes, and seconds into an integer representing the equivalent number of milliseconds on an undefined date. Subtracting two Time values yields seconds, hence the multiplication by 1000.
108 109 110 111 112 113 114 |
# File 'lib/sm_transcript/ttml_reader.rb', line 108

# Converts a timed-text time value into an integer number of milliseconds.
#
# Two input forms are handled:
# * a value containing digits followed by "ms" (e.g. "1500ms") is read with
#   String#to_i, i.e. its leading integer is returned as-is;
# * anything else is parsed as a clock time on BASE_DATE and measured against
#   BASE_TIME.
#
# @param time_val [#to_s] the raw attribute value (nil becomes "")
# @return [Integer] the time in milliseconds
def get_millisecs(time_val)
  t = time_val.to_s
  return t.to_i if t =~ /\d+ms/

  # Time#- yields Float seconds. Scaling a value such as 1.001 by 1000 gives
  # 1000.9999999999999, so round to the nearest millisecond instead of
  # truncating with to_i (which would lose a millisecond).
  ((Time.parse("#{BASE_DATE} #{t}") - BASE_TIME) * 1000).round
end
#parse_metadata(src_file) ⇒ Object
33 34 35 |
# File 'lib/sm_transcript/ttml_reader.rb', line 33

# Placeholder for metadata extraction; intentionally does nothing.
#
# @param src_file [Object] the source file object (unused)
# @return [nil]
def parse_metadata(src_file)
  # not currently parsing metadata from ttml files
end
#parse_words(src_file) ⇒ Object
37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 |
# File 'lib/sm_transcript/ttml_reader.rb', line 37

# Reads every <p> element of the TTML document and appends one or more
# SmTranscript::Word entries to @words.
#
# Timing comes from the "begin", "dur" and "end" attributes (converted with
# get_millisecs); whichever of duration / end time is not positive is derived
# from the other two values.
#
# Some ttml files have phrases too long for useful karaoke. Fortunately they
# are broken by newlines and <br/> elements, so each phrase is split at those
# points into smaller chunks.
#
# @param src_file [#path] source file object; the document is read from its path
def parse_words(src_file)
  # File.open with a block closes the handle once the document is parsed.
  # A bare Kernel#open would leave it open and would also interpret a path
  # starting with "|" as a command to run.
  doc = File.open(src_file.path) { |io| Nokogiri::XML(io) }

  # each word/phrase is the content of a <p> element
  doc.css('p').each do |e|
    start_time = get_millisecs(e.attr("begin"))
    duration = get_millisecs(e.attr("dur"))
    end_time = get_millisecs(e.attr("end"))
    duration = end_time - start_time unless duration.to_i > 0
    end_time = start_time + duration unless end_time.to_i > 0

    phrase = normalize_phrase(e.inner_html)
    segments = phrase.split(/%/)
    next if segments.empty?

    if segments.length == 1
      # no break point so pass it through
      @words << SmTranscript::Word.new(start_time, end_time, duration, phrase)
      next
    end

    # We don't know the actual duration of each segment, so share the time
    # evenly. Boundaries are computed from the total so the remainder of the
    # integer division is not lost: the last segment ends exactly at
    # start_time + duration.
    count = segments.length
    segments.each_with_index do |seg, i|
      seg_start = start_time + (duration * i) / count
      seg_end = start_time + (duration * (i + 1)) / count
      @words << SmTranscript::Word.new(seg_start, seg_end, seg_end - seg_start, seg)
    end
  end
end

# Turns the inner markup of a <p> element into text in which "%" marks every
# point where the phrase may be split.
def normalize_phrase(markup)
  # replace tab characters with space
  text = markup.gsub(/\t/, " ")
  # replace <br></br> or <br/> with %
  text = text.gsub(/(<\/?br ?\/?>)+/, "%")
  # replace newline characters with %
  text = text.gsub(/\n/, "%")
  # replace series of % characters with single %
  text = text.gsub(/%+/, "%")
  # remove leading % character
  text = text.gsub(/^\s*%\s*/, "")
  # remove trailing % characters
  text = text.gsub(/\s*%\s*$/, "")
  # remove more trailing % characters
  text.gsub(/%\s*$/, "")
end
private :normalize_phrase