Class: SmTranscript::TtmlReader
- Inherits:
-
Object
- Object
- SmTranscript::TtmlReader
- Defined in:
- lib/sm_transcript/ttml_reader.rb
Constant Summary collapse
- BASE_DATE =
These constants help calculate timed-text start, end, and duration times. The choice of BASE_DATE is completely arbitrary; it simply allows us to remove the day, month, and year from the calculation.
"1/1/1970"
- BASE_TIME =
Time.parse("#{BASE_DATE} 0:0:0")
Instance Attribute Summary collapse
-
#metadata ⇒ Object
readonly
Returns the value of attribute metadata.
-
#words ⇒ Object
readonly
Returns the value of attribute words.
Class Method Summary collapse
Instance Method Summary collapse
-
#get_millisecs(time_val) ⇒ Object
Convert an RFC 2822 formatted string representing hours, minutes, and seconds into an integer representing the equivalent number of milliseconds on an undefined date.
-
#initialize(src_file) ⇒ TtmlReader
constructor
A new instance of TtmlReader.
- #parse_metadata(src_file) ⇒ Object
- #parse_words(src_file) ⇒ Object
Constructor Details
#initialize(src_file) ⇒ TtmlReader
Returns a new instance of TtmlReader.
28 29 30 31 32 33 |
# File 'lib/sm_transcript/ttml_reader.rb', line 28
# Creates a reader and immediately parses +src_file+.
#
# @param src_file [#path] object handed to parse_metadata and parse_words;
#   parse_words reads the document located at +src_file.path+.
def initialize(src_file)
  @metadata = {}
  @words = []
  parse_metadata(src_file)
  parse_words(src_file)
end
Instance Attribute Details
#metadata ⇒ Object (readonly)
Returns the value of attribute metadata.
14 15 16 |
# File 'lib/sm_transcript/ttml_reader.rb', line 14
# Read-only accessor for the metadata attribute.
#
# @return [Object] the value of the @metadata instance variable.
def metadata
  @metadata
end
#words ⇒ Object (readonly)
Returns the value of attribute words.
15 16 17 |
# File 'lib/sm_transcript/ttml_reader.rb', line 15
# Read-only accessor for the words attribute.
#
# @return [Object] the value of the @words instance variable
#   (parse_words appends SmTranscript::Word objects to it).
def words
  @words
end
Class Method Details
.from_file(file_name) ⇒ Object
23 24 25 26 |
# File 'lib/sm_transcript/ttml_reader.rb', line 23
# Builds a reader from the file at +file_name+.
#
# The file is opened with the block form of File.open so the handle is
# closed as soon as the reader has been constructed (the constructor does
# all of its parsing up front), instead of being left open until GC.
#
# @param file_name [String] path of the TTML file to read.
# @return [TtmlReader] the new reader.
def self.from_file(file_name)
  File.open(file_name) { |file| new(file) }
end
Instance Method Details
#get_millisecs(time_val) ⇒ Object
Convert an RFC 2822 formatted string representing hours, minutes, and seconds into an integer representing the equivalent number of milliseconds on an undefined date. Subtracting two Time values yields seconds, so the difference is multiplied by 1000.
110 111 112 113 114 115 116 |
# File 'lib/sm_transcript/ttml_reader.rb', line 110
# Converts a timed-text time expression into whole milliseconds.
#
# A value that already carries an "ms" suffix (e.g. "250ms") is returned as
# its leading integer. Anything else is parsed as a clock time on BASE_DATE
# and measured against BASE_TIME.
#
# @param time_val [#to_s, nil] attribute value such as "00:01:02.5" or "250ms".
# @return [Integer] milliseconds; 0 when the value is nil or blank.
def get_millisecs(time_val)
  t = time_val.to_s
  # A missing attribute arrives as nil; it counts as zero milliseconds.
  return 0 if t.strip.empty?
  return t.to_i if t =~ /\d+ms/

  # Time - Time is a Float number of seconds. Round rather than truncate,
  # otherwise e.g. 1.001 s becomes 1000.9999999999999 ms and then 1000.
  ((Time.parse("#{BASE_DATE} #{t}") - BASE_TIME) * 1000).round
end
#parse_metadata(src_file) ⇒ Object
35 36 37 |
# File 'lib/sm_transcript/ttml_reader.rb', line 35
# Placeholder hook called by the constructor; it intentionally does nothing.
#
# @param src_file [Object] ignored.
# @return [nil]
def parse_metadata(src_file)
  # not currently parsing metadata from ttml files
end
#parse_words(src_file) ⇒ Object
39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 |
# File 'lib/sm_transcript/ttml_reader.rb', line 39
# Reads every <p> element of the document at +src_file.path+ and appends one
# or more SmTranscript::Word objects to @words for it.
#
# @param src_file [#path] object whose +path+ names the TTML document.
def parse_words(src_file)
  # File.open with a block closes the handle once parsing is done. Unlike
  # Kernel#open it also never treats a path beginning with "|" as a command.
  doc = File.open(src_file.path) { |io| Nokogiri::XML(io) }

  # each word/phrase is the content of a <p> element
  doc.css('p').each do |e|
    start_time = get_millisecs(e.attr("begin"))
    duration = get_millisecs(e.attr("dur"))
    end_time = get_millisecs(e.attr("end"))
    # a cue may give either "dur" or "end"; derive whichever is missing
    duration = end_time - start_time unless duration.to_i > 0
    end_time = start_time + duration unless end_time.to_i > 0

    phrase = e.inner_html
      .gsub(/\t/, " ")                    # replace tab characters with space
      .gsub(%r{(</?br ?/?>)+}, "%")       # replace <br></br> or <br/> with %
      .gsub(/\n/, "%")                    # replace newline characters with %
      .gsub(/%+/, "%")                    # replace series of % with single %
      .gsub(/^\s*%\s*/, "")               # remove leading % character
      .gsub(/\s*%\s*$/, "")               # remove trailing % characters
      .gsub(/%\s*$/, "")                  # remove more trailing % characters

    # Split phrase at delimiter to create smaller phrases. Why? Some
    # ttml files have phrases too long for useful karaoke. Fortunately
    # they're broken by newlines and <br/> elements. So we take advantage
    # of this to split the phrases into smaller chunks.
    arr = phrase.split(/%/)
    next if arr.empty?

    if arr.length == 1
      # no break point so pass it through
      @words << SmTranscript::Word.new(start_time, end_time, duration, phrase)
      next
    end

    # We don't know the actual duration of each chunk, so share the cue's
    # duration evenly. Boundaries are computed proportionally so the
    # remainder of the integer division is not lost and the last chunk ends
    # exactly at start_time + duration.
    count = arr.length
    arr.each_with_index do |seg, idx|
      seg_start = start_time + (duration * idx) / count
      seg_end = start_time + (duration * (idx + 1)) / count
      @words << SmTranscript::Word.new(seg_start, seg_end, seg_end - seg_start, seg)
    end
  end
end