Class: SmTranscript::SrtReader
- Inherits:
-
Object
- Object
- SmTranscript::SrtReader
- Defined in:
- lib/sm_transcript/srt_reader.rb
Constant Summary collapse
- BASE_DATE =
These constants help calculate timed-text start, end, and duration times. The choice of BASE_DATE is completely arbitrary; it simply allows us to remove the day, month, and year from the calculation.
"1/1/1970"
- BASE_TIME =
Time.parse("#{BASE_DATE} 0:0:0")
Instance Attribute Summary collapse
-
#metadata ⇒ Object
readonly
Returns the value of attribute metadata.
-
#words ⇒ Object
readonly
Returns the value of attribute words.
Class Method Summary collapse
Instance Method Summary collapse
-
#get_millisecs(time_val) ⇒ Object
Convert an RFC 2822 formatted string representing hours, minutes, and seconds into an integer representing the equivalent number of milliseconds on an undefined date.
-
#initialize(src_file) ⇒ SrtReader
constructor
A new instance of SrtReader.
- #parse_metadata ⇒ Object
- #parse_words(src_file) ⇒ Object
Constructor Details
#initialize(src_file) ⇒ SrtReader
Returns a new instance of SrtReader.
28 29 30 31 32 33 |
# File 'lib/sm_transcript/srt_reader.rb', line 28
#
# Builds a reader and immediately parses +src_file+.
#
# Starts with an empty metadata hash and an empty word list, then runs the
# metadata and word parsers in that order.
#
# @param src_file [#each] source that is handed unchanged to #parse_words
def initialize(src_file)
  @metadata = {}
  @words = []
  # NOTE(review): the extracted listing showed only "()" here; the call is
  # presumably parse_metadata, the one zero-argument parser this class defines.
  parse_metadata
  parse_words(src_file)
end
Instance Attribute Details
#metadata ⇒ Object (readonly)
Returns the value of attribute metadata.
15 16 17 |
# File 'lib/sm_transcript/srt_reader.rb', line 15
#
# Read-only accessor for the metadata collected by the reader.
#
# @return [Object] the current value of @metadata
def metadata
  @metadata
end
#words ⇒ Object (readonly)
Returns the value of attribute words.
16 17 18 |
# File 'lib/sm_transcript/srt_reader.rb', line 16
#
# Read-only accessor for the words collected by the reader.
#
# @return [Object] the current value of @words
def words
  @words
end
Class Method Details
.from_file(file_name) ⇒ Object
24 25 26 |
# File 'lib/sm_transcript/srt_reader.rb', line 24
#
# Convenience constructor: opens +file_name+ and builds a reader from it.
#
# The block form of File.open is used so the handle is closed as soon as
# the reader has been built (and also if construction raises), instead of
# being left open until garbage collection.
#
# @param file_name [String] path of the file to read
# @return [Object] whatever +new+ returns for the opened file
def self.from_file(file_name)
  File.open(file_name) { |file| new(file) }
end
Instance Method Details
#get_millisecs(time_val) ⇒ Object
Convert an RFC 2822 formatted string representing hours, minutes, and seconds into an integer representing the equivalent number of milliseconds on an undefined date. Time.parse() returns a Time; the difference between two Times is in seconds.
97 98 99 100 101 102 103 104 105 106 107 108 109 110 |
# File 'lib/sm_transcript/srt_reader.rb', line 97
#
# Converts a timecode string into a whole number of milliseconds.
#
# The timecode is parsed as a time of day on BASE_DATE and the offset from
# BASE_TIME is returned, so the date itself drops out of the result.
#
# @param time_val [#to_s] timecode such as "00:01:02,345", "00:01:02.345"
#   or, without an hours field, "01:02"
# @return [Integer] milliseconds since 00:00:00
# @raise [ArgumentError] if Time.parse cannot interpret the value
def get_millisecs(time_val)
  t = time_val.to_s
  # A value without an hh:mm:ss group is treated as mm:ss, so a zero hours
  # field is prepended before parsing.
  t = "00:#{t}" unless t =~ /(\d\d:\d\d:\d\d([,\.]\d{1,3})?)/
  elapsed_seconds = Time.parse("#{BASE_DATE} #{t}") - BASE_TIME
  # Time#- yields a Float; rounding (not truncating) keeps values such as
  # 1.001 s from collapsing to 1000 ms because of binary representation error.
  (elapsed_seconds * 1000).round
end
#parse_metadata ⇒ Object
35 36 37 |
# File 'lib/sm_transcript/srt_reader.rb', line 35
#
# Intentionally a no-op.
#
# @return [nil]
def parse_metadata
  # there is no metadata in .srt files
end
#parse_words(src_file) ⇒ Object
39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 |
# File 'lib/sm_transcript/srt_reader.rb', line 39
#
# Reads timecoded text blocks from +src_file+ and appends one
# SmTranscript::Word per block to @words.
#
# Each block of timecoded text is made up of two or more lines. The first
# line is a single integer. The second line contains two timecodes
# separated by "-->". Following the timecodes are zero or more lines of
# transcript text. Blocks are separated by a single blank line.
#
# Only the first timecode of a block is kept; all text lines of the block
# are joined with single spaces. A block is emitted only when it has both a
# start time and some text.
#
# @param src_file [#each] yields the input one line at a time; both "\n" and
#   "\r\n" line endings are accepted
# @return [Array] @words, after appending
def parse_words(src_file)
  start_time = ''
  phrase = ''

  # Emits the pending block (if complete) and resets the per-block state.
  # The Word is built as (start in ms, 0, '', text).
  # NOTE(review): the meaning of the 0 and '' arguments is defined by
  # SmTranscript::Word, which is not visible here.
  flush_block = lambda do
    unless start_time.empty? || phrase.empty?
      @words << SmTranscript::Word.new(get_millisecs(start_time), 0, '', phrase)
    end
    phrase = ''
    start_time = ''
  end

  src_file.each do |ln|
    # Drop the line terminator first: with a trailing "\r" neither blank
    # lines nor index lines match the anchored pattern below, so blocks
    # would never be flushed.
    line = ln.chomp

    if line =~ /^(\d{0,3})$/ || (start_time.empty? && line =~ /^\d+$/)
      # Blank separator or block index. Indexes of four or more digits are
      # recognised only while no timecode is pending, so a digits-only text
      # line inside a block is still kept as text.
      flush_block.call
    elsif line =~ /((\d\d:)?\d\d:\d\d([,\.]\d{1,3})?)( --> .*)?/
      # NOTE(review): this pattern is unanchored, so a text line containing
      # something like "10:45" is also taken as a timecode - confirm intended.
      start_time = Regexp.last_match(1)
    elsif line =~ /^([\w0-9',.:\?\(\)\^]+ ?[\w',.:-\?\(\)\^ ]*)/mu
      text = Regexp.last_match(1)
      phrase = phrase.empty? ? text : "#{phrase} #{text}"
    end
  end

  # The last block is usually not followed by a separator line.
  flush_block.call
  @words
end