Class: YoutubeTranscript2020

Inherits:
Object
  • Object
show all
Defined in:
lib/youtube_transcript2020.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(id = nil, debug: false) ⇒ YoutubeTranscript2020

Returns a new instance of YoutubeTranscript2020.



15
16
17
18
19
20
21
22
23
24
25
26
27
28
# File 'lib/youtube_transcript2020.rb', line 15

# Builds a transcript object and, when an id is supplied, downloads the
# English timed-text document for that video.
#
# @param id [String, nil] a video id, or a URL starting with http:// or
#   https:// (URLs are converted with YoutubeID.from). When nil, nothing is
#   fetched.
# @param debug [Boolean] enables the diagnostic `puts` output used by other
#   methods of this class.
def initialize(id=nil, debug: false)

  # Record the flag before the guard clause so that an object created
  # without an id (e.g. to call #import later) still honours debug: true.
  @debug = debug

  return unless id

  @id = id =~ %r{https?://} ? YoutubeID.from(id) : id

  s = Net::HTTP.get(URI("http://video.google.com/timedtext?lang=en&v=#{@id}"))
  # An empty response body is left unparsed, so @s stays unset in that case.
  @s = parse(s) unless s.empty?

  fetch_info(@id)

end

Instance Attribute Details

#authorObject (readonly)

Returns the value of attribute author.



13
14
15
# File 'lib/youtube_transcript2020.rb', line 13

# Reader for the @author instance variable.
#
# @return [Object, nil] the current value of @author (nil when unset)
def author; @author; end

#idObject (readonly)

Returns the value of attribute id.



13
14
15
# File 'lib/youtube_transcript2020.rb', line 13

# Reader for the @id instance variable.
#
# @return [Object, nil] the current value of @id (nil when unset)
def id; @id; end

#titleObject (readonly)

Returns the value of attribute title.



13
14
15
# File 'lib/youtube_transcript2020.rb', line 13

# Reader for the @title instance variable.
#
# @return [Object, nil] the current value of @title (nil when unset)
def title; @title; end

#to_aObject (readonly)

Returns the value of attribute to_a.



13
14
15
# File 'lib/youtube_transcript2020.rb', line 13

# Reader for the @to_a instance variable.
#
# @return [Object, nil] the current value of @to_a (nil when unset)
def to_a; @to_a; end

Instance Method Details

#import(obj) ⇒ Object

reads a plain text transcript which has been modified to include headings



48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# File 'lib/youtube_transcript2020.rb', line 48

# Reads a plain text transcript which has been modified to include headings.
#
# The text is obtained from the first element returned by
# RXFHelper.read(obj). If it contains a rule of dashes, the part before the
# rule is parsed with SimpleConfig to set @id, @author and @title, and the
# part after it becomes @s. Otherwise the whole text is treated as a raw
# transcript and @s is rebuilt with join_sentences.
#
# In both cases @a is set to an array of [timestamp_line, text_line] pairs.
#
# @param obj [Object] whatever RXFHelper.read accepts
#   (NOTE(review): presumably a file path, URL or the text itself - confirm)
# @return [Object, nil] the value of join_sentences for a raw transcript,
#   otherwise nil
def import(obj)

  s = RXFHelper.read(obj).first

  if s =~ /------+/
    header, body = s.split(/-----+/, 2)

    h = SimpleConfig.new(header).to_h
    @id, @author, @title = h[:id], h[:author], h[:title]
    @s = body
    raw_transcript = false
  else
    # Use the text that was actually read; obj itself may only be a
    # location (e.g. a file name) rather than the transcript.
    body = s
    raw_transcript = true
  end

  puts 'body: ' + body[0..400] if @debug

  # Blank lines are dropped first: they carry neither a timestamp nor text,
  # and leaving them in shifts every timestamp/text pairing made by zip.
  rows = body.lines.map(&:chomp).reject { |line| line.strip.empty? }
  timestamps, text = rows.partition { |line| line =~ /\d+:\d+/ }
  @a = timestamps.zip(text)

  @s = join_sentences(@a) if raw_transcript

end

#to_headingsObject

Outputs plain text containing the headings, including timestamps. Note: this can be helpful for copying and pasting directly into a YouTube comment.



119
120
121
122
123
# File 'lib/youtube_transcript2020.rb', line 119

# Collects the first element of every @to_a entry and keeps those that
# contain a space character.
#
# NOTE(review): presumably a timestamp with a space also carries heading
# text (e.g. "0:00 Intro") - confirm against the code that fills @to_a.
#
# @return [Array] the matching first elements, in their original order
def to_headings

  first_columns = @to_a.map(&:first)
  first_columns.select { |timestamp| timestamp =~ / / }

end

#to_htmlObject

Outputs HTML containing the embedded video and transcription



73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
# File 'lib/youtube_transcript2020.rb', line 73

# Outputs HTML containing the embedded video and transcription.
#
# Every [timestamp, text] pair in @a becomes a list item whose link points
# at the embed URL with a start offset in seconds and targets the frame
# named 'video'. @html_embed is parsed with Rexle and its root element is
# given name='video' so that those links can address it.
#
# NOTE(review): @title and the transcript text are inserted without HTML
# escaping - confirm whether they can contain untrusted markup.
#
# @return [String] a complete HTML document
def to_html

  url = 'https://www.youtube.com/embed/' + @id

  links = @a.map do |timestamp, s|

    seconds = Subunit.new({minutes: 60, hours: 60},
                          timestamp.split(':').map(&:to_i)).to_i
    format("<li><a href='%s?start=%s&autoplay=1' target='video'>%s</a>" \
           "<p>%s</p></li> ", url, seconds, timestamp, s)
  end

  puts '@html_embed: ' + @html_embed.inspect if @debug
  doc = Rexle.new(@html_embed.to_s)
  puts 'before attributes' if @debug
  doc.root.attributes[:name] = 'video'
  embed = doc.xml(declaration: false)
  puts 'embed: ' + embed.inspect if @debug

  # A heredoc keeps the template readable and, unlike the previous escaped
  # "\#{...}" string, actually interpolates embed, @title and links.
  <<~HTML
    <!DOCTYPE html>
    <html lang="en">
    <head>
      <title></title>
      <meta charset="utf-8" />
    </head>
    <body>
    <div style="width: 1080px; background: white">
    <div style="float:left; width: 580px; background: white">
    #{embed}
    <h1>#{@title}</h1>
    </div>
    <div style="float:right; width: 500px; overflow-y: scroll; height: 400px">
    <ul>#{links.join("\n")}</ul>
    </div>

    </div>
    </body>
    </html>
  HTML
end

#to_keywords(level: 2) ⇒ Object

Returns a Hash object containing the frequency of each word. level: 2 ignores common words, including stop words; level: 3 ignores dictionary words.



129
130
131
# File 'lib/youtube_transcript2020.rb', line 129

# Feeds the plain transcript text (see #to_text) into Yawc and returns the
# result of Yawc#to_h.
#
# @param level [Integer] passed straight through to Yawc.new as level:
# @return [Object] whatever Yawc#to_h returns
#   (NOTE(review): presumably a word => frequency Hash - confirm with Yawc)
def to_keywords(level: 2)
  plain_text = to_text
  counter = Yawc.new(plain_text, level: level)
  counter.to_h
end

#to_sObject

returns the transcript in plain text including timestamps



36
37
38
39
40
# File 'lib/youtube_transcript2020.rb', line 36

def to_s()

  h = {id: @id, title: @title, author: @author}
  SimpleConfig.new(h).to_s + "\n#{'-'*78}\n\n" + @s
end

#to_textObject



42
43
44
# File 'lib/youtube_transcript2020.rb', line 42

# Joins the last element of every @a entry with newlines.
#
# @return [String] the last elements of the @a entries, one per line
def to_text
  text_column = @a.map { |pair| pair.last }
  text_column.join("\n")
end