Class: Debugher::Debugger

Inherits:
Object
  • Object
show all
Defined in:
lib/debugher.rb

Constant Summary collapse

# File extensions (including the leading dot) that count as music files.
# Used as the default +file_types+ of music_from_feed and music_from_html,
# which compare the last four characters of a URL against this list.
# Frozen so the shared default cannot be mutated by accident.
FILE_TYPES = ['.mp3', '.m4a', '.MP3'].freeze

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(url) ⇒ Debugger

pass a url as a string to initialize



14
15
16
17
18
19
20
# File 'lib/debugher.rb', line 14

# Build a Debugger for +url+ and fetch the page straight away.
#
# Arguments:
#   url: (String) address to inspect; "http://" is prepended when the
#        string does not already parse as an HTTP or HTTPS URI.
#
# Side effects: switches $stdout to sync mode and calls open_url, storing
# its result in @opened_url.
def initialize(url)
  $stdout.sync = true
  # URI::HTTPS is a subclass of URI::HTTP, so is_a? accepts both schemes.
  # An exact class comparison treats "https://..." as scheme-less and
  # produces "http://https://...".
  has_http_scheme = URI.parse(url).is_a?(URI::HTTP)
  @url = has_http_scheme ? url : "http://#{url}"
  @uri = URI.parse(@url)
  @opened_url = open_url
end

Instance Attribute Details

#urlObject

Returns the value of attribute url.



11
12
13
# File 'lib/debugher.rb', line 11

# Reader for @url, which initialize sets to the address supplied by the
# caller (with "http://" prepended when it did not parse as an HTTP URI).
def url
  @url
end

Class Method Details

.get_soundcloud_url(url) ⇒ Object

Extract the URL element of a soundcloud embed in order to grab the link to the track.

Example:

>> Debugger.get_soundcloud_url("https://w.soundcloud.com/player/?url=http%3A%2F%2Fapi.soundcloud.com%2Ftracks%2F59422468")
=> http://api.soundcloud.com/tracks/59422468

Arguments:

url: (String)


435
436
437
438
439
440
441
442
443
444
445
446
447
# File 'lib/debugher.rb', line 435

# Extract the track URL carried in the "url" query parameter of a
# Soundcloud embed link.
#
# Arguments:
#   url: (String) the embed/player URL
#
# Returns the unescaped track URL (String) when Debugger.soundcloud_url?
# accepts it, otherwise nil. Any error raised while parsing (nil input, no
# query string, no "url" parameter, ...) is rescued: a message is written to
# $stderr and nil is returned.
def self.get_soundcloud_url(url)
  begin
    query = URI.parse(url).query
    # Match the parameter name exactly ("url="). A bare three-character
    # prefix test would also pick up parameters such as "urlx=..." and then
    # slice the wrong part of the string.
    url_param = query.split("&").find { |param| param.start_with?("url=") }
    new_url = CGI.unescape(url_param[4..-1])

    return new_url if Debugger.soundcloud_url?(new_url)
  rescue
    $stderr.puts "Bad URL - Soundcloud URL's don't cause errors so safe to assume it's not a Soundcloud link."
  end
end

.mailto_link?(url) ⇒ Boolean

Check if a string is a mailto link

Example:

>> Debugger.mailto_link?("mailto:[email protected]")
=> true

Arguments:

url: (String)

Returns:

  • (Boolean)


423
424
425
# File 'lib/debugher.rb', line 423

# Check if a string is a mailto link.
#
# Arguments:
#   url: (String) link target; nil is treated as "not a mailto link"
#
# Returns true when the string starts with the "mailto:" scheme (matched
# case-insensitively), false otherwise.
def self.mailto_link?(url)
  # Require the colon so relative paths such as "mailto-form.html" are not
  # mistaken for mail links.
  !(url.to_s =~ /\Amailto:/i).nil?
end

.make_absolute(url, base_url = nil) ⇒ Object

Make a URL absolute

Example:

>> Debugger.make_absolute("/about", "http://wearepandr.com")
=> http://wearepandr.com/about

Arguments:

url: (String)
base_url: (String)


383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
# File 'lib/debugher.rb', line 383

# Make a URL absolute.
#
# Arguments:
#   url: (String) the link to resolve; returned untouched when
#        Debugger.relative? says it is not relative
#   base_url: (String) site the link belongs to. Note that it is passed to
#             Debugger.new, whose initializer fetches the page.
#
# Returns the stitched absolute URL (String), the original +url+ when it was
# not relative, or nil when resolution fails (the failure is reported on
# $stderr and $stdout).
def self.make_absolute(url, base_url=nil)
  return url unless Debugger.relative?(url)

  begin
    if !base_url.nil?
      base_url = Debugger.new(base_url).canonical_url
    else
      # NOTE(review): canonical_url is called here without a receiver inside
      # a class method; the visible source only defines it as an instance
      # method, so this branch presumably always ends up in the rescue
      # below — confirm.
      base_url = canonical_url
    end

    Debugger.stitch_to_make_absolute(base_url, url)
  rescue StandardError => e
    # Log while +url+ still holds the offending link.
    $stderr.puts "Debugger Error: #{url} - #{e}"
    puts "ERROR: Could not make this URL absolute. Set to nil."
    nil
  end
end

.relative?(url) ⇒ Boolean

Check if a URL is relative or not

Example:

>> Debugger.relative?("http://wearepandr.com")
=> false

Arguments:

url: (String)

Returns:

  • (Boolean)


365
366
367
368
369
370
371
372
# File 'lib/debugher.rb', line 365

# Check if a URL is relative or not.
#
# Arguments:
#   url: (String)
#
# Returns whatever Addressable::URI#relative? reports for the parsed URL,
# or false when parsing or the check raises.
def self.relative?(url)
  begin
    # A local is enough; nothing needs the parsed URI afterwards, so no
    # class-level state is kept between calls.
    Addressable::URI.parse(url).relative?
  rescue
    false
  end
end

.soundcloud_url?(url) ⇒ Boolean

Check if a string is a Soundcloud URL

Example:

>> Debugger.soundcloud_url?("http://api.soundcloud.com/tracks/59422468")
=> true

Arguments:

url: (String)

Returns:

  • (Boolean)


457
458
459
# File 'lib/debugher.rb', line 457

# Check if a string is a Soundcloud API URL.
#
# Arguments:
#   url: (String) must respond to include?; nil raises NoMethodError
#
# Returns true when the string contains "api.soundcloud.com" anywhere,
# false otherwise.
def self.soundcloud_url?(url)
  url.include?("api.soundcloud.com")
end

.stitch_to_make_absolute(canonical_url, path) ⇒ Object

Stitch two strings together to make a single absolute url

Example:

>> Debugger.stitch_to_make_absolute("http://wearepandr.com/", "/about")
=> http://wearepandr.com/about

Arguments:

canonical_url: (String)
path: (String)


411
412
413
# File 'lib/debugher.rb', line 411

# Stitch two strings together to make a single absolute url.
#
# Arguments:
#   canonical_url: (String) site root, with or without a trailing slash
#   path: (String) path to append, with or without a leading slash
#
# Returns the joined URL (String) with exactly one slash between the two
# parts. An empty +path+ yields the base without a trailing slash.
def self.stitch_to_make_absolute(canonical_url, path)
  base = canonical_url.chomp("/")
  return base + path if path.empty? || path.start_with?("/")

  # Without this separator "feed.xml" would be glued straight onto the host.
  "#{base}/#{path}"
end

.user_agent(ua = "Rakkit") ⇒ Object

Get the user agent

Example:

>> Debugger.user_agent("PANDR")
=> PANDR/V0.1

Arguments:

ua: (String)


344
345
346
# File 'lib/debugher.rb', line 344

# Build the user agent string.
#
# Arguments:
#   ua: (String) product name, "Rakkit" by default
#
# Returns "<ua>/V<Debugher::VERSION>".
def self.user_agent(ua="Rakkit")
  "#{ua}/V#{Debugher::VERSION}"
end

.valid_url?(url) ⇒ Boolean

Check if a url is a valid url

Example:

>> Debugger.valid_url?("http://wearepandr.com")
=> true

Arguments:

url: (String)

Returns:

  • (Boolean)


469
470
471
# File 'lib/debugher.rb', line 469

# Check if a string contains something that matches the generic URI
# pattern.
#
# Arguments:
#   url: (String)
#
# Returns true or false. The pattern is applied with =~ and is not
# anchored, so a match anywhere in the string counts.
def self.valid_url?(url)
  match_position = (url =~ URI::regexp)
  match_position ? true : false
end

.versionObject

Get the current version

Example:

>> Debugger.version
=> V0.1


353
354
355
# File 'lib/debugher.rb', line 353

# Get the current version.
#
# Returns Debugher::VERSION prefixed with "V".
def self.version
  "V#{Debugher::VERSION}"
end

Instance Method Details

#atom_feed_urlObject

Get the Atom Feed URL

Example:

>> Debugger.new("http://wearepandr.com").atom_feed_url
=> http://wearepandr.com/feed


99
100
101
102
103
104
105
# File 'lib/debugher.rb', line 99

# Get the Atom feed URL advertised by the page.
#
# Looks for link elements of type application/atom+xml and uses the href of
# the first one. A relative href is stitched onto canonical_url.
#
# Returns the URL as a String, or "" when the page has no such link.
def atom_feed_url
  link_nodes = page.search("link[@type='application/atom+xml']")
  href = nil
  href = link_nodes.first['href'] unless link_nodes.length == 0

  if Debugger.relative?(href)
    href = Debugger.stitch_to_make_absolute(canonical_url, href)
  end

  href.to_s
end

#canonical_urlObject

Get the canonical url of the page

Example:

>> Debugger.new("http://rakkit.com").canonical_url
=> http://rakkit.com/


63
64
65
66
67
68
69
70
71
72
73
74
# File 'lib/debugher.rb', line 63

# Get the canonical (root) url of the page: scheme and host of @uri with
# the path and query removed, followed by "/".
#
# Returns the URL as a String, or nil when building it fails (the error and
# @uri are printed to $stdout).
def canonical_url
  begin
    # Work on a copy. Assigning path/query on @uri itself would change the
    # object every other method reads after the first call.
    canonical_uri = @uri.dup
    canonical_uri.path  = ''
    canonical_uri.query = nil
    (canonical_uri + "/").to_s
  rescue StandardError => e
    puts "CANONICAL ERROR: #{e}"
    puts @uri.inspect.to_s
    nil
  end
end

#charsetObject

Get the page's charset

Example:

>> Debugger.new("http://wearepandr.com").charset
=> utf-8


314
315
316
# File 'lib/debugher.rb', line 314

# Get the page's charset.
#
# Delegates to @opened_url, the value initialize stored from open_url.
# open_url returns nil when @robot.allowed? is false or the fetch raises;
# in that case this call raises NoMethodError.
def charset
  @opened_url.charset
end

#content_encodingObject

Get the page's content encoding

Example:

>> Debugger.new("http://wearepandr.com").content_encoding
=> []


323
324
325
# File 'lib/debugher.rb', line 323

# Get the page's content encoding.
#
# Delegates to @opened_url, the value initialize stored from open_url.
# open_url returns nil when @robot.allowed? is false or the fetch raises;
# in that case this call raises NoMethodError.
def content_encoding
  @opened_url.content_encoding
end

#content_typeObject

Get the page's content type

Example:

>> Debugger.new("http://wearepandr.com").content_type
=> text/html


305
306
307
# File 'lib/debugher.rb', line 305

# Get the page's content type.
#
# Delegates to @opened_url, the value initialize stored from open_url.
# open_url returns nil when @robot.allowed? is false or the fetch raises;
# in that case this call raises NoMethodError.
def content_type
  @opened_url.content_type
end

#descriptionObject

Get the page description

Example:

>> Debugger.new("http://wearepandr.com").description
=> A custom Web Design Norwich and Norwich Ruby on Rails Web Development agency based in Norfolk, UK


158
159
160
161
162
# File 'lib/debugher.rb', line 158

# Get the page description from the content attribute of the
# meta[name='description'] element.
#
# Returns the stripped text (String), or nil when it is empty.
def description
  content = page.css("meta[name='description']/@content").inner_html.strip
  return nil if content == ''

  content
end

#feed_urlObject

Get the FEED URL, no matter if it’s the Atom URL or the RSS URL

Example:

>> Debugger.new("http://wearepandr.com").feed_url
=> http://wearepandr.com/feed


112
113
114
115
116
117
118
119
120
121
122
123
124
125
# File 'lib/debugher.rb', line 112

# Get the feed URL, preferring the RSS feed and falling back to the Atom
# feed.
#
# Returns the feed URL (String), made absolute against canonical_url when
# Debugger.relative? reports it as relative, or nil when the page
# advertises neither feed.
def feed_url
  # Each lookup searches the page, so evaluate each at most once.
  rss = rss_feed_url
  candidate = rss != '' ? rss : atom_feed_url
  return nil if candidate == ''

  if Debugger.relative?(candidate)
    Debugger.stitch_to_make_absolute(canonical_url, candidate)
  else
    candidate
  end
end

#fetched_urlObject

Return the fetched URL

Example:

>> Debugger.new("rakkit.com").fetched_url
=> http://rakkit.com


54
55
56
# File 'lib/debugher.rb', line 54

# Return the fetched URL.
#
# Returns the String form of @uri, the URI that initialize parsed from
# @url.
def fetched_url
  @uri.to_s
end

#hostObject

Get the host of the page

Example:

>> Debugger.new("http://wearepandr.com").host
=> wearepandr.com


296
297
298
# File 'lib/debugher.rb', line 296

# Get the host of the page.
#
# Parses @uri with Addressable::URI and returns the host of the result.
def host
  Addressable::URI.parse(@uri).host
end

Get the internal page links from the page

Example:

>> Debugger.new("http://wearepandr.com").internal_links
=> ["http://wearepandr.com/about", "http://wearepandr.com/blog"]


250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
# File 'lib/debugher.rb', line 250

# Get the internal page links from the page.
#
# Every anchor returned by page_links is stripped of its "#fragment",
# skipped when it is blank or a mailto link, made absolute when it is
# relative, and kept only when its host equals the host of @uri.
#
# Returns an Array of unique absolute URL Strings.
def internal_links
  current_host = @uri.host
  base_url = nil
  links = []

  page_links.each do |link|
    href = link['href']
    next if href.nil?

    # Remove anchors from links
    new_link = href.split("#")[0]
    next if new_link.nil? || new_link.strip.empty? || Debugger.mailto_link?(new_link)

    if Debugger.relative?(new_link)
      # Resolve against this instance's own canonical url. Going through
      # Debugger.make_absolute without a base cannot work here, because a
      # class method has no page to take the base from.
      base_url ||= canonical_url
      next if base_url.nil?

      new_link = Debugger.stitch_to_make_absolute(base_url, new_link)
    end

    # Check to see if the URL is still from the current site
    links << new_link if current_host == Addressable::URI.parse(new_link).host
  end

  links.uniq.compact
end

#last_modifiedObject

Get the page's last modified date

Example:

>> Debugger.new("http://wearepandr.com").last_modified
=>


332
333
334
# File 'lib/debugher.rb', line 332

# Get the page's last modified date.
#
# Delegates to @opened_url, the value initialize stored from open_url.
# open_url returns nil when @robot.allowed? is false or the fetch raises;
# in that case this call raises NoMethodError.
def last_modified
  @opened_url.last_modified
end

#meta_dataObject

Get the page meta data in a hash, title and description.

Example:

>> Debugger.new("http://wearepandr.com").meta_data
=> {:title => "Web Design Norwich and Norwich Ruby on Rails Web Development in Norfolk |  PANDR",

:description => "A custom Web Design Norwich and Norwich Ruby on Rails Web Development agency based in Norfolk, UK"}



170
171
172
173
# File 'lib/debugher.rb', line 170

# Get the page meta data in a hash.
#
# Returns a Hash with the keys :title and :description, filled from the
# title and description methods.
def meta_data
  return {:title => title,
          :description => description}
end

#music_from_feed(file_types = FILE_TYPES) ⇒ Object

Get the music links from the feed found on the page

Example:

>> Debugger.new("http://wearepandr.com").music_from_feed
=> ["http://wearepandr.com/track_1.mp3", "http://wearepandr.com/track_2.mp3", "http://wearepandr.com/track_3.mp3"]

Arguments:

file_types: [Array]


183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
# File 'lib/debugher.rb', line 183

# Get the music links from the feed found on the page.
#
# Arguments:
#   file_types: [Array] four-character extensions (dot included) that count
#               as music; compared with the last four characters of each
#               enclosure url
#
# Returns an Array of unique enclosure URLs. It is empty when the page has
# no feed or the feed has no channel element.
def music_from_feed(file_types=FILE_TYPES)
  # feed_url searches the page, so look it up only once.
  source_url = feed_url
  return [] if source_url.nil?

  @feed ||= Nokogiri::XML(open(source_url))
  @feed.encoding = 'utf-8'

  # Feeds without a channel are not searched at all; no fallback exists.
  return [] if @feed.search('//channel').empty?

  links = []
  # The XPath is absolute, so one search over the document already returns
  # every enclosure. Repeating it per item only produced duplicates.
  @feed.search("//channel/item/enclosure").each do |enclosure|
    enclosure_url = enclosure['url']
    links << enclosure_url if file_types.include?(enclosure_url.to_s[-4, 4])
  end

  links.uniq.compact
end

#music_from_html(file_types = FILE_TYPES) ⇒ Object

Get the music links from the page html

Example:

>> Debugger.new("http://wearepandr.com").music_from_html
=> ["http://wearepandr.com/track_1.mp3", "http://wearepandr.com/track_2.mp3", "http://wearepandr.com/track_3.mp3"]

Arguments:

file_types: [Array]


215
216
217
218
219
220
221
222
223
224
225
226
227
# File 'lib/debugher.rb', line 215

# Get the music links from the page html.
#
# Arguments:
#   file_types: [Array] four-character extensions (dot included) that count
#               as music; compared with the last four characters of each
#               href
#
# Returns an Array of unique hrefs, in page order.
def music_from_html(file_types=FILE_TYPES)
  hrefs = page_links.map { |track| track['href'] }
  tracks = hrefs.select { |href| file_types.include?(href.to_s[-4, 4]) }
  tracks.uniq.compact
end

#music_from_soundcloudObject

Get the soundcloud music links from the page html

Example:

>> Debugger.new("http://wearepandr.com").music_from_soundcloud
=> ["http://api.soundcloud.com/playlists/2153957", "http://api.soundcloud.com/playlists/2153958"]


234
235
236
237
238
239
240
241
242
243
# File 'lib/debugher.rb', line 234

# Get the soundcloud music links from the page html.
#
# Every iframe and param element is inspected. Its src (or, failing that,
# value) attribute is handed to Debugger.get_soundcloud_url.
#
# Returns an Array of the unique non-nil results.
def music_from_soundcloud
  @html_url ||= Nokogiri::HTML(open(@uri))
  embeds = @html_url.search("//iframe", "//param")
  tracks = embeds.map do |node|
    Debugger.get_soundcloud_url(node["src"] || node["value"])
  end
  tracks.uniq.compact
end

#open_urlObject



22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
# File 'lib/debugher.rb', line 22

# Fetch @uri, provided @robot.allowed? permits it.
#
# Stores a Robots instance for the user agent in @robot.
#
# Returns whatever open returns for the request, or nil when fetching is
# not allowed or the request raises (the error is reported on $stderr).
def open_url
  url_object = nil
  ua = Debugger.user_agent
  @robot = Robots.new(ua)
  if @robot.allowed?(@uri)
    begin
      # NOTE(review): the From value looks like an e-mail obfuscation
      # placeholder rather than a real address — confirm against the
      # original source.
      url_object = open(@uri,
                   "User-Agent" => ua,
                   "From" => "[email protected]",
                   "Referer" => "http://rakkit.com")
    rescue StandardError => e
      # Most likely a 404 error. Only StandardError is rescued so that
      # Interrupt and SystemExit still stop the process.
      $stderr.puts "Unable to open url: #{url} - #{e}"
    end
  end
  return url_object
end

#pageObject

Loads the Nokogiri HTML document if it hasn’t already been loaded



77
78
79
# File 'lib/debugher.rb', line 77

# Parsed HTML document of the page.
#
# Builds a Nokogiri::HTML document from @opened_url on first use and
# memoises it in @page.
def page
  @page ||= Nokogiri::HTML(@opened_url)
end

Get all the links from the page

Example:

>> Debugger.new("http://wearepandr.com").page_links
=> ["http://wearepandr.com/about", "http://google.com", "http://yahoo.com"]


284
285
286
287
288
289
# File 'lib/debugher.rb', line 284

# Get all the links from the page.
#
# The document is loaded by opening @uri on first use and cached in
# @html_url.
# NOTE(review): this opens @uri directly rather than reusing @opened_url,
# so it does not go through the @robot.allowed? check or the headers set
# in open_url — confirm this is intended.
#
# Returns the result of searching the document for "//a".
def page_links
  @html_url = Nokogiri::HTML(open(@uri)) unless @html_url
  @html_url.search("//a")
end

#response_codeObject

Get the response code of the page

Example:

>> Debugger.new("http://rakkit.com").response_code
=> 200 OK


45
46
47
# File 'lib/debugher.rb', line 45

# Get the response code of the page.
#
# Joins the elements of @opened_url.status with a space (the documented
# example is "200 OK"). @opened_url comes from open_url, which returns nil
# when @robot.allowed? is false or the fetch raises; in that case this call
# raises NoMethodError.
def response_code
  @opened_url.status.join(" ")
end

#rss_feed_urlObject

Get the RSS Feed URL

Example:

>> Debugger.new("http://wearepandr.com").rss_feed_url
=> http://wearepandr.com/feed


86
87
88
89
90
91
92
# File 'lib/debugher.rb', line 86

# Get the RSS feed URL advertised by the page.
#
# Looks for link elements of type application/rss+xml and uses the href of
# the first one. A relative href is stitched onto canonical_url.
#
# Returns the URL as a String, or "" when the page has no such link.
def rss_feed_url
  link_nodes = page.search("link[@type='application/rss+xml']")
  href = nil
  href = link_nodes.first['href'] unless link_nodes.length == 0

  if Debugger.relative?(href)
    href = Debugger.stitch_to_make_absolute(canonical_url, href)
  end

  href.to_s
end

#scrape_infoObject

Return some meta info about the page

Example:

>> Debugger.new("http://wearepandr.com").scrape_info
=> {:response_code => "200 OK",
    :fetched_url => "http://wearepandr.com",
    :canonical_url => "http://wearepandr.com/",
    :feed_url => "http://wearepandr.com/feed"}


135
136
137
138
139
140
# File 'lib/debugher.rb', line 135

# Return some meta info about the page.
#
# Returns a Hash with the keys :response_code, :fetched_url,
# :canonical_url and :feed_url, each filled from the method of the same
# name and evaluated in that order.
def scrape_info
  info = {}
  info[:response_code] = response_code
  info[:fetched_url]   = fetched_url
  info[:canonical_url] = canonical_url
  info[:feed_url]      = feed_url
  info
end

#titleObject

Get the page title

Example:

>> Debugger.new("http://wearepandr.com").title
=> Web Design Norwich and Norwich Ruby on Rails Web Development in Norfolk |  PANDR


147
148
149
150
151
# File 'lib/debugher.rb', line 147

# Get the page title.
#
# Returns the stripped inner HTML of the first title element (String), or
# nil when the page has no title element or the title is blank.
def title
  title_node = page.css('title')[0]
  # Pages without a <title> are reported as "no title" instead of raising.
  return nil if title_node.nil?

  text = title_node.inner_html.strip
  text == '' ? nil : text
end