Class: Wgit::Url

Inherits:

String

Object
String
Wgit::Url

show all

Defined in: lib/wgit/url.rb

Overview

Class modeling a web based URL. Can be an internal (relative) link e.g. “about.html” or a full URL including its protocol prefix e.g. “http://www.google.co.uk”.

Author:

Michael Telford

Instance Attribute Summary collapse

#crawled ⇒ Object (also: #crawled?)

Returns the value of attribute crawled.
#date_crawled ⇒ Object

Returns the value of attribute date_crawled.

Class Method Summary collapse

.concat(host, link) ⇒ Object
.prefix_protocol(url, https = false) ⇒ Object

Modifies the given url in place by prefixing it with a protocol (if it does not already start with one).
.relative_link?(link) ⇒ Boolean

URI.split(“http://www.google.co.uk/about.html”) returns an array in which array[2] (the host) is “www.google.co.uk” and array[5] (the path) is “/about.html”.
.valid?(url) ⇒ Boolean
.validate(url) ⇒ Object

Instance Method Summary collapse

#concat(link) ⇒ Object
#initialize(url_or_doc, crawled = false, date_crawled = nil) ⇒ Url constructor

A new instance of Url.
#relative_link? ⇒ Boolean (also: #internal_link?)
#to_base ⇒ Object (also: #base)

URI.split(“http://www.google.co.uk/about.html”) returns an array in which array[0] (the scheme) is “http” and array[2] (the host) is “www.google.co.uk”.
#to_h ⇒ Object (also: #to_hash)
#to_host ⇒ Object (also: #host)

Given http://www.google.co.uk/about.html, www.google.co.uk is returned.
#to_uri ⇒ Object
#to_url ⇒ Object
#valid? ⇒ Boolean

Constructor Details

#initialize(url_or_doc, crawled = false, date_crawled = nil) ⇒ `Url`

Returns a new instance of Url.

# File 'lib/wgit/url.rb', line 13

# Builds a Url from a plain String or from a document-style object.
#
# When url_or_doc is not a String it is read with [] using the keys :url,
# :crawled and :date_crawled, and those values take the place of the
# crawled and date_crawled arguments.
#
# @param url_or_doc [String, #[]] the url text, or a document holding it
# @param crawled [Boolean] crawled flag stored on the instance
# @param date_crawled [Object, nil] stored on the instance as given
def initialize(url_or_doc, crawled = false, date_crawled = nil)
    url = url_or_doc
    unless url.is_a?(String)
        # Init from a mongo collection document.
        doc = url_or_doc
        url = doc[:url]
        crawled = doc[:crawled] || false # a nil flag is stored as false
        date_crawled = doc[:date_crawled]
    end

    # Instance state is assigned first, then the text is passed up to the
    # superclass constructor.
    @uri = URI(url)
    @crawled = crawled
    @date_crawled = date_crawled
    super(url)
end

Instance Attribute Details

#crawled ⇒ `Object` Also known as: crawled?

Returns the value of attribute crawled.



11
12
13

# File 'lib/wgit/url.rb', line 11

# Reader for the crawled attribute.
#
# @return [Object] the current value of @crawled (nil when never assigned)
def crawled
  return @crawled
end

#date_crawled ⇒ `Object`

Returns the value of attribute date_crawled.



11
12
13

# File 'lib/wgit/url.rb', line 11

# Reader for the date_crawled attribute.
#
# @return [Object] the current value of @date_crawled (nil when never assigned)
def date_crawled
  return @date_crawled
end

Class Method Details

.concat(host, link) ⇒ `Object`

# File 'lib/wgit/url.rb', line 76

# Joins a host and a link with exactly one "/" between them.
#
# One trailing "/" on host and one leading "/" on link are dropped before
# joining. Neither argument is modified, so frozen strings are accepted and
# a caller's host keeps its trailing slash.
#
# @param host [String] the left-hand side, e.g. "http://www.google.co.uk/"
# @param link [String] the right-hand side, e.g. "/about.html"
# @return [Wgit::Url] a new Url built from the joined text
def self.concat(host, link)
    # chomp returns a trimmed copy; a destructive chop! here would edit the
    # caller's string (including the receiver when called via #concat).
    base = host.chomp("/")
    path = link.start_with?("/") ? link[1..-1] : link
    Wgit::Url.new("#{base}/#{path}")
end

.prefix_protocol(url, https = false) ⇒ `Object`

Modifies the given url in place by prefixing it with a protocol (“http://” by default, “https://” when https is true). Returns the url whether it has been modified or not.

# File 'lib/wgit/url.rb', line 49

# Ensures url begins with "http://" or "https://", editing it in place.
#
# @param url [String] changed via String#replace when it has no protocol prefix
# @param https [Boolean] when truthy the added prefix is "https://",
#   otherwise "http://"
# @return [String] the same url object, whether or not it was modified
def self.prefix_protocol(url, https = false)
    return url if url.start_with?("http://", "https://")

    prefixed = https ? "https://#{url}" : "http://#{url}"
    url.replace(prefixed)
end

.relative_link?(link) ⇒ `Boolean`

URI.split(“http://www.google.co.uk/about.html”) returns an array in which array[2] (the host) is “www.google.co.uk” and array[5] (the path) is “/about.html”. This means that all external links in a page are expected to have a protocol prefix e.g. “http://”, otherwise the link is treated as an internal link (regardless of whether it is valid or not).

Returns:

(Boolean)

# File 'lib/wgit/url.rb', line 65

# Decides whether link is relative by inspecting its URI.split segments.
#
# A link with a non-blank host segment is not relative; otherwise a link
# with a non-blank path segment is relative.
#
# @param link [String] the link to inspect
# @return [Boolean] false when a host is present, true when only a path is
# @raise [RuntimeError] when both the host and path segments are nil or empty
def self.relative_link?(link)
    # URI.split yields scheme, userinfo, host, port, registry, path, ...
    _scheme, _userinfo, host, _port, _registry, path = URI.split(link)

    return false unless host.nil? || host.empty?
    return true unless path.nil? || path.empty?

    raise "Invalid link: #{link}"
end

.valid?(url) ⇒ `Boolean`

Returns:

(Boolean)

# File 'lib/wgit/url.rb', line 40

# Boolean form of Wgit::Url.validate.
#
# @param url [String] the url to check
# @return [Boolean] true when validate finishes without raising, false when
#   it raises a StandardError
def self.valid?(url)
    begin
        Wgit::Url.validate(url)
    rescue StandardError
        return false
    end
    true
end

.validate(url) ⇒ `Object`

# File 'lib/wgit/url.rb', line 28

# Raises unless url is an absolute http(s) URL; returns nil otherwise.
#
# Three checks run in order: the url must not be a relative link, it must
# start with "http://" or "https://", and URI.regexp must match it.
#
# @param url [String] the url to check
# @return [nil] when every check passes
# @raise [RuntimeError] with a message naming the first failed check
def self.validate(url)
    raise "Invalid url (or a relative link): #{url}" if Wgit::Url.relative_link?(url)

    has_protocol = url.start_with?("http://", "https://")
    raise "Invalid url (missing protocol prefix): #{url}" unless has_protocol

    raise "Invalid url: #{url}" if URI.regexp.match(url).nil?
end

Instance Method Details

#concat(link) ⇒ `Object`



91
92
93

# File 'lib/wgit/url.rb', line 91

# Instance form of Wgit::Url.concat, with this url as the host.
#
# @param link [String] the link to join onto this url
# @return [Object] whatever Wgit::Url.concat returns for (self, link)
def concat(link)
    return Wgit::Url.concat(self, link)
end

#relative_link? ⇒ `Boolean` Also known as: internal_link?

Returns:

(Boolean)



83
84
85

# File 'lib/wgit/url.rb', line 83

# Instance form of Wgit::Url.relative_link?, applied to this url.
#
# @return [Boolean] whatever Wgit::Url.relative_link? returns for self
def relative_link?
    return Wgit::Url.relative_link?(self)
end

#to_base ⇒ `Object` Also known as: base

URI.split(“http://www.google.co.uk/about.html”) returns an array in which array[0] (the scheme) is “http” and array[2] (the host) is “www.google.co.uk”. Returns array[0] + “://” + array[2] e.g. http://www.google.co.uk.

# File 'lib/wgit/url.rb', line 116

# Builds the base of this url: its scheme and host joined by "://".
#
# @return [Wgit::Url] a new Url such as "http://www.google.co.uk"
# @raise [RuntimeError] when this url is a relative link, or when its
#   scheme is nil or its host is nil or empty
def to_base
    raise "A relative link doesn't have a base URL: #{self}" if Wgit::Url.relative_link?(self)

    # URI.split yields scheme first and host third; the rest is not needed.
    scheme, _userinfo, host = URI.split(self)
    raise "Both a protocol and host are needed: #{self}" if scheme.nil? || host.nil? || host.empty?

    Wgit::Url.new("#{scheme}://#{host}")
end

#to_h ⇒ `Object` Also known as: to_hash

# File 'lib/wgit/url.rb', line 128

# Converts this url to a Hash whose first entry is :url => self.
#
# The remaining entries come from Wgit::Utils.to_h(self, [:@uri]), which is
# defined elsewhere.
# NOTE(review): presumably that helper maps instance variables to hash
# entries and skips the ones listed (here @uri) - confirm against Wgit::Utils.
#
# @return [Hash] :url first, followed by the helper's entries in order
def to_h
    skipped = [:@uri]
    attrs = Wgit::Utils.to_h(self, skipped)
    pairs = [[:url, self]] + attrs.to_a # url goes at position 0
    Hash[pairs]
end

#to_host ⇒ `Object` Also known as: host

Given http://www.google.co.uk/about.html, www.google.co.uk is returned.



109
110
111

# File 'lib/wgit/url.rb', line 109

# Wraps the host component of the stored URI in a new Url.
#
# @return [Wgit::Url] a new Url built from @uri.host
def to_host
    host = @uri.host
    Wgit::Url.new(host)
end

#to_uri ⇒ `Object`



100
101
102

# File 'lib/wgit/url.rb', line 100

# Reader for the URI object held in @uri.
#
# @return [Object] the current value of @uri
def to_uri
    return @uri
end

#to_url ⇒ `Object`



104
105
106

# File 'lib/wgit/url.rb', line 104

# Identity conversion: returns the receiver itself.
#
# @return [Object] self
def to_url
  return self
end

#valid? ⇒ `Boolean`

Returns:

(Boolean)



87
88
89

# File 'lib/wgit/url.rb', line 87

# Instance form of Wgit::Url.valid?, applied to this url.
#
# @return [Boolean] whatever Wgit::Url.valid? returns for self
def valid?
    return Wgit::Url.valid?(self)
end

Class: Wgit::Url

Overview

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(url_or_doc, crawled = false, date_crawled = nil) ⇒ Url

Instance Attribute Details

#crawled ⇒ Object Also known as: crawled?

#date_crawled ⇒ Object

Class Method Details

.concat(host, link) ⇒ Object

.prefix_protocol(url, https = false) ⇒ Object

.relative_link?(link) ⇒ Boolean

.valid?(url) ⇒ Boolean

.validate(url) ⇒ Object

Instance Method Details

#concat(link) ⇒ Object

#relative_link? ⇒ Boolean Also known as: internal_link?

#to_base ⇒ Object Also known as: base

#to_h ⇒ Object Also known as: to_hash

#to_host ⇒ Object Also known as: host

#to_uri ⇒ Object

#to_url ⇒ Object

#valid? ⇒ Boolean

#initialize(url_or_doc, crawled = false, date_crawled = nil) ⇒ `Url`

#crawled ⇒ `Object` Also known as: crawled?

#date_crawled ⇒ `Object`

.concat(host, link) ⇒ `Object`

.prefix_protocol(url, https = false) ⇒ `Object`

.relative_link?(link) ⇒ `Boolean`

.valid?(url) ⇒ `Boolean`

.validate(url) ⇒ `Object`

#concat(link) ⇒ `Object`

#relative_link? ⇒ `Boolean` Also known as: internal_link?

#to_base ⇒ `Object` Also known as: base

#to_h ⇒ `Object` Also known as: to_hash

#to_host ⇒ `Object` Also known as: host

#to_uri ⇒ `Object`

#to_url ⇒ `Object`

#valid? ⇒ `Boolean`