Class: Wgit::Url

Inherits:
String show all
Defined in:
lib/wgit/url.rb

Overview

Class modeling a web-based URL. Can be an internal (relative) link e.g. “about.html” or a full URL including its protocol e.g. “http://www.google.co.uk”.

Author:

  • Michael Telford

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(url_or_doc, crawled = false, date_crawled = nil) ⇒ Url

Returns a new instance of Url.



13
14
15
16
17
18
19
20
21
22
23
24
25
26
# File 'lib/wgit/url.rb', line 13

# Builds a Url from either a plain String or a document-like object.
#
# @param url_or_doc [String, #[]] a URL String, or an object indexed by
#   :url, :crawled and :date_crawled (e.g. a mongo collection document).
# @param crawled [Boolean] crawled flag; replaced by the document's :crawled
#   entry when a document is given.
# @param date_crawled [Object, nil] crawl date; replaced by the document's
#   :date_crawled entry when a document is given.
def initialize(url_or_doc, crawled = false, date_crawled = nil)
  url = url_or_doc

  unless url_or_doc.is_a?(String)
    # Init from a mongo collection document.
    doc = url_or_doc
    url = doc[:url]
    stored_flag = doc[:crawled]
    # A missing :crawled entry is treated as "not crawled".
    crawled = stored_flag.nil? ? false : stored_flag
    date_crawled = doc[:date_crawled]
  end

  @uri = URI(url)
  @crawled = crawled
  @date_crawled = date_crawled
  super(url)
end

Instance Attribute Details

#crawled ⇒ Object Also known as: crawled?

Returns the value of attribute crawled.



11
12
13
# File 'lib/wgit/url.rb', line 11

# Reader for the @crawled instance variable, which the constructor sets from
# its crawled argument or from the :crawled entry of a document.
#
# @return [Object] the stored crawled value (false by default in the
#   constructor signature).
def crawled
  @crawled
end

#date_crawled ⇒ Object

Returns the value of attribute date_crawled.



11
12
13
# File 'lib/wgit/url.rb', line 11

# Reader for the @date_crawled instance variable, which the constructor sets
# from its date_crawled argument or from the :date_crawled entry of a document.
#
# @return [Object, nil] the stored crawl date (nil by default in the
#   constructor signature).
def date_crawled
  @date_crawled
end

Class Method Details

.concat(host, link) ⇒ Object



76
77
78
79
80
81
# File 'lib/wgit/url.rb', line 76

# Joins a host and a link with exactly one "/" between them and wraps the
# result in a new Wgit::Url.
#
# Neither argument is modified. One trailing "/" on host and one leading "/"
# on link are ignored when building the result.
#
# @param host [String] the url the link is appended to.
# @param link [String] the link to append.
# @return [Wgit::Url] the joined url.
def self.concat(host, link)
  # chomp returns a copy, so the caller's host keeps its trailing slash and
  # frozen strings are accepted (an in-place chop! would alter or reject them).
  base = host.chomp("/")
  path = link.start_with?("/") ? link[1..-1] : link
  Wgit::Url.new("#{base}/#{path}")
end

.prefix_protocol(url, https = false) ⇒ Object

Modifies the given url in place by prefixing it with a protocol (“https://” when https is true, otherwise “http://”) unless it already starts with one. Returns the url whether it's been modified or not.



49
50
51
52
53
54
55
56
57
58
# File 'lib/wgit/url.rb', line 49

# Ensures url starts with a protocol, editing the given String in place.
#
# @param url [String] the url to prefix; it is mutated when a prefix is added.
# @param https [Boolean] when truthy "https://" is used, otherwise "http://".
# @return [String] the same url object, prefixed or left untouched.
def self.prefix_protocol(url, https = false)
  # Already carries a protocol: hand it back unchanged.
  return url if url.start_with?("http://", "https://")

  scheme = https ? "https" : "http"
  # String#replace swaps the contents in place and returns the receiver.
  url.replace("#{scheme}://#{url}")
end

.relative_link?(link) ⇒ Boolean

URI.split(“http://www.google.co.uk/about.html”) returns the following: array[2]: “www.google.co.uk”, array[5]: “/about.html”. This means that all external links in a page are expected to have a protocol prefix e.g. “http://”, otherwise the link is treated as an internal link (regardless of whether it is valid or not).

Returns:

  • (Boolean)


65
66
67
68
69
70
71
72
73
74
# File 'lib/wgit/url.rb', line 65

# Tells whether link is relative (has a path but no host) according to
# URI.split.
#
# @param link [String] the link to inspect.
# @return [Boolean] false when a host segment is present, true when only a
#   path segment is present.
# @raise [RuntimeError] when the link has neither a host nor a path.
def self.relative_link?(link)
  # URI.split returns nine segments; index 2 is the host, index 5 the path.
  host, path = URI.split(link).values_at(2, 5)

  return false unless host.nil? || host.empty?
  return true unless path.nil? || path.empty?

  raise "Invalid link: #{link}"
end

.valid?(url) ⇒ Boolean

Returns:

  • (Boolean)


40
41
42
43
44
45
# File 'lib/wgit/url.rb', line 40

# Boolean wrapper around Wgit::Url.validate.
#
# @param url [String] the url to check.
# @return [Boolean] true when validate returns normally, false when it
#   raises a StandardError.
def self.valid?(url)
  begin
    Wgit::Url.validate(url)
  rescue StandardError
    return false
  end

  true
end

.validate(url) ⇒ Object



28
29
30
31
32
33
34
35
36
37
38
# File 'lib/wgit/url.rb', line 28

# Raises unless url is an absolute http(s) URL.
#
# The checks run in order: the url must not be a relative link, must start
# with "http://" or "https://", and must match URI.regexp.
#
# NOTE(review): URI.regexp is marked obsolete in the uri standard library, and
# the match looks unanchored (it can succeed on a substring) - consider
# replacing it; behaviour is kept as-is here.
#
# @param url [String] the url to validate.
# @return [nil] when every check passes.
# @raise [RuntimeError] with a message naming the failed check.
def self.validate(url)
  raise "Invalid url (or a relative link): #{url}" if Wgit::Url.relative_link?(url)

  has_protocol = url.start_with?("http://", "https://")
  raise "Invalid url (missing protocol prefix): #{url}" unless has_protocol

  raise "Invalid url: #{url}" unless URI.regexp.match(url)
end

Instance Method Details

#concat(link) ⇒ Object



91
92
93
# File 'lib/wgit/url.rb', line 91

# Appends link to this url by delegating to Wgit::Url.concat, passing self as
# the host.
#
# NOTE(review): the class inherits from String, so this definition takes the
# place of String#concat on Url instances - confirm that is intended.
#
# @param link [String] the link to append.
# @return [Object] whatever Wgit::Url.concat returns.
def concat(link)
    Wgit::Url.concat(self, link)
end

#relative_link? ⇒ Boolean Also known as: internal_link?

Returns:

  • (Boolean)


83
84
85
# File 'lib/wgit/url.rb', line 83

# Instance-level shortcut for Wgit::Url.relative_link?, called with self.
#
# @return [Boolean] the result of Wgit::Url.relative_link?(self).
def relative_link?
    Wgit::Url.relative_link?(self)
end

#to_base ⇒ Object Also known as: base

URI.split(“http://www.google.co.uk/about.html”) returns the following: array[0]: “http”, array[2]: “www.google.co.uk”. Returns array[0] + “://” + array[2] e.g. “http://www.google.co.uk”.



116
117
118
119
120
121
122
123
124
125
126
# File 'lib/wgit/url.rb', line 116

# Builds the base url (protocol plus host) of this url.
#
# NOTE(review): only the scheme and host segments of URI.split are used, so a
# port (segment index 3) is not part of the returned base - confirm this is
# intended.
#
# @return [Wgit::Url] a new url of the form "<scheme>://<host>".
# @raise [RuntimeError] when this url is a relative link, or when the scheme
#   or host is missing.
def to_base
  raise "A relative link doesn't have a base URL: #{self}" if Wgit::Url.relative_link?(self)

  # Segment 0 is the scheme and segment 2 the host; userinfo (1) is unused.
  scheme, _userinfo, host = URI.split(self)
  if scheme.nil? || host.nil? || host.empty?
    raise "Both a protocol and host are needed: #{self}"
  end

  Wgit::Url.new("#{scheme}://#{host}")
end

#to_h ⇒ Object Also known as: to_hash



128
129
130
131
132
# File 'lib/wgit/url.rb', line 128

# Converts this url to a Hash whose first entry is :url => self.
#
# The remaining entries come from Wgit::Utils.to_h, which is called with
# :@uri in its ignore list.
#
# @return [Hash] the :url pair followed by the pairs from Wgit::Utils.to_h.
def to_h
  pairs = Wgit::Utils.to_h(self, [:@uri]).to_a
  # Put the url pair at the front so it is the first key of the Hash.
  Hash[pairs.unshift([:url, self])]
end

#to_host ⇒ Object Also known as: host



109
110
111
# File 'lib/wgit/url.rb', line 109

# Wraps the host of the parsed @uri in a new Wgit::Url.
#
# NOTE(review): URI#host can be nil (e.g. when the url is a relative link);
# nil would then be handed to the constructor - confirm callers only use this
# on urls that have a host.
#
# @return [Wgit::Url] a url built from @uri.host.
def to_host
    Wgit::Url.new(@uri.host)
end

#to_uri ⇒ Object



100
101
102
# File 'lib/wgit/url.rb', line 100

# Returns the object stored in @uri, which the constructor creates with
# URI(url).
#
# @return [URI] the parsed form of this url.
def to_uri
    @uri
end

#to_url ⇒ Object



104
105
106
# File 'lib/wgit/url.rb', line 104

# Identity conversion: returns the receiver itself, not a copy.
#
# @return [Wgit::Url] self.
def to_url
  self
end

#valid? ⇒ Boolean

Returns:

  • (Boolean)


87
88
89
# File 'lib/wgit/url.rb', line 87

# Instance-level shortcut for Wgit::Url.valid?, called with self.
#
# @return [Boolean] the result of Wgit::Url.valid?(self).
def valid?
    Wgit::Url.valid?(self)
end