Class: Retriever::Page
- Inherits:
-
Object
- Object
- Retriever::Page
- Defined in:
- lib/retriever/page.rb
Constant Summary collapse
- HTTP_RE =
Regexp.new(/^http/i).freeze
- H1_RE =
Regexp.new(/<h1>(.*)<\/h1>/i).freeze
- H2_RE =
Regexp.new(/<h2>(.*)<\/h2>/i).freeze
- TITLE_RE =
Regexp.new(/<title>(.*)<\/title>/i).freeze
- DESC_RE =
Regexp.new(/<meta[^>]*name=[\"|\']description[\"|\'] [^>]*content=[\"] ( [^\"]* ) [\"] [^>] *> /ix).freeze
- HREF_CONTENTS_RE =
Regexp.new(/\shref= ['|"] ( [^\s] [a-z0-9\.\/\:\-\%\+\?\!\=\&\,\:\;\~\_]+ ) ['|"] [\s|\W] /ix).freeze
- NONPAGE_EXT_RE =
Regexp.new(/\. (?:css|js|png|gif|jpg|mp4| wmv|flv|mp3|wav|doc|txt|ico|xml) /ix).freeze
Instance Attribute Summary collapse
-
#links ⇒ Object
readonly
Receives page source as a string; returns an array of unique href links.
-
#source ⇒ Object
readonly
Returns the value of attribute source.
-
#t ⇒ Object
readonly
Returns the value of attribute t.
-
#url ⇒ Object
readonly
Returns the value of attribute url.
Instance Method Summary collapse
- #desc ⇒ Object
- #h1 ⇒ Object
- #h2 ⇒ Object
-
#initialize(url, source, t) ⇒ Page
constructor
A new instance of Page.
- #parse_files(arr = parse_internal) ⇒ Object
- #parse_internal ⇒ Object
- #parse_internal_visitable ⇒ Object
- #parse_seo ⇒ Object
- #title ⇒ Object
Constructor Details
#initialize(url, source, t) ⇒ Page
Returns a new instance of Page.
36 37 38 39 40 41 |
# File 'lib/retriever/page.rb', line 36 def initialize(url, source, t) @url = url @t = t @source = source.encode_utf8_and_replace @links = nil end |
Instance Attribute Details
#links ⇒ Object (readonly)
Receives page source as a string; returns an array of unique href links
45 46 47 |
# File 'lib/retriever/page.rb', line 45 def links @links end |
#source ⇒ Object (readonly)
Returns the value of attribute source.
34 35 36 |
# File 'lib/retriever/page.rb', line 34 def source @source end |
#t ⇒ Object (readonly)
Returns the value of attribute t.
34 35 36 |
# File 'lib/retriever/page.rb', line 34 def t @t end |
#url ⇒ Object (readonly)
Returns the value of attribute url.
34 35 36 |
# File 'lib/retriever/page.rb', line 34 def url @url end |
Instance Method Details
#desc ⇒ Object
72 73 74 |
# File 'lib/retriever/page.rb', line 72 def desc DESC_RE =~ @source ? @source.match(DESC_RE)[1].decode_html : '' end |
#h1 ⇒ Object
76 77 78 |
# File 'lib/retriever/page.rb', line 76 def h1 H1_RE =~ @source ? @source.match(H1_RE)[1].decode_html : '' end |
#h2 ⇒ Object
80 81 82 |
# File 'lib/retriever/page.rb', line 80 def h2 H2_RE =~ @source ? @source.match(H2_RE)[1].decode_html : '' end |
#parse_files(arr = parse_internal) ⇒ Object
64 65 66 |
# File 'lib/retriever/page.rb', line 64 def parse_files(arr = parse_internal) arr.select { |x| @t.file_re =~ x } end |
#parse_internal ⇒ Object
56 57 58 |
# File 'lib/retriever/page.rb', line 56 def parse_internal links.select { |x| @t.host == Addressable::URI.parse(Addressable::URI.encode(x)).host } end |
#parse_internal_visitable ⇒ Object
60 61 62 |
# File 'lib/retriever/page.rb', line 60 def parse_internal_visitable parse_internal.select { |x| !(NONPAGE_EXT_RE =~ x) } end |
#parse_seo ⇒ Object
84 85 86 |
# File 'lib/retriever/page.rb', line 84 def parse_seo [title, desc, h1, h2] end |