Class: Retriever::Page
- Inherits:
-
Object
- Object
- Retriever::Page
- Defined in:
- lib/retriever/page.rb
Constant Summary collapse
- HTTP_RE =
Regexp.new(/^http/i).freeze
- H1_RE =
Regexp.new(/<h1>(.*)<\/h1>/i).freeze
- H2_RE =
Regexp.new(/<h2>(.*)<\/h2>/i).freeze
- TITLE_RE =
Regexp.new(/<title>(.*)<\/title>/i).freeze
- DESC_RE =
Regexp.new(/<meta[^>]*name=[\"|\']description[\"|\'] [^>]*content=[\"] ( [^\"]* ) [\"] [^>] *> /ix).freeze
- HREF_CONTENTS_RE =
Regexp.new(/\shref= ['|"] ( [^\s] [a-z0-9\.\/\:\-\%\+\?\!\=\&\,\:\;\~\_]+ ) ['|"] [\s|\W] /ix).freeze
- NONPAGE_EXT_RE =
Regexp.new(/\. (?:css|js|png|gif|jpg|mp4| wmv|flv|mp3|wav|doc|txt|ico|xml) /ix).freeze
Instance Attribute Summary collapse
-
#links ⇒ Object
readonly
Receives page source as a string and returns an array of unique href links.
-
#source ⇒ Object
readonly
Returns the value of attribute source.
-
#t ⇒ Object
readonly
Returns the value of attribute t.
Instance Method Summary collapse
- #desc ⇒ Object
- #h1 ⇒ Object
- #h2 ⇒ Object
-
#initialize(source, t) ⇒ Page
constructor
A new instance of Page.
- #parse_files(arr) ⇒ Object
- #parse_internal ⇒ Object
- #parse_internal_visitable ⇒ Object
- #parse_seo ⇒ Object
- #title ⇒ Object
Constructor Details
#initialize(source, t) ⇒ Page
Returns a new instance of Page.
35 36 37 38 39 |
# File 'lib/retriever/page.rb', line 35 def initialize(source, t) @t = t @source = source.encode('UTF-8', invalid: :replace, undef: :replace) @links = nil end |
Instance Attribute Details
#links ⇒ Object (readonly)
Receives page source as a string and returns an array of unique href links
43 44 45 |
# File 'lib/retriever/page.rb', line 43 def links @links end |
#source ⇒ Object (readonly)
Returns the value of attribute source.
33 34 35 |
# File 'lib/retriever/page.rb', line 33 def source @source end |
#t ⇒ Object (readonly)
Returns the value of attribute t.
33 34 35 |
# File 'lib/retriever/page.rb', line 33 def t @t end |
Instance Method Details
#desc ⇒ Object
70 71 72 |
# File 'lib/retriever/page.rb', line 70 def desc DESC_RE =~ @source ? @source.match(DESC_RE)[1] : '' end |
#h1 ⇒ Object
74 75 76 |
# File 'lib/retriever/page.rb', line 74 def h1 H1_RE =~ @source ? @source.match(H1_RE)[1] : '' end |
#h2 ⇒ Object
78 79 80 |
# File 'lib/retriever/page.rb', line 78 def h2 H2_RE =~ @source ? @source.match(H2_RE)[1] : '' end |
#parse_files(arr) ⇒ Object
62 63 64 |
# File 'lib/retriever/page.rb', line 62 def parse_files(arr) arr.select { |x| @t.file_re =~ x } end |
#parse_internal ⇒ Object
54 55 56 |
# File 'lib/retriever/page.rb', line 54 def parse_internal links.select { |x| @t.host == Addressable::URI.parse(x).host } end |
#parse_internal_visitable ⇒ Object
58 59 60 |
# File 'lib/retriever/page.rb', line 58 def parse_internal_visitable parse_internal.select { |x| !(NONPAGE_EXT_RE =~ x) } end |
#parse_seo ⇒ Object
82 83 84 |
# File 'lib/retriever/page.rb', line 82 def parse_seo [title, desc, h1, h2] end |