Class: Newly::PageCrawler

Inherits:
Object
  • Object
show all
Defined in:
lib/newly/page_crawler.rb

Instance Method Summary collapse

Constructor Details

#initialize(host, document) ⇒ PageCrawler

Returns a new instance of PageCrawler.



3
4
5
6
# File 'lib/newly/page_crawler.rb', line 3

# Builds a crawler bound to one host and one document.
#
# @param host [Object] stored in @host; other methods interpolate it as the
#   prefix when they build absolute URLs
# @param document [Object] stored in @document (not read by the methods
#   visible here — presumably used by the lookup helpers; verify)
def initialize(host, document)
  @host, @document = host, document
end

Instance Method Details

#image(element) ⇒ Object



28
29
30
31
32
33
34
35
# File 'lib/newly/page_crawler.rb', line 28

# Looks up the 'src' value for +element+ and normalises it.
#
# Two rewrites are applied, in this order:
# 1. if the value contains "==/", only the part after the last "==/" is kept
#    and "http://" is put in front of it (presumably unwrapping a proxied
#    image URL — confirm against real inputs);
# 2. if the value (after step 1) contains "../", it is prefixed with
#    "#{@host}/" and every "../" is stripped from the combined string.
#
# @param element [Object] passed straight to #find
# @return [String, nil] the normalised source, or the falsy value #find returned
def image(element)
  src = find(element, 'src')
  return src unless src

  src = "http://#{src.split("==/").last}" if src.include?("==/")
  return src unless src.include?('../')

  "#{@host}/#{src}".gsub('../', '')
end


22
23
24
25
26
# File 'lib/newly/page_crawler.rb', line 22

# Looks up the 'href' value for +element+ and makes it absolute.
#
# A value that already starts with an http:// or https:// scheme is returned
# untouched. Anything else is prefixed with "#{@host}/" and has every "../"
# stripped from the combined string.
#
# @param element [Object] passed straight to #find
# @return [String, nil] the link, or the falsy value #find returned
def link(element)
  href = find(element, 'href')
  return href unless href

  # Anchor the scheme check to the start of the string: a relative path that
  # merely contains "http" (e.g. "articles/http-basics.html") still needs the
  # host in front of it.
  return href if href =~ %r{\Ahttps?://}i

  "#{@host}/#{href}".gsub('../', '')
end

#text(element) ⇒ Object



15
16
17
18
19
20
# File 'lib/newly/page_crawler.rb', line 15

# Returns the text of the node looked up for +element+.
#
# Both the incoming +element+ and the extracted text are checked with
# #valid?; if either check fails the result is nil.
#
# @param element [Object] passed to #valid? and #get
# @return [Object, nil] whatever `get(element).text` returned, or nil
def text(element)
  return unless valid?(element)

  content = get(element).text
  valid?(content) ? content : nil
end

#titleize(element) ⇒ Object



8
9
10
11
12
13
# File 'lib/newly/page_crawler.rb', line 8

# Returns the text for +element+ with its first character capitalised and
# the rest of the string left exactly as it was.
#
# A falsy or empty title is returned unchanged: an empty string has no first
# character to replace. The result is a new String, so the value returned by
# #text is never mutated and may be frozen.
#
# @param element [Object] passed straight to #text
# @return [String, nil] the title-cased text, or the falsy/empty value from #text
def titleize(element)
  title = text(element)
  return title if !title || title.empty?

  # Take the first character from the capitalised copy (as before) and keep
  # the original tail, so only the leading character changes.
  "#{title.capitalize[0]}#{title[1..-1]}"
end