Class: Web2Text::Crawl

Inherits:
Object
  • Object
show all
Defined in:
lib/web2text/crawl.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(url, avoid = [], focus = []) ⇒ Crawl

Returns a new instance of Crawl.



6
7
8
9
10
11
12
13
14
15
16
17
18
# File 'lib/web2text/crawl.rb', line 6

# Sets up a crawl rooted at +url+.
#
# Entries of +avoid+ and +focus+ that do not already begin with +url+ are
# joined onto it with URI.join; every entry is stored in string form.
#
# @param url [String] root URL, used as the prefix test and the join base
# @param avoid [Array<String>] URL prefixes, absolute or relative to +url+
# @param focus [Array<String>] URL prefixes, absolute or relative to +url+
def initialize(url, avoid = [], focus = [])
  @url = url

  # Shared normalizer: leave already-prefixed entries alone, resolve the rest.
  absolutize = lambda do |entry|
    resolved = entry.start_with?(url) ? entry : URI.join(url, entry)
    resolved.to_s
  end

  @avoid = avoid.map(&absolutize)
  @focus = focus.map(&absolutize)
end

Instance Attribute Details

#url ⇒ Object (readonly)

Returns the value of attribute url.



4
5
6
# File 'lib/web2text/crawl.rb', line 4

# Reader for the +@url+ instance variable, which the constructor assigns
# from its +url+ argument.
#
# @return [Object] the current value of +@url+
def url
  @url
end

Instance Method Details

#filter(urls) ⇒ Object



20
21
22
# File 'lib/web2text/crawl.rb', line 20

# Removes every entry of +urls+ for which #skip? answers true.
#
# @param urls [Enumerable] candidate URLs; each is handed to #skip?
# @return [Object] whatever +urls.reject+ yields (an Array for Array input)
def filter(urls)
  urls.reject do |candidate|
    skip?(candidate)
  end
end

#focus?(url) ⇒ Boolean

Returns:

  • (Boolean)


35
36
37
38
39
40
41
42
43
44
# File 'lib/web2text/crawl.rb', line 35

# Tells whether +url+ lies inside the focus set.
#
# @param url [#to_s] URL to test; compared in its string form
# @return [Boolean] true when no focus prefixes are configured, or when the
#   URL's string form starts with at least one of them
def focus?(url)
  # An empty focus list places no restriction on the URL.
  return true if @focus.empty?

  candidate = url.to_s
  @focus.any? { |prefix| candidate.start_with?(prefix) }
end

#skip?(url) ⇒ Boolean

Returns:

  • (Boolean)


24
25
26
27
28
29
30
31
32
33
# File 'lib/web2text/crawl.rb', line 24

# Tells whether +url+ should be left out of the crawl.
#
# @param url [#to_s] URL to test; compared in its string form
# @return [Boolean] true when the URL does not start with +@url+, or when it
#   starts with any of the +@avoid+ prefixes
def skip?(url)
  candidate = url.to_s

  # Anything not prefixed by the root URL is skipped outright.
  return true unless candidate.start_with?(@url)

  @avoid.any? { |prefix| candidate.start_with?(prefix) }
end