Class: Webtractor::Extractor

Inherits:
Object
  • Object
show all
Defined in:
lib/webtractor/extractor.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(params = {}) ⇒ Extractor

Returns a new instance of Extractor.



5
6
7
8
9
# File 'lib/webtractor/extractor.rb', line 5

# Sets up the extractor from an options hash.
#
# @param params [Hash] optional settings
# @option params [Object] :filters filter list to use; when missing or falsy,
#   a one-element array holding a new Filters::DefaultFilter is used
# @option params [Object] :cache stored in @cache; falls back to false
# @option params [Object] :cache_params stored in @cache_params; falls back to {}
def initialize(params = {})
  @filters = params[:filters]
  @filters ||= [Filters::DefaultFilter.new]

  @cache = params[:cache]
  @cache ||= false

  @cache_params = params[:cache_params]
  @cache_params ||= {}
end

Instance Attribute Details

#filters ⇒ Object

Returns the value of attribute filters.



3
4
5
# File 'lib/webtractor/extractor.rb', line 3

# Reader for the filter list.
#
# @return [Object] the current value of @filters
def filters
  @filters
end

Instance Method Details

#add_filter(filter) ⇒ Object



30
31
32
33
34
35
36
# File 'lib/webtractor/extractor.rb', line 30

# Appends a filter to @filters.
#
# @param filter [Class, Object] when a Class is given it is instantiated with
#   no arguments and the instance is appended; anything else is appended as-is
# @return [Object] the result of `@filters << ...`
def add_filter(filter)
  instance = filter.is_a?(Class) ? filter.new : filter
  @filters << instance
end

#clear_filters ⇒ Object



43
44
45
# File 'lib/webtractor/extractor.rb', line 43

# Removes every filter by calling #clear on @filters.
#
# @return [Object] whatever `@filters.clear` returns
def clear_filters
  @filters.clear
end

#extract(text) ⇒ Object



11
12
13
# File 'lib/webtractor/extractor.rb', line 11

# Parses +text+ with Nokogiri::HTML and hands the parsed document to
# #extract_from_xml.
#
# @param text [Object] markup to parse (passed straight to Nokogiri::HTML)
# @return [Object] the value returned by #extract_from_xml
def extract(text)
  document = Nokogiri::HTML(text)
  extract_from_xml(document)
end

#extract_from_url(url) ⇒ Object



23
24
25
26
27
28
# File 'lib/webtractor/extractor.rb', line 23

# Downloads the document at +url+ and runs #extract on its body.
#
# The download is wrapped in Cachy.cache_if, using @cache as the condition,
# "webtractor.<url>" as the key and @cache_params as the options.
#
# Requires Ruby 2.5+ (public URI.open).
#
# @param url [String] address of the page to fetch
# @return [Object] the value returned by #extract for the downloaded body
def extract_from_url(url)
  # Since Ruby 3.0 Kernel#open no longer fetches URLs, so go through
  # open-uri's URI.open explicitly. `require` is a no-op once loaded.
  require 'open-uri'

  content = Cachy.cache_if(@cache, "webtractor.#{url}", @cache_params) do
    # The block form closes the handle as soon as the body has been read.
    # NOTE(review): strings that do not look like URLs may still be handed to
    # Kernel#open by URI.open -- validate +url+ if it can come from untrusted input.
    URI.open(url, &:read)
  end
  extract(content)
end

#extract_from_xml(page) ⇒ Object



15
16
17
18
19
20
21
# File 'lib/webtractor/extractor.rb', line 15

# Runs every filter in @filters over +page+, in order, and wraps the outcome
# in a Result together with the page title.
#
# @param page [Object] parsed document; must respond to #xpath
# @return [Result] built from the title text and the output of the last filter
#   (or +page+ itself when there are no filters)
def extract_from_xml(page)
  # Capture the title before any filter runs.
  title = page.xpath('//head/title').text
  # Each filter receives the previous filter's output.
  filtered = @filters.inject(page) { |doc, filter| filter.process(doc) }
  Result.new(title, filtered)
end

#remove_filter(filter) ⇒ Object



38
39
40
41
# File 'lib/webtractor/extractor.rb', line 38

# Removes every filter that is an instance of the given class.
#
# @param filter [Class, Object] a class, or an object whose class is used
# @return [Object] @filters after removal (never nil)
def remove_filter(filter)
  filter_class = filter.is_a?(Class) ? filter : filter.class
  # delete_if always returns the receiver; reject! returns nil when nothing
  # matched, which previously overwrote @filters with nil.
  @filters.delete_if { |f| f.is_a?(filter_class) }
end