Class: Webtractor::Extractor

Inherits:
Object
  • Object
show all
Defined in:
lib/webtractor/extractor.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(params = {}) ⇒ Extractor

Returns a new instance of Extractor.



5
6
7
8
9
10
# File 'lib/webtractor/extractor.rb', line 5

# Builds an extractor from an options hash.
#
# Every option is read with `params[...]`; a missing (or nil/false) value
# falls back to the default shown below.
#
# @param params [Hash] optional settings:
#   :agent        - object used to fetch pages (default: Mechanize.new)
#   :filters      - list of filters (default: one Filters::DefaultFilter)
#   :cache        - caching flag (default: false)
#   :cache_params - options forwarded to the cache call (default: {})
def initialize(params = {})
  @agent = params[:agent]
  @agent ||= Mechanize.new

  @filters = params[:filters]
  @filters ||= [Filters::DefaultFilter.new]

  @cache = params[:cache]
  @cache ||= false

  @cache_params = params[:cache_params]
  @cache_params ||= {}
end

Instance Attribute Details

#filters ⇒ Object

Returns the value of attribute filters.



3
4
5
# File 'lib/webtractor/extractor.rb', line 3

# Reader for the filter list held in @filters.
#
# @return [Object] the current value of @filters (the same object, not a copy)
def filters; @filters; end

Instance Method Details

#add_filter(filter) ⇒ Object



32
33
34
35
36
37
38
# File 'lib/webtractor/extractor.rb', line 32

# Appends a filter to @filters.
#
# @param filter [Class, Object] a filter instance, or a Class which is
#   instantiated with no arguments before being appended
# @return [Object] the result of `@filters << ...`
def add_filter(filter)
  @filters << (filter.is_a?(Class) ? filter.new : filter)
end

#clear_filters ⇒ Object



45
46
47
# File 'lib/webtractor/extractor.rb', line 45

# Empties @filters in place by calling `clear` on it.
#
# @return [Object] whatever `@filters.clear` returns
def clear_filters
  current = @filters
  current.clear
end

#extract(text) ⇒ Object



12
13
14
# File 'lib/webtractor/extractor.rb', line 12

# Parses the given markup with Nokogiri::HTML and hands the parsed
# document to #extract_from_xml.
#
# @param text [String] markup to parse
# @return [Object] whatever #extract_from_xml returns for the parsed document
def extract(text)
  document = Nokogiri::HTML(text)
  extract_from_xml(document)
end

#extract_from_url(url) ⇒ Object



25
26
27
28
29
30
# File 'lib/webtractor/extractor.rb', line 25

# Fetches the page at +url+ through @agent and runs #extract on its content.
#
# The fetch is wrapped in Cachy.cache_if, called with @cache, the key
# "webtractor.<url>" and @cache_params.
# NOTE(review): presumably the content is only cached when @cache is
# truthy — confirm against Cachy's documentation.
#
# @param url [String] address of the page to fetch
# @return [Object] whatever #extract returns for the fetched content
def extract_from_url(url)
  cache_key = "webtractor.#{url}"
  html = Cachy.cache_if(@cache, cache_key, @cache_params) { @agent.get(url).content }
  extract(html)
end

#extract_from_xml(page) ⇒ Object



16
17
18
19
20
21
22
23
# File 'lib/webtractor/extractor.rb', line 16

# Builds a Result from a parsed document.
#
# The title is the text of `//head/title`. The body starts as
# `page.at('body')` and is passed through each filter's #process in the
# order the filters appear in @filters, each filter receiving the
# previous filter's output.
#
# @param page [Object] parsed document responding to #xpath and #at
# @return [Result] Result.new(title, filtered_body)
def extract_from_xml(page)
  title = page.xpath('//head/title').text
  filtered = @filters.inject(page.at('body')) do |node, filter|
    filter.process(node)
  end
  Result.new(title, filtered)
end

#remove_filter(filter) ⇒ Object



40
41
42
43
# File 'lib/webtractor/extractor.rb', line 40

# Removes every filter of the given kind from @filters, in place.
#
# @param filter [Class, Object] a Class, or an instance whose class is used;
#   every element of @filters that `is_a?` that class is removed
# @return [Object] @filters after removal
def remove_filter(filter)
  filter = filter.class unless filter.is_a?(Class)
  # delete_if always returns the receiver. The previous
  # `@filters = @filters.reject! { ... }` assigned nil to @filters whenever
  # nothing matched, because reject! returns nil when it makes no change.
  @filters.delete_if { |f| f.is_a?(filter) }
end