Class: Webtractor::Extractor
- Inherits:
-
Object
- Object
- Webtractor::Extractor
- Defined in:
- lib/webtractor/extractor.rb
Instance Attribute Summary collapse
-
#filters ⇒ Object
Returns the value of attribute filters.
Instance Method Summary collapse
- #add_filter(filter) ⇒ Object
- #clear_filters ⇒ Object
- #extract(text) ⇒ Object
- #extract_from_url(url) ⇒ Object
- #extract_from_xml(page) ⇒ Object
-
#initialize(params = {}) ⇒ Extractor
constructor
A new instance of Extractor.
- #remove_filter(filter) ⇒ Object
Constructor Details
#initialize(params = {}) ⇒ Extractor
Returns a new instance of Extractor.
5 6 7 8 9 10 |
# File 'lib/webtractor/extractor.rb', line 5
# Builds an extractor from an options hash; every key is optional.
#
# @param params [Hash] recognised keys:
#   :agent        - object used to fetch URLs (falls back to Mechanize.new)
#   :filters      - list of filters (falls back to one Filters::DefaultFilter)
#   :cache        - flag passed to Cachy.cache_if (falls back to false)
#   :cache_params - options passed to Cachy.cache_if (falls back to {})
# @return [Extractor] a new instance of Extractor
def initialize(params = {})
  # `||` (not fetch) on purpose: an explicit nil/false value also gets the fallback.
  @agent = params[:agent] || Mechanize.new
  @filters = params[:filters] || [Filters::DefaultFilter.new]
  @cache = params[:cache] || false
  @cache_params = params[:cache_params] || {}
end
Instance Attribute Details
#filters ⇒ Object
Returns the value of attribute filters.
3 4 5 |
# File 'lib/webtractor/extractor.rb', line 3
# Reader for the filter list held in @filters.
#
# @return [Object] the current value of @filters (the object itself, not a copy)
def filters
  @filters
end
Instance Method Details
#add_filter(filter) ⇒ Object
32 33 34 35 36 37 38 |
# File 'lib/webtractor/extractor.rb', line 32
# Appends a filter to @filters.
#
# @param filter [Class, Object] a filter class (instantiated here with a
#   no-argument `new`) or an already-built filter object (appended as is)
# @return [Object] the result of `@filters << ...` (for an Array, the array itself)
def add_filter(filter)
  instance = filter.is_a?(Class) ? filter.new : filter
  @filters << instance
end
#clear_filters ⇒ Object
45 46 47 |
# File 'lib/webtractor/extractor.rb', line 45
# Empties the filter list in place by calling `clear` on @filters.
#
# @return [Object] whatever `@filters.clear` returns (for an Array, the now-empty array)
def clear_filters
  @filters.clear
end
#extract(text) ⇒ Object
12 13 14 |
# File 'lib/webtractor/extractor.rb', line 12
# Parses +text+ with Nokogiri::HTML and passes the parsed document
# to #extract_from_xml.
#
# @param text [Object] markup handed to Nokogiri::HTML (presumably an HTML string)
# @return [Object] the return value of #extract_from_xml
def extract(text)
  document = Nokogiri::HTML(text)
  extract_from_xml(document)
end
#extract_from_url(url) ⇒ Object
25 26 27 28 29 30 |
# File 'lib/webtractor/extractor.rb', line 25
# Fetches +url+ with @agent and runs the fetched content through #extract.
#
# The fetch is wrapped in Cachy.cache_if, which receives @cache, a key of the
# form "webtractor.<url>", and @cache_params.
# NOTE(review): presumably Cachy only caches when @cache is truthy and
# otherwise just runs the block - confirm against the Cachy docs.
#
# @param url [Object] address passed to @agent.get and interpolated into the cache key
# @return [Object] the return value of #extract
def extract_from_url(url)
  cache_key = "webtractor.#{url}"
  content = Cachy.cache_if(@cache, cache_key, @cache_params) { @agent.get(url).content }
  extract(content)
end
#extract_from_xml(page) ⇒ Object
16 17 18 19 20 21 22 23 |
# File 'lib/webtractor/extractor.rb', line 16
# Builds a Result from a parsed page.
#
# The title is the text of the nodes matched by '//head/title'. The body is
# the node returned by `page.at('body')`, passed through every filter in
# @filters in list order; each filter receives the previous filter's output.
#
# @param page [Object] parsed document responding to `xpath` and `at`
# @return [Result] Result.new(title, filtered_body)
def extract_from_xml(page)
  title = page.xpath('//head/title').text
  filtered_body = @filters.inject(page.at('body')) do |node, filter|
    filter.process(node)
  end
  Result.new(title, filtered_body)
end
#remove_filter(filter) ⇒ Object
40 41 42 43 |
# File 'lib/webtractor/extractor.rb', line 40
# Removes every filter of the given kind from @filters, in place.
#
# @param filter [Class, Object] a filter class, or a filter instance whose
#   class is used instead; each entry that `is_a?` that class (subclasses
#   included) is removed
# @return [Object] the filter list after removal (the same object as @filters)
def remove_filter(filter)
  filter = filter.class unless filter.is_a?(Class)
  # delete_if rather than `@filters = @filters.reject! { ... }`: reject!
  # returns nil when nothing matched, which used to overwrite @filters with
  # nil and break every later add_filter / clear_filters / extract call.
  @filters.delete_if { |f| f.is_a?(filter) }
end