Class: Extractula::Extractor

Inherits:
Object
  • Object
show all
Defined in:
lib/extractula/extractor.rb

Overview

Abstract (more or less) extractor class from which custom extractor classes should descend. Subclasses of Extractula::Extractor will be automatically added to the Extractula module.

Direct Known Subclasses

DinosaurComics, Flickr, TwitPic, Vimeo, YFrog, YouTube

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(url, html) ⇒ Extractor

Returns a new instance of Extractor.



64
65
66
67
# File 'lib/extractula/extractor.rb', line 64

# Builds an extractor for a page, normalising both arguments on the way in.
#
# @param url  [Domainatrix::Url, Object] kept as-is when it already is a
#   Domainatrix::Url; anything else is handed to Domainatrix.parse
# @param html [Nokogiri::HTML::Document, Object] kept as-is when it already
#   is a Nokogiri::HTML::Document; anything else is handed to Nokogiri::HTML
def initialize(url, html)
  @url =
    if url.is_a?(Domainatrix::Url)
      url
    else
      Domainatrix.parse(url)
    end

  @html =
    if html.is_a?(Nokogiri::HTML::Document)
      html
    else
      Nokogiri::HTML(html)
    end
end

Instance Attribute Details

#html ⇒ Object (readonly)

Returns the value of attribute html.



62
63
64
# File 'lib/extractula/extractor.rb', line 62

# Read-only accessor for the document this extractor works on.
#
# @return [Object] whatever is currently stored in @html
def html
  @html
end

#url ⇒ Object (readonly)

Returns the value of attribute url.



62
63
64
# File 'lib/extractula/extractor.rb', line 62

# Read-only accessor for the URL this extractor works on.
#
# @return [Object] whatever is currently stored in @url
def url
  @url
end

Class Method Details

.can_extract?(url, html) ⇒ Boolean

Returns:

  • (Boolean)


14
15
16
17
18
19
20
# File 'lib/extractula/extractor.rb', line 14

# Reports whether this extractor class handles the given URL, based on the
# value stored in @extractable_domain:
#
# * a Regexp is matched against the URL's host and path joined together;
# * any other truthy value is compared with == against the URL's domain;
# * nil/false (nothing registered) never matches.
#
# @param url  [#host, #path, #domain] the URL to test
# @param html [Object] not consulted by this implementation
# @return [Boolean] true when the URL matches, false otherwise
def self.can_extract?(url, html)
  return false unless @extractable_domain

  if @extractable_domain.is_a?(Regexp)
    # =~ yields a match offset or nil; coerce it so the predicate honours its
    # Boolean contract. Truthiness is unchanged for existing callers because
    # an offset of 0 is already truthy in Ruby.
    !!((url.host + url.path) =~ @extractable_domain)
  else
    @extractable_domain == url.domain
  end
end

.domain(domain) ⇒ Object



10
11
12
# File 'lib/extractula/extractor.rb', line 10

# Class-level macro that records which domain this extractor handles. The
# value is kept in @extractable_domain.
#
# @param domain [Object] the value to store
# @return [Object] the value that was just stored
def self.domain(domain)
  @extractable_domain = domain
end

.inherited(subclass) ⇒ Object



6
7
8
# File 'lib/extractula/extractor.rb', line 6

# Subclass hook: hands every class that inherits from this one to
# Extractula.add_extractor.
#
# @param subclass [Class] the newly defined subclass
# @return [Object] whatever Extractula.add_extractor returns
def self.inherited(subclass)
  # Keep the hook chain intact so any other inherited callbacks still run.
  super
  Extractula.add_extractor subclass
end

.media_type(type = nil) ⇒ Object



22
23
24
25
# File 'lib/extractula/extractor.rb', line 22

# Combined class-level reader/writer for the media type.
#
# Called with a truthy argument it stores that value in @media_type; called
# with no argument (or nil/false) it leaves the stored value untouched.
#
# @param type [Object, nil] the media type to record, if any
# @return [Object, nil] the media type currently stored
def self.media_type(type = nil)
  return @media_type unless type

  @media_type = type
end

Instance Method Details

#content ⇒ Object



89
90
91
# File 'lib/extractula/extractor.rb', line 89

# Main content of the page.
#
# First asks content_at using content_path, content_attr and content_block;
# when that yields nil or false, falls back to extract_content.
#
# @return [Object] the first truthy result, or the result of extract_content
def content
  located = content_at(content_path, content_attr, content_block)
  located || extract_content
end

#extract ⇒ Object



69
70
71
72
73
74
75
76
77
78
79
# File 'lib/extractula/extractor.rb', line 69

# Gathers every extracted field into a single Extractula::ExtractedContent.
#
# The fields are collected, in order, from url.url, media_type, title,
# content, summary, image_urls and video_embed.
#
# @return [Extractula::ExtractedContent] built from a Hash of those fields
def extract
  attributes = {
    url:         url.url,
    media_type:  media_type,
    title:       title,
    content:     content,
    summary:     summary,
    image_urls:  image_urls,
    video_embed: video_embed
  }

  # Pass the Hash positionally, exactly as a braced literal would be.
  Extractula::ExtractedContent.new(attributes)
end

#image_urls ⇒ Object



97
98
99
100
101
# File 'lib/extractula/extractor.rb', line 97

# Image sources found at image_urls_path.
#
# Searches the document with image_urls_path and passes the matching nodes
# to image_srcs_from.
#
# @return [Object, nil] the result of image_srcs_from, or nil when
#   image_urls_path is nil or false
def image_urls
  return unless image_urls_path

  image_srcs_from(html.search(image_urls_path))
end

#media_type ⇒ Object



81
82
83
# File 'lib/extractula/extractor.rb', line 81

# Media type for this extractor instance.
#
# @return [Object] the media type reported by the instance's class, or the
#   string 'text' when the class reports nil or false
def media_type
  declared = self.class.media_type
  declared || 'text'
end

#summary ⇒ Object



93
94
95
# File 'lib/extractula/extractor.rb', line 93

# Summary of the page.
#
# @return [Object] whatever content_at returns for summary_path,
#   summary_attr and summary_block
def summary
  path      = summary_path
  attribute = summary_attr
  block     = summary_block

  content_at(path, attribute, block)
end

#title ⇒ Object



85
86
87
# File 'lib/extractula/extractor.rb', line 85

# Title of the page.
#
# Tries content_at with title_path, title_attr and title_block first; when
# that yields nil or false, looks up the "//title" path instead.
#
# @return [Object] the first truthy result, or the result of the "//title"
#   lookup
def title
  custom = content_at(title_path, title_attr, title_block)
  return custom if custom

  content_at("//title")
end

#video_embed ⇒ Object



103
104
105
106
107
# File 'lib/extractula/extractor.rb', line 103

# Embed code found at video_embed_path.
#
# Searches the document with video_embed_path and passes the matching nodes
# to embed_code_from.
#
# @return [Object, nil] the result of embed_code_from, or nil when
#   video_embed_path is nil or false
def video_embed
  return unless video_embed_path

  embed_code_from(html.search(video_embed_path))
end