Class: StaticImageDownloader::Parser

Inherits:
Object
  • Object
show all
Defined in:
lib/static_image_download/parser.rb

Constant Summary collapse

# Maps each supported parse-option key to the name of the instance method
# that implements that extraction strategy (looked up by option_to_method).
PARSER_OPTIONS = {
  'URI_EXTRACT' => :img_parse_uri_extract,
  'NOKOGIRI'    => :img_parse_nokogiri,
  'HPRICOT'     => :img_parse_hpricot
}
@@DEFAULTPARSEOPTION =

You can also use ‘NOKOGIRI’ or ‘HPRICOT’.

'URI_EXTRACT'
# Default for the user_agent argument of #initialize; get_url sends it as
# the 'User-Agent' request header.
@@DEFAULTUSERAGENT =
'Mozilla/5.0'
# Default for the path argument of #initialize; push_image hands it to
# every Images instance it creates.
@@DEFAULTPATH =
"./"
# Default for the url argument of #initialize.
@@DEFAULTSITE =
'http://feed.informer.com'
# Default for the timeout argument of #initialize; parse_images passes it
# to Timeout::timeout.
@@DEFAULTTIMEOUT =
15

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(url = @@DEFAULTSITE, path = @@DEFAULTPATH, parse_option = @@DEFAULTPARSEOPTION, timeout = @@DEFAULTTIMEOUT, user_agent = @@DEFAULTUSERAGENT, h = {}) ⇒ Parser

Returns a new instance of Parser.



21
22
23
24
25
26
27
28
29
30
31
32
33
# File 'lib/static_image_download/parser.rb', line 21

# Builds a parser for a single page.
#
# Every positional argument falls back to its class-level default when it is
# omitted or passed as nil, so a caller can skip an argument with nil.
#
# @param url [String, nil] page whose images should be extracted
# @param path [String, nil] path handed to each Images instance by push_image
# @param parse_option [String, nil] PARSER_OPTIONS key selecting the strategy
# @param timeout [Numeric, nil] limit passed to Timeout::timeout in parse_images
# @param user_agent [String, nil] value sent as the 'User-Agent' header by get_url
# @param h [Hash] accepted for interface compatibility; not used here
# @raise [URI::InvalidURIError] if the effective URL cannot be parsed
def initialize(url=@@DEFAULTSITE, path=@@DEFAULTPATH, parse_option=@@DEFAULTPARSEOPTION, timeout=@@DEFAULTTIMEOUT, user_agent=@@DEFAULTUSERAGENT, h={})
  @url             = url || @@DEFAULTSITE
  @user_agent      = user_agent || @@DEFAULTUSERAGENT
  @path            = path || @@DEFAULTPATH
  @timeout         = timeout || @@DEFAULTTIMEOUT
  @parse_option    = parse_option || @@DEFAULTPARSEOPTION
  @images          = []
  @extracted_links = []
  # Absolute http/https/ftp URI (host name or dotted IPv4, optional credentials
  # and port) whose path ends in one of the Images::IMAGE_EXT extensions.
  @rgxp_img_uri    = Regexp.new(/^(http|https|ftp)\:\/\/([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)?((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.[a-zA-Z]{2,4})(\:[0-9]+)?(\/[^\/][a-zA-Z0-9\.\,\?\'\\\/\+&%\$#\=~_\-@]*)\.(#{Images::IMAGE_EXT.join('|')})/i)
  # Disabled alternative that also accepts "www" and protocol-relative ("//") prefixes:
  #@rgxp_img_uri     = Regexp.new(/^(((http|https|ftp)\:\/\/)|www|(\/\/))([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)?((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.[a-zA-Z]{2,4})(\:[0-9]+)?(\/[^\/][a-zA-Z0-9\.\,\?\'\\\/\+&%\$#\=~_\-@]*)\.(#{Images::IMAGE_EXT.join('|')})/i)
  # Parse the resolved @url, not the raw argument: a nil url has already been
  # replaced by the default above, and URI.parse(nil) would raise.
  @domain          = URI.parse(@url).host
  @content         = nil
end

Instance Attribute Details

#content ⇒ Object

Returns the value of attribute content.



6
7
8
# File 'lib/static_image_download/parser.rb', line 6

# Reader for @content.
#
# @return [String, nil] nil after construction; get_content_raw stores the
#   fetched page body here
def content
  @content
end

#extracted_links ⇒ Object

Returns the value of attribute extracted_links.



6
7
8
# File 'lib/static_image_download/parser.rb', line 6

# Reader for @extracted_links.
#
# @return [Array] starts empty; get_extracted_links appends each distinct
#   image URI it matches
def extracted_links
  @extracted_links
end

#images ⇒ Object

Returns the value of attribute images.



6
7
8
# File 'lib/static_image_download/parser.rb', line 6

# Reader for @images.
#
# @return [Array] starts empty; push_image appends one Images instance per call
def images
  @images
end

#parse_option ⇒ Object

Returns the value of attribute parse_option.



6
7
8
# File 'lib/static_image_download/parser.rb', line 6

# Reader for @parse_option.
#
# @return [Object] the strategy key chosen at construction; parse_images
#   passes it to method_to_value
def parse_option
  @parse_option
end

#url ⇒ Object

Returns the value of attribute url.



6
7
8
# File 'lib/static_image_download/parser.rb', line 6

# Reader for @url.
#
# @return [Object] the page address resolved at construction; get_url opens it
def url
  @url
end

#user_agent ⇒ Object

Returns the value of attribute user_agent.



6
7
8
# File 'lib/static_image_download/parser.rb', line 6

# Reader for @user_agent.
#
# @return [Object] the value get_url sends as the 'User-Agent' header
def user_agent
  @user_agent
end

Class Method Details

.default_parse_option ⇒ Object



36
37
38
# File 'lib/static_image_download/parser.rb', line 36

# Exposes the class-wide default parse option.
#
# @return [Object] the current value of @@DEFAULTPARSEOPTION
def default_parse_option
  @@DEFAULTPARSEOPTION
end

.default_path ⇒ Object



44
45
46
# File 'lib/static_image_download/parser.rb', line 44

# Exposes the class-wide default path.
#
# @return [Object] the current value of @@DEFAULTPATH
def default_path
  @@DEFAULTPATH
end

.default_timeout ⇒ Object



48
49
50
# File 'lib/static_image_download/parser.rb', line 48

# Exposes the class-wide default timeout.
#
# @return [Object] the current value of @@DEFAULTTIMEOUT
def default_timeout
  @@DEFAULTTIMEOUT
end

.default_user_agent ⇒ Object



40
41
42
# File 'lib/static_image_download/parser.rb', line 40

# Exposes the class-wide default User-Agent string.
#
# @return [Object] the current value of @@DEFAULTUSERAGENT
def default_user_agent
  @@DEFAULTUSERAGENT
end

Instance Method Details

#collect_images ⇒ Object



116
117
118
119
120
# File 'lib/static_image_download/parser.rb', line 116

# Hands every link gathered in @extracted_links to push_image, in order.
#
# @return [Array] @extracted_links itself (the receiver of the iteration)
def collect_images
  @extracted_links.each { |src| push_image(src) }
end

#get_content_raw ⇒ Object



69
70
71
72
73
# File 'lib/static_image_download/parser.rb', line 69

# Fetches the page through get_url and stores its body in @content with
# every run of newline, carriage-return and tab characters collapsed to a
# single space.
#
# The handle returned by get_url is always closed, even when reading fails.
#
# @return [String] the normalised content (also available via #content)
def get_content_raw
  handle = get_url
  begin
    # Non-destructive gsub: gsub! returns nil when nothing was replaced,
    # which used to leak out as this method's return value.
    @content = handle.read.gsub(/[\n\r\t]+/, ' ')
  ensure
    handle.close if handle.respond_to?(:close)
  end
  #p @content if $debug_option
  @content
end


#get_extracted_links(links) ⇒ Object

93
94
95
96
97
98
99
100
101
# File 'lib/static_image_download/parser.rb', line 93

# Filters candidate links through @rgxp_img_uri and records every distinct
# match in @extracted_links.
#
# Entries that are not Strings are read through entry[:src] and converted
# with to_s before matching.
#
# @param links [#each, nil, false] candidate links or nodes
# @return [false, Object] false when links is nil/false, otherwise the
#   result of links.each
def get_extracted_links(links)
  return false unless links

  links.each do |entry|
    p "link= #{entry}" if $debug_option
    candidate = entry.is_a?(String) ? entry : entry[:src].to_s
    hit = candidate.match(@rgxp_img_uri)
    next unless hit

    image_uri = hit[0]
    @extracted_links << image_uri unless @extracted_links.include?(image_uri)
  end
  #p "extracted_links= #{@extracted_links}" if $debug_option
end

#get_url ⇒ Object



75
76
77
# File 'lib/static_image_download/parser.rb', line 75

# Opens the configured URL, sending the configured User-Agent header.
#
# The address is parsed first and opened through the URI object rather than
# through Kernel#open: Kernel#open treats a string beginning with "|" as a
# command to run, and the open-uri hook on Kernel#open was removed in
# Ruby 3.0.
#
# @return [IO-like] readable handle for the response body; the caller is
#   responsible for closing it
# @raise [URI::InvalidURIError] if url is not a parseable URI
def get_url
  URI.parse(url).open('User-Agent' => user_agent)
end

#img_parse_hpricot(h = {}) ⇒ Object



84
85
86
87
# File 'lib/static_image_download/parser.rb', line 84

# Extraction strategy backed by Hpricot: parses @content and forwards all
# nodes matching "//img" to get_extracted_links.
#
# @param h [Hash] unused; present because method_to_value always passes it
# @return [Object] whatever get_extracted_links returns
def img_parse_hpricot(h={})
  img_nodes = Hpricot(@content).search("//img")
  get_extracted_links(img_nodes)
end

#img_parse_nokogiri(h = {}) ⇒ Object



79
80
81
82
# File 'lib/static_image_download/parser.rb', line 79

# Extraction strategy backed by Nokogiri: parses @content as HTML and
# forwards all nodes matching "//img" to get_extracted_links.
#
# @param h [Hash] unused; present because method_to_value always passes it
# @return [Object] whatever get_extracted_links returns
def img_parse_nokogiri(h={})
  img_nodes = Nokogiri::HTML(@content).search("//img")
  get_extracted_links(img_nodes)
end

#img_parse_uri_extract(h = {}) ⇒ Object



89
90
91
# File 'lib/static_image_download/parser.rb', line 89

# Extraction strategy that needs no HTML parser: pulls every URI out of
# @content with URI.extract, keeps those matching the image pattern and
# forwards them to get_extracted_links.
#
# @param h [Hash] unused; present because method_to_value always passes it
# @return [Object] whatever get_extracted_links returns
def img_parse_uri_extract(h={})
  # Build the interpolated pattern once instead of once per candidate.
  image_pattern = /#{@rgxp_img_uri}/
  candidates = URI.extract(@content)
  get_extracted_links(candidates.select { |uri| uri[image_pattern] })
end

#method_to_value(option, h = {}) ⇒ Object



57
58
59
60
61
62
63
64
65
66
67
# File 'lib/static_image_download/parser.rb', line 57

# Resolves a parse option to its handler method and invokes it.
#
# Errors raised while invoking the handler are printed and swallowed; the
# lookup itself (option_to_method) is not guarded.
#
# @param option [Object] key understood by option_to_method
# @param h [Hash] passed through to the handler
# @return [Object, nil] the handler's result, "" when the handler returns
#   nil/false, or nil when invoking it raised
def method_to_value(option, h={})
  handler = option_to_method(option)
  p "method= #{handler}" if $debug_option
  begin
    send(handler, h) || ""
  rescue => failure
    p "method_to_value.error = #{failure}"
    nil
  end
end

#option_to_method(option) ⇒ Object



53
54
55
# File 'lib/static_image_download/parser.rb', line 53

# Looks up the handler method name registered for a parse option.
#
# The key is normalised with to_s.upcase, so 'NOKOGIRI', 'nokogiri' and
# :nokogiri all resolve to the same PARSER_OPTIONS entry.
#
# @param option [String, Symbol, nil] parse option key
# @return [Symbol, nil] handler method name, or nil for an unknown option
def option_to_method(option)
  PARSER_OPTIONS[option.to_s.upcase]
end

#parse_images(h = {}) ⇒ Object



103
104
105
106
107
108
109
110
111
112
113
114
# File 'lib/static_image_download/parser.rb', line 103

# Runs the configured extraction strategy and then collect_images, all
# inside a Timeout::timeout(@timeout) guard.
#
# Any StandardError raised inside is printed and swallowed.
#
# @param h [Hash] passed through to method_to_value
# @return [Object, nil] the result of collect_images, or nil after an error
def parse_images(h={})
  Timeout::timeout(@timeout) do
    method_to_value(parse_option, h)
    collect_images
  end
rescue => failure
  p "#{failure}"
  nil
end

#push_image(src) ⇒ Object



122
123
124
# File 'lib/static_image_download/parser.rb', line 122

# Wraps a link in a new Images object (built with @path and the Images
# default download option) and appends it to the images list.
#
# @param src [Object] image source passed as the first Images.new argument
# @return [Array] the images array after the append
def push_image(src)
  image = Images.new(src, @path, Images.default_download_option)
  images << image
end