Class: LinksProcessor::LinksProcessor

Inherits:
Object
  • Object
show all
Defined in:
lib/links_processor.rb

Constant Summary collapse

MAX_IMAGESIZE =
'512K'
MIN_IMAGESIZE =
'1K'
MAX_IMAGES =
10
ASPECT_RATIO_RANGE =
0.25..4

Instance Method Summary collapse

Constructor Details

#initialize(type, source = nil) ⇒ LinksProcessor

Returns a new instance of LinksProcessor.



12
13
14
15
16
17
# File 'lib/links_processor.rb', line 12

# Builds a processor for one kind of link.
#
# @param type [Symbol, String] suffix of the +process_*+ method that
#   {#process} dispatches to (the visible ones are doc, link and video).
# @param source [String, Symbol, nil] name of an OEmbed provider; it is
#   capitalized and looked up as a constant under +OEmbed::Providers+.
#   When it is nil or names no known provider, no provider is attached.
def initialize(type, source = nil)
  @type = type
  @source = source && source.capitalize

  # Module#constants returns Symbols on Ruby 1.9+ (Strings on 1.8), so the
  # membership test is done on string forms; comparing a String against the
  # Symbol list never matched and left @processor unset.
  known_provider = @source &&
                   OEmbed::Providers.constants.map { |name| name.to_s }.include?(@source.to_s)
  @processor = OEmbed::Providers.const_get(@source) if known_provider
end

Instance Method Details

#process ⇒ Object



19
20
21
# File 'lib/links_processor.rb', line 19

# Dispatches to the +process_<type>+ method matching the type given to the
# constructor, forwarding any arguments and block.
#
# The visible +process_doc+, +process_link+ and +process_video+ all require
# at least a URL, so a zero-argument dispatch could only raise ArgumentError;
# accepting and forwarding the arguments keeps the old call form valid while
# making the dispatcher usable.
#
# @param args [Array] arguments handed through to the target method
# @return [Object] whatever the target method returns
# @raise [NoMethodError] when no +process_<type>+ method exists
def process(*args, &block)
  send(:"process_#{@type}", *args, &block)
end

#process_doc(url, doc_options) ⇒ Object



41
42
43
44
45
46
47
48
49
50
51
52
53
# File 'lib/links_processor.rb', line 41

# Fetches the oEmbed response for a document URL through @processor and
# annotates it with the parsed request URI and its second-level domain.
#
# @param url [String] document URL handed to the provider
# @param doc_options [Hash] extra oEmbed query options; :format is always
#   forced to :json. The caller's hash is left untouched.
# @return [Object] the provider response with +uri+ and +sld+ set; for
#   Scribd +height+ is taken from the embedded <object>, for Slideshare the
#   'html' field is reduced to the embedded <iframe>.
def process_doc(url, doc_options)
  # merge (not merge!) so the caller's options hash is not modified
  processed_doc = @processor.get(url, doc_options.merge(:format => :json))
  processed_doc.uri = URI.parse processed_doc.request_url
  processed_doc.sld = get_sld processed_doc.uri.host
  doc = Nokogiri::HTML.fragment(processed_doc.html)

  case @source
  when 'Scribd'
    # The embed markup may lack an <object>; keep the height untouched then
    # instead of failing on nil.
    embed = doc.at_xpath('.//object')
    processed_doc.height = embed['height'] if embed
  when 'Slideshare'
    processed_doc.fields['html'] = doc.at_xpath('.//iframe').to_s
  end
  processed_doc
end


60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# File 'lib/links_processor.rb', line 60

# Scrapes a page for link-preview data: title, description and candidate
# preview images.
#
# The title is taken from og:title, then the "title" meta tag, then <title>.
# If the page declares og:image that single image is used; otherwise <img>
# tags in the body are filtered by size, aspect ratio and domain.
#
# @param url [String] address of the page to scrape
# @param link_options [Object, nil] accepted for interface compatibility;
#   not used by this method
# @return [Hash] with keys :images (Array<String>), :title, :description,
#   :sld (second-level domain of the page) and :uri (parsed URI)
def process_link(url, link_options = nil)
  uri = URI.parse url
  uri_sld = get_sld uri.host
  data = {:images => [], :title => nil, :description => nil}
  # NOTE(review): Kernel#open only fetches URLs while open-uri patches it
  # (removed in Ruby 3.0) - confirm against the Ruby version in use.
  doc = Nokogiri::HTML(open(uri.to_s))

  # content attribute of the first node matching xpath, or nil if absent
  meta_content = lambda do |xpath|
    node = doc.at_xpath(xpath)
    node && node['content']
  end

  data[:title] = meta_content.call('/html/head/meta[@property=\'og:title\']') ||
                 meta_content.call('/html/head/meta[@name=\'title\']') ||
                 doc.xpath('/html/head/title').text
  data[:description] = meta_content.call('/html/head/meta[@name=\'description\']')
  data[:sld] = uri_sld
  data[:uri] = uri

  image = meta_content.call('/html/head/meta[@property=\'og:image\']')
  if image
    data[:images] << image
    return data
  end

  # Split body images into those that declare numeric width/height and
  # those that do not.
  unsized_images = []
  images = doc.xpath('/html/body//img').select do |img|
    width, height = ['width', 'height'].map { |attr| img[attr] && img[attr].chomp('px') }
    valid_size = width && height && width =~ /\d+/ && height =~ /\d+/
    unsized_images << img unless valid_size
    valid_size
  end

  # Only probe unsized images when nothing declared its size.
  images += update_images_sizes(unsized_images, uri) if images.empty?

  images.each do |img|
    # >= so that at most MAX_IMAGES are collected (> let one extra through)
    break if data[:images].size >= MAX_IMAGES

    img_uri = URI.parse(img['src']) rescue nil
    # Skip if invalid image src
    next unless img_uri

    width, height = img['width'].to_i, img['height'].to_i

    # Skip if ads images
    next if img_uri.to_s =~ /(ad|ads)\./i

    # Skip if too small (this also covers missing/zero dimensions)
    next if width < 50 || height < 50

    # Float division: integer division truncated e.g. 50/200 to 0 and
    # rejected images whose ratio is actually inside the range.
    next unless ASPECT_RATIO_RANGE.include?(width.to_f / height)

    # Resolve relative and protocol-relative srcs against the page URL.
    begin
      img_uri = uri.merge(img_uri)
    rescue URI::Error
      next
    end

    # Skip if the image's second-level domain differs from the page's, e.g.
    # static.example.com and www.example.com match as both are example.com.
    img_sld = get_sld img_uri.host
    next unless img_sld == uri_sld

    data[:images] << img_uri.to_s
  end
  data
end

#process_video(url, video_options) ⇒ Object



23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
# File 'lib/links_processor.rb', line 23

# Fetches the oEmbed response for a video URL through @processor and
# annotates it with the parsed request URI and its second-level domain.
# For Youtube the embed markup is rewritten so the player URL carries
# autoplay=1.
#
# @param url [String] video URL handed to the provider
# @param video_options [Hash] extra oEmbed query options; :format is always
#   forced to :json. The caller's hash is left untouched.
# @return [Object] the provider response with +uri+ and +sld+ set and, for
#   Youtube, the 'html' field replaced by the autoplay-enabled markup
def process_video(url, video_options)
  # merge (not merge!) so the caller's options hash is not modified
  processed_video = @processor.get(url, video_options.merge(:format => :json))
  processed_video.uri = URI.parse processed_video.request_url
  processed_video.sld = get_sld processed_video.uri.host
  return processed_video unless @source == 'Youtube'

  doc = Nokogiri::HTML.fragment(processed_video.html)

  # Appends autoplay=1 to node[attr], picking '?' or '&' depending on
  # whether the URL already has a query string; a missing node or attribute
  # is skipped instead of raising on nil.
  add_autoplay = lambda do |node, attr|
    player_url = node && node[attr]
    next unless player_url
    separator = player_url.include?('?') ? '&' : '?'
    node[attr] = "#{player_url}#{separator}autoplay=1"
  end

  movie_param = doc.at_xpath('.//param[@name=\'movie\']')
  if movie_param
    # <object>-style embed: the URL sits on both <param> and <embed>
    add_autoplay.call(movie_param, 'value')
    add_autoplay.call(doc.at_xpath('.//embed'), 'src')
  else
    add_autoplay.call(doc.at_xpath('.//iframe'), 'src')
  end
  processed_video.fields['html'] = doc.to_html
  processed_video
end

#update_images_sizes(images, parent_uri) ⇒ Object



123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
# File 'lib/links_processor.rb', line 123

# Works out width/height for images that do not declare usable dimensions
# and writes them back onto each image's 'width'/'height' attributes.
#
# Dimensions are first read from a file name ending in WxH (for example
# banner_100x35.png); otherwise the image is downloaded and measured with
# ImageSize. An image whose download or measurement fails is skipped so one
# broken link does not abort the whole probe.
#
# @param images [Enumerable] image nodes supporting [] and []= for 'src',
#   'width' and 'height'
# @param parent_uri [URI] page URL used to resolve relative image srcs
# @return [Array] up to five of the given images that ended up with both
#   width and height set
def update_images_sizes(images, parent_uri)
  sized_images = []
  images.each do |img|
    break if sized_images.size >= 5
    img_uri = URI.parse(img['src']) rescue nil
    next unless img_uri

    # Try the file name first (e.g. 100x35); only then make a request.
    match = /(\d{1,4})(x|X)(\d{1,4})$/.match(File.basename(img_uri.path.to_s, '.*'))
    if match
      img['width'] = match[1]
      img['height'] = match[3]
    else
      begin
        # merge resolves relative and protocol-relative srcs against the page
        open(parent_uri.merge(img_uri).to_s, "rb") do |fh|
          img_size = ImageSize.new(fh.read)
          img['width'] = img_size.get_width.to_s unless img_size.get_width.nil?
          img['height'] = img_size.get_height.to_s unless img_size.get_height.nil?
        end
      rescue StandardError
        # unreachable or unreadable image: leave it out of the result
        next
      end
    end
    sized_images << img if img['width'] && img['height']
  end
  sized_images
end

#validate_size ⇒ Object



147
148
149
150
151
152
153
# File 'lib/links_processor.rb', line 147

# Sends a HEAD request for a fixed sample image on www.biostat.wisc.edu
# (port 80) and prints the Content-Length header of the response.
#
# @return [String, nil] the printed Content-Length value (nil when the
#   header is absent)
def validate_size
  # Net::HTTP.start yields the open connection and returns the block's value
  logo_head = Net::HTTP.start('www.biostat.wisc.edu', 80) do |connection|
    connection.head('/bcg/categories/languages/ruby/ruby_logo.png')
  end
  p logo_head['content-length']
end