Class: LinksProcessor::LinksProcessor
- Inherits:
-
Object
- Object
- LinksProcessor::LinksProcessor
- Defined in:
- lib/links_processor.rb
Constant Summary collapse
- MAX_IMAGESIZE =
'512K'
- MIN_IMAGESIZE =
'1K'
- MAX_IMAGES =
10
- ASPECT_RATIO_RANGE =
0.25..4
Instance Method Summary collapse
-
#initialize(type, source = nil) ⇒ LinksProcessor
constructor
A new instance of LinksProcessor.
- #process ⇒ Object
- #process_doc(url, doc_options) ⇒ Object
- #process_link(url, link_options = nil) ⇒ Object
- #process_video(url, video_options) ⇒ Object
- #update_images_sizes(images, parent_uri) ⇒ Object
- #validate_size ⇒ Object
Constructor Details
#initialize(type, source = nil) ⇒ LinksProcessor
Returns a new instance of LinksProcessor.
12 13 14 15 16 17 |
# File 'lib/links_processor.rb', line 12
# Builds a processor for one kind of link.
#
# @param type [String, Symbol] suffix of the +process_*+ method that #process dispatches to
# @param source [String, Symbol, nil] provider name; it is capitalized and looked up as a
#   constant directly under OEmbed::Providers
def initialize(type, source = nil)
  @type = type
  # Normalize to a capitalized String so comparisons such as @source == 'Youtube'
  # also work when the caller passes a Symbol.
  source &&= source.to_s.capitalize
  # Module#constants returns Symbols on Ruby 1.9+ (Strings on 1.8), so the names are
  # compared as Strings; comparing a String against Symbols never matches.
  if source && OEmbed::Providers.constants.map(&:to_s).include?(source)
    @processor = OEmbed::Providers.const_get(source)
  end
  @source = source
end
Instance Method Details
#process ⇒ Object
19 20 21 |
# File 'lib/links_processor.rb', line 19
# Dispatches to the +process_<type>+ method named after the type given to the constructor
# (for example +process_link+ when the type is +link+).
#
# The handlers all require at least a URL, so any arguments given here are forwarded
# unchanged; calling without arguments behaves as before.
#
# @param args [Array] arguments passed through to the handler (typically url and options)
# @return [Object] whatever the handler returns
# @raise [NoMethodError] when no +process_<type>+ method exists
def process(*args)
  send(:"process_#{@type}", *args)
end
#process_doc(url, doc_options) ⇒ Object
41 42 43 44 45 46 47 48 49 50 51 52 53 |
# File 'lib/links_processor.rb', line 41
# Fetches embed data for a document URL from the configured provider and adjusts it
# for the Scribd and Slideshare sources.
#
# @param url [String] address of the document
# @param doc_options [Hash, nil] options forwarded to the provider; the hash is not modified
# @return [Object] the object returned by @processor.get, with +uri+ and +sld+ assigned
def process_doc(url, doc_options)
  # Non-destructive merge: the caller's hash must not silently gain a :format key.
  request_options = (doc_options || {}).merge(:format => :json)
  processed_doc = @processor.get(url, request_options)
  processed_doc.uri = URI.parse(processed_doc.request_url)
  # get_sld is defined outside this view; presumably it extracts the second-level domain.
  processed_doc.sld = get_sld(processed_doc.uri.host)
  doc = Nokogiri::HTML.fragment(processed_doc.html)

  case @source
  when 'Scribd'
    # Copy the embed height from the <object> tag, if the markup has one.
    object_node = doc.at_xpath('.//object')
    processed_doc.height = object_node['height'] if object_node
  when 'Slideshare'
    # Keep only the <iframe>; leave the html untouched rather than blanking it
    # when the markup has no iframe.
    iframe = doc.at_xpath('.//iframe')
    processed_doc.fields['html'] = iframe.to_s if iframe
  end

  processed_doc
end
#process_link(url, link_options = nil) ⇒ Object
60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 |
# File 'lib/links_processor.rb', line 60
# Downloads a page and collects preview data for it: title, description and candidate
# image URLs.
#
# The title comes from og:title, then the "title" meta tag, then <title>. When the page
# declares og:image, that is the only image returned. Otherwise <img> tags in the body
# are filtered by size, aspect ratio, an ad-URL heuristic and domain.
#
# @param url [String] absolute HTTP(S) address of the page
# @param link_options [Hash, nil] accepted for interface compatibility; currently unused
# @return [Hash] with keys :images, :title, :description, :sld and :uri
# @raise [ArgumentError] when the URL is of a kind open-uri cannot fetch
def process_link(url, link_options = nil)
  uri = URI.parse(url)
  # get_sld is defined outside this view; presumably it extracts the second-level domain.
  uri_sld = get_sld(uri.host)
  data = {:images => [], :title => nil, :description => nil}

  # URI#open exists only on URIs that open-uri can fetch. Unlike Kernel#open it can
  # never open a local file or spawn a subprocess, and the block form closes the stream.
  raise ArgumentError, "unsupported URL: #{url}" unless uri.respond_to?(:open)
  doc = uri.open { |io| Nokogiri::HTML(io) }

  data[:title] = link_meta_content(doc, '/html/head/meta[@property=\'og:title\']') ||
                 link_meta_content(doc, '/html/head/meta[@name=\'title\']') ||
                 doc.xpath('/html/head/title').text
  data[:description] = link_meta_content(doc, '/html/head/meta[@name=\'description\']')
  data[:sld] = uri_sld
  data[:uri] = uri

  og_image = link_meta_content(doc, '/html/head/meta[@property=\'og:image\']')
  if og_image
    data[:images] << og_image
    return data
  end

  # Split body images into those with numeric width/height attributes and the rest.
  sized, unsized = doc.xpath('/html/body//img').partition do |img|
    %w[width height].all? { |attr| img[attr].to_s.strip.chomp('px') =~ /\A\d+\z/ }
  end
  # Only when no image declares its size are the sizes looked up another way.
  candidates = sized.empty? ? update_images_sizes(unsized, uri) : sized

  candidates.each do |img|
    break if data[:images].size >= MAX_IMAGES

    src = img['src'].to_s.strip
    # Skip likely ad images; \b keeps names such as "road.png" or "download." out of the match.
    next if src =~ /\bads?\./i

    width, height = img['width'].to_i, img['height'].to_i
    next if width < 50 || height < 50
    # Float division: integer division would turn every portrait ratio into 0.
    next unless ASPECT_RATIO_RANGE.include?(width.to_f / height)

    # Resolve relative and protocol-relative sources against the page address.
    img_uri = resolve_link_image_uri(src, uri)
    next unless img_uri

    # Keep only images from the page's own second-level domain, so static.example.com
    # and www.example.com both match example.com.
    next unless get_sld(img_uri.host) == uri_sld

    data[:images] << img_uri.to_s
  end

  data
end

# Returns the content attribute of the first node matching xpath, or nil.
def link_meta_content(doc, xpath)
  node = doc.at_xpath(xpath)
  node && node['content']
end
private :link_meta_content

# Returns src as an absolute URI resolved against page_uri, or nil when it is blank or malformed.
def resolve_link_image_uri(src, page_uri)
  return nil if src.empty?
  page_uri.merge(URI.parse(src))
rescue URI::Error
  nil
end
private :resolve_link_image_uri
#process_video(url, video_options) ⇒ Object
23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 |
# File 'lib/links_processor.rb', line 23
# Fetches embed data for a video URL from the configured provider and, for Youtube,
# rewrites the embed markup so that playback starts automatically.
#
# @param url [String] address of the video
# @param video_options [Hash, nil] options forwarded to the provider; the hash is not modified
# @return [Object] the object returned by @processor.get, with +uri+ and +sld+ assigned
def process_video(url, video_options)
  # Non-destructive merge: the caller's hash must not silently gain a :format key.
  request_options = (video_options || {}).merge(:format => :json)
  processed_video = @processor.get(url, request_options)
  processed_video.uri = URI.parse(processed_video.request_url)
  # get_sld is defined outside this view; presumably it extracts the second-level domain.
  processed_video.sld = get_sld(processed_video.uri.host)

  if @source == 'Youtube'
    doc = Nokogiri::HTML.fragment(processed_video.html)
    movie_param = doc.at_xpath('.//param[@name=\'movie\']')
    if movie_param
      # <object>-style markup: the player URL appears on both <param> and <embed>.
      append_autoplay_param(movie_param, 'value')
      append_autoplay_param(doc.at_xpath('.//embed'), 'src')
    else
      append_autoplay_param(doc.at_xpath('.//iframe'), 'src')
    end
    processed_video.fields['html'] = doc.to_html
  end

  processed_video
end

# Appends autoplay=1 to the URL held in node[attribute]; does nothing when the node or attribute is absent.
def append_autoplay_param(node, attribute)
  player_url = node && node[attribute]
  return unless player_url
  # URLs that already carry parameters take "&"; a bare URL needs "?" to start its query string.
  separator = player_url =~ /[?&]/ ? '&' : '?'
  node[attribute] = "#{player_url}#{separator}autoplay=1"
end
private :append_autoplay_param
#update_images_sizes(images, parent_uri) ⇒ Object
123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 |
# File 'lib/links_processor.rb', line 123
# Determines width and height for images that do not declare usable ones and returns
# those it could size, at most five.
#
# The size is taken from the file name when it ends in a WIDTHxHEIGHT pattern
# (e.g. banner_100x35.png); otherwise the image is downloaded and measured. Images
# that are missing a src, cannot be fetched, or cannot be measured are skipped.
#
# @param images [Enumerable] elements supporting [] and []= for 'src', 'width', 'height'
# @param parent_uri [URI] address of the containing page, used to resolve relative sources
# @return [Array] the elements whose 'width' and 'height' were filled in
def update_images_sizes(images, parent_uri)
  sized_images = []

  images.each do |img|
    break if sized_images.size > 4

    src = img['src'].to_s.strip
    next if src.empty?

    img_uri = begin
      URI.parse(src)
    rescue URI::InvalidURIError
      nil
    end
    next unless img_uri

    # Prefer a size embedded in the file name; it avoids a network request.
    match = /(\d{1,4})[xX](\d{1,4})\z/.match(File.basename(img_uri.path.to_s, '.*'))
    dimensions = match ? [match[1], match[2]] : remote_image_dimensions(parent_uri, img_uri)
    next unless dimensions && dimensions.all?

    img['width'], img['height'] = dimensions.map(&:to_s)
    sized_images << img
  end

  sized_images
end

# Downloads the image and returns [width, height], or nil when it cannot be fetched or measured.
def remote_image_dimensions(parent_uri, img_uri)
  absolute_uri = parent_uri.merge(img_uri)
  # Only URIs that open-uri can fetch respond to #open; others (e.g. data:) are skipped.
  return nil unless absolute_uri.respond_to?(:open)

  absolute_uri.open('rb') do |fh|
    img_size = ImageSize.new(fh.read)
    [img_size.get_width, img_size.get_height]
  end
rescue StandardError
  # Best effort: one unreachable or corrupt image must not abort the whole scan.
  nil
end
private :remote_image_dimensions
#validate_size ⇒ Object
147 148 149 150 151 152 153 |
# File 'lib/links_processor.rb', line 147
# Issues an HTTP HEAD request and prints the Content-Length header of the response.
#
# The defaults reproduce the original hard-coded request, so existing callers that
# pass no arguments are unaffected.
#
# @param host [String] server to contact
# @param path [String] resource path on that server
# @param port [Integer] TCP port
# @return [String, nil] the Content-Length header value (also printed with +p+)
def validate_size(host = 'www.biostat.wisc.edu',
                  path = '/bcg/categories/languages/ruby/ruby_logo.png',
                  port = 80)
  # Net::HTTP.start returns the block's value, so no pre-declared local is needed.
  response = Net::HTTP.start(host, port) { |http| http.head(path) }
  p response['content-length']
end