Class: HRefPreview::Preview
- Inherits:
-
Object
- Object
- HRefPreview::Preview
- Defined in:
- lib/href_preview/preview.rb
Constant Summary collapse
- SANITIZE_OPTIONS =
Instance Attribute Summary collapse
-
#response ⇒ Object
readonly
Returns the value of attribute response.
Instance Method Summary collapse
- #article_html ⇒ Object
- #article_node ⇒ Object
- #article_text ⇒ Object
- #canonical_uri ⇒ Object
-
#charset ⇒ String
Returns the charset declared in the HTTP headers or HTML meta tags.
- #description ⇒ Object
-
#dom ⇒ Nokogiri::HTML::Document
The DOM for the response body.
- #image_uri ⇒ Object
- #images ⇒ Object
-
#initialize(response, connection = DEFAULT_CONNECTION) ⇒ Preview
constructor
Initializes a `Preview` from an HTTP response.
- #inspect ⇒ Object
-
#is_html? ⇒ true, false
Returns true if the response had a 2xx HTTP code and the mime type is either HTML or XHTML.
- #item_type ⇒ Object
- #language ⇒ Object
-
#mime_type ⇒ MIME::Type
Returns the MIME type declared in the HTTP headers or HTML meta tags.
- #published ⇒ Object
- #shortlink_uri ⇒ Object
- #site_name ⇒ Object
-
#title ⇒ String
The title of the page.
-
#twitter ⇒ String
The Twitter handle used by the site.
- #updated ⇒ Object
Constructor Details
#initialize(response, connection = DEFAULT_CONNECTION) ⇒ Preview
Initializes a `Preview` from an HTTP response.
28 29 30 31 |
# File 'lib/href_preview/preview.rb', line 28
#
# Initializes a `Preview` from an HTTP response.
#
# @param response [Object] the HTTP response to wrap; stored in `@response`
# @param connection [Object] stored in `@connection`; defaults to
#   `DEFAULT_CONNECTION`
def initialize(response, connection = DEFAULT_CONNECTION)
  @response, @connection = response, connection
end
Instance Attribute Details
#response ⇒ Object (readonly)
Returns the value of attribute response.
33 34 35 |
# File 'lib/href_preview/preview.rb', line 33
#
# Reader for the response this preview was constructed with.
#
# @return [Object] the value of `@response`
def response
  @response
end
Instance Method Details
#article_html ⇒ Object
342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 |
# File 'lib/href_preview/preview.rb', line 342
#
# Builds a sanitized HTML rendering of the article body.
#
# The children of `article_node` are serialized (skipping a few share/related
# link widgets identified by their class attribute), passed through
# `Sanitize.clean` with `SANITIZE_OPTIONS`, whitespace-normalized, and finally
# re-parsed so that `<script>` elements and empty, attribute-less elements can
# be removed.
#
# @return [String, nil] the cleaned markup, or nil when the response is not
#   HTML or no article node was found
def article_html
  return @article_html if @article_html
  return unless is_html?

  container = article_node
  return unless container

  skipped_classes = [
    'related_links_inline',
    'inline-share-btn-label',
    'inline-share-btn'
  ]
  kept_children = container.children.reject do |child|
    css_class = child.attribute('class')
    css_class && skipped_classes.include?(css_class.value)
  end

  markup = Sanitize.clean(kept_children.map(&:to_s).join(''), SANITIZE_OPTIONS)
  markup = markup.
    gsub("\r\n", "\n").
    gsub("\t", " ").
    gsub(/ *\n */, "\n").
    gsub(/\n\n+/, "\n\n").
    gsub(/<p>\n+/, "<p>\n").
    gsub(/\n+<\/p>/, "\n</p>").
    gsub(/<\/p>\n+/, "</p>\n").
    strip

  # Excise empty elements
  fragment = Nokogiri::HTML.fragment(markup)
  prune = lambda do |element|
    if element.respond_to?(:name) && element.name == "script"
      element.unlink
    else
      # Depth-first: clean the descendants before judging the element itself.
      element.children.each do |descendant|
        prune.call(descendant) if descendant.element?
      end
      removable =
        element.respond_to?(:attribute_nodes) &&
        element.respond_to?(:text) &&
        element.attribute_nodes.size == 0 &&
        element.text.to_s.strip =~ /^\s*$/ &&
        element.children.all? { |inner| inner.text? }
      element.unlink if removable
    end
  end
  prune.call(fragment)

  @article_html = fragment.to_s
end
#article_node ⇒ Object
297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 |
# File 'lib/href_preview/preview.rb', line 297
#
# Locates the single element that wraps the article body.
#
# A fixed list of XPath and CSS selectors is tried in order against `dom`.
# A selector only counts when it matches exactly one node; the first selector
# that does so supplies the result. Nothing is searched unless `is_html?`.
#
# @return [Object, nil] the matched node, or nil when no selector matched
#   exactly one node or the response is not HTML
def article_node
  return @article_node if @article_node
  return unless is_html?

  selectors = [
    [:xpath, "/html[@itemtype='http://schema.org/NewsArticle']//article[@id='story']"],
    [:xpath, "//*/*[@itemtype='http://schema.org/NewsArticle']"],
    [:xpath, "//*/*[@itemprop='articleBody']"],
    [:css, "article div.article-entry"],
    [:css, "article.post div.entry-content"],
    [:css, "div.post div.postBody"],
    [:css, ".pg_story div#leftcolumn div.body"]
  ]

  match = nil
  selectors.each do |finder, selector|
    candidates = dom.send(finder, selector)
    next unless candidates.size == 1
    match = candidates.first
    break if match
  end
  @article_node = match
end
#article_text ⇒ Object
392 393 394 |
# File 'lib/href_preview/preview.rb', line 392
#
# Plain-text rendering of the article: `article_html` passed through
# `Sanitize.clean` with no options.
#
# @return [String, nil] the cleaned text, or nil when the response is not HTML
def article_text
  return @article_text if @article_text

  @article_text =
    if is_html?
      Sanitize.clean(article_html)
    else
      nil
    end
end
#canonical_uri ⇒ Object
203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 |
# File 'lib/href_preview/preview.rb', line 203
#
# The canonical URI of the page.
#
# For HTML responses the `<link rel="canonical">` href is preferred, then the
# `og:url` meta content. When neither is present and non-empty, or when the
# response is not HTML, the URL of the response itself
# (`response.env.url`) is used.
#
# @return [Addressable::URI]
def canonical_uri
  @canonical_uri ||= begin
    declared = nil
    if is_html?
      [
        "//*/link[@rel='canonical']/@href",
        "//*/meta[@property='og:url']/@content"
      ].each do |query|
        attribute = dom.xpath(query).first
        target = attribute && attribute.value
        next if !target || target == ''
        declared = Addressable::URI.parse(target)
        break if declared
      end
    end
    declared || Addressable::URI.parse(response.env.url.to_s)
  end
end
#charset ⇒ String
Returns the charset declared in the HTTP headers or HTML meta tags.
59 60 61 62 63 64 65 66 67 68 69 70 71 72 |
# File 'lib/href_preview/preview.rb', line 59
#
# Returns the charset declared in the HTTP headers or HTML meta tags.
#
# Sources are consulted in order and the first one that yields a value wins:
#   1. the `charset=` parameter of the `Content-Type` response header,
#   2. the `charset=` parameter of `<meta http-equiv="Content-Type">`,
#   3. the `charset` attribute of any `<meta>` element.
#
# The fallbacks are combined with `||` rather than `or`: `x = a or b` parses
# as `(x = a) or b`, which would evaluate the meta-tag lookups and then throw
# their results away.
#
# @return [String, nil] the declared charset with surrounding whitespace
#   removed, or nil when none is declared
def charset
  @charset ||= begin
    parameter = /;\s*charset=([^;,]*)/
    declared =
      response.headers['Content-Type'].to_s[parameter, 1] ||
      begin
        node = dom.xpath("//*/meta[@http-equiv='Content-Type']/@content").first
        node.value.to_s[parameter, 1] if node
      end ||
      begin
        node = dom.xpath("//*/meta/@charset").first
        node.value if node
      end
    declared.strip if declared
  end
end
#description ⇒ Object
162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 |
# File 'lib/href_preview/preview.rb', line 162
#
# A short description of the page taken from its meta tags.
#
# Only HTML responses (per `is_html?`) are inspected. The meta tags listed
# below are tried in order and the first one that supplies a value wins. The
# value is collected in a loop rather than with `description = a or b or c`,
# because that expression parses as `(description = a) or b or c` and would
# discard every fallback after the first.
#
# @return [String, nil] the description with non-breaking spaces turned into
#   plain spaces and surrounding whitespace removed, or nil when the response
#   is not HTML or no tag supplies a value
def description
  @description ||= begin
    if is_html?
      queries = [
        "//*/meta[@property='og:description']/@content",
        "//*/meta[@name='dc.description']/@content",
        "//*/meta[@itemprop='description']/@content",
        "//*/meta[@name='description']/@content",
        "//*/meta[@name='dcterms.abstract']/@content",
        # Unlikely to ever happen
        "//*/meta[@name='twitter:description']/@content",
        # Unlikely to ever happen
        "//*/meta[@name='sailthru.description']/@content"
      ]

      raw = nil
      queries.each do |query|
        node = dom.xpath(query).first
        raw = node.value if node
        break if raw
      end

      # String#strip does not remove U+00A0, so map it to a plain space first.
      # NOTE(review): the previous pattern replaced a plain space with a plain
      # space; presumably a non-breaking space was intended - confirm.
      raw.gsub(/[ \u00A0]/, ' ').strip if raw
    end
  end
end
#dom ⇒ Nokogiri::HTML::Document
The DOM for the response body.
111 112 113 |
# File 'lib/href_preview/preview.rb', line 111
#
# The DOM for the response body.
#
# `response.body` is parsed with `Nokogiri::HTML` on first use and the
# resulting document is cached in `@dom`.
#
# @return [Nokogiri::HTML::Document]
def dom
  return @dom if @dom

  @dom = Nokogiri::HTML(response.body)
end
#image_uri ⇒ Object
232 233 234 |
# File 'lib/href_preview/preview.rb', line 232
#
# URI of the first entry in `images`.
#
# @return [Addressable::URI, nil] the parsed `uri` of the first image, or nil
#   when `images` is empty
def image_uri
  return @image_uri if @image_uri

  lead_image = images.first
  @image_uri = lead_image ? Addressable::URI.parse(lead_image.uri) : nil
end
#images ⇒ Object
236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 |
# File 'lib/href_preview/preview.rb', line 236
#
# Images associated with the page.
#
# For HTML responses the URIs come from every `og:image` meta tag, followed by
# `thumbnailurl` itemprop meta tags directly under `article_node` (when one
# exists). For non-HTML responses whose MIME media type is `image`, the
# `canonical_uri` itself is used. Duplicate URIs are dropped before each
# remaining one is wrapped in a `FastImage` created with a 0.5 timeout.
#
# @return [Array<FastImage>] possibly empty
def images
  @images ||= begin
    collected = []

    # Appends the parsed URI of every attribute node that has a non-empty value.
    harvest = lambda do |attributes|
      attributes.each do |attribute|
        source = attribute && attribute.value
        collected << Addressable::URI.parse(source) if source && source != ''
      end
    end

    if is_html?
      harvest.call(dom.xpath("//*/meta[@property='og:image']/@content"))
      if article_node
        harvest.call(article_node.xpath("meta[@itemprop='thumbnailurl']/@content"))
      end
    elsif mime_type && mime_type.media_type == 'image'
      collected << canonical_uri
    end

    collected.uniq.map { |uri| FastImage.new(uri, :timeout => 0.5) }
  end
end
#inspect ⇒ Object
446 447 448 449 |
# File 'lib/href_preview/preview.rb', line 446
#
# Compact debugging representation: the class name, a hexadecimal address
# derived from `object_id << 1` (zero-padded to 14 digits), and the inspected
# `title`.
#
# @return [String]
def inspect
  hex_id = (object_id << 1).to_s(16).rjust(14, '0')
  "#<HRefPreview::Preview:0x#{hex_id} TITLE=#{title.inspect}>"
end
#is_html? ⇒ true, false
Returns true if the response had a 2xx HTTP code and the mime type is either HTML or XHTML.
99 100 101 102 103 104 |
# File 'lib/href_preview/preview.rb', line 99
#
# Returns true if the response had a 2xx HTTP code and the mime type is either
# HTML or XHTML.
#
# The result is coerced to a strict boolean so that it matches the documented
# return type; the bare `=~` comparison yields a match index or nil instead.
# `mime_type` is only consulted when the status is in the 2xx range.
#
# @return [true, false]
def is_html?
  return false unless (200...300).cover?(response.status)

  type = mime_type
  return false unless type

  !!(type.sub_type =~ /^x?html/)
end
#item_type ⇒ Object
261 262 263 264 265 266 267 268 269 270 271 |
# File 'lib/href_preview/preview.rb', line 261
#
# The kind of item the page represents.
#
# Uses the `og:type` meta content when present; otherwise reports 'article'
# when any element carries the schema.org NewsArticle `itemtype`. Only HTML
# responses (per `is_html?`) are inspected.
#
# @return [String, nil]
def item_type
  return @item_type if @item_type
  return unless is_html?

  og_type = dom.xpath("//*/meta[@property='og:type']/@content").first
  kind = og_type ? og_type.value : nil
  if !kind && dom.xpath("//*[@itemtype='http://schema.org/NewsArticle']").first
    kind = 'article'
  end
  @item_type = kind
end
#language ⇒ Object
76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 |
# File 'lib/href_preview/preview.rb', line 76
#
# The two-letter language code declared for the page.
#
# Sources are consulted in order and the first one that yields a value wins:
#   1. the `Content-Language` response header,
#   2. `<meta http-equiv="Content-Language">`,
#   3. `<meta name="dc.language">`.
#
# The fallbacks are gathered in a loop rather than with
# `language = a or b or c`, which parses as `(language = a) or b or c` and
# would discard the meta-tag results.
#
# @return [String, nil] the lower-cased leading two letters of the declared
#   value ('' when the value does not start with two letters), or nil when no
#   language is declared
def language
  @language ||= begin
    declared = response.headers['Content-Language']
    unless declared
      [
        "//*/meta[@http-equiv='Content-Language']/@content",
        "//*/meta[@name='dc.language']/@content"
      ].each do |query|
        node = dom.xpath(query).first
        declared = node.value if node
        break if declared
      end
    end

    if declared
      # Strip the irrelevant '-US' from 'en-US' if it appears. The match is
      # case-insensitive so that 'EN-US' also yields 'en'.
      declared[/^([a-z]{2})/i, 1].to_s.downcase
    end
  end
end
#mime_type ⇒ MIME::Type
Returns the MIME type declared in the HTTP headers or HTML meta tags.
40 41 42 43 44 45 46 47 48 49 50 51 52 |
# File 'lib/href_preview/preview.rb', line 40
#
# Returns the MIME type declared in the HTTP headers or HTML meta tags.
#
# The `Content-Type` response header is looked up in `MIME::Types` first. If
# that yields nothing, the content of `<meta http-equiv="Content-Type">` and
# then `<meta name="dc.format">` are looked up the same way.
#
# @return [MIME::Type, nil] the first matching type, or nil when none of the
#   sources resolves to a known type
def mime_type
  return @mime_type if @mime_type

  resolved = MIME::Types[response.headers['Content-Type']].first
  unless resolved
    [
      "//*/meta[@http-equiv='Content-Type']/@content",
      "//*/meta[@name='dc.format']/@content"
    ].each do |query|
      attribute = dom.xpath(query).first
      declared = attribute && attribute.value
      next unless declared
      resolved = MIME::Types[declared].first
      break if resolved
    end
  end
  @mime_type = resolved
end
#published ⇒ Object
396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 |
# File 'lib/href_preview/preview.rb', line 396
#
# The publication time declared by the page.
#
# Lookups run in the order listed below; the first meta tag with a non-empty
# content value is handed to `Time.parse`. The second lookup is relative to
# `article_node` and is skipped when there is no article node; all others
# search the whole document.
#
# @return [Time, nil] the parsed time, or nil when no tag supplies a value
def published
  @published ||= begin
    lookups = [
      [:dom, "//*/meta[@property='article:published_time']/@content"],
      [:article_node, "meta[@itemprop='datepublished']/@content"],
      [:dom, "//*/meta[@itemprop='datepublished']/@content"],
      [:dom, "//*/meta[@name='dcterms.created']/@content"],
      # Only a date, not a time, and not particularly specific,
      # so this is a fallback at best.
      [:dom, "//*/meta[@name='dc.date']/@content"]
    ]

    stamp = nil
    lookups.each do |scope, query|
      root = send(scope)
      next unless root
      attribute = root.xpath(query).first
      raw = attribute && attribute.value
      next if !raw || raw == ''
      stamp = Time.parse(raw)
      break if stamp
    end
    stamp
  end
end
#shortlink_uri ⇒ Object
219 220 221 222 223 224 225 226 227 228 229 230 |
# File 'lib/href_preview/preview.rb', line 219
#
# The short link advertised by the page, if any.
#
# Checks `<link rel="shortlink">` first, then an anchor directly inside an
# element whose class is exactly `story-short-url`. Only HTML responses (per
# `is_html?`) are inspected, and empty href values are skipped.
#
# @return [Addressable::URI, nil]
def shortlink_uri
  @shortlink_uri ||= begin
    short = nil
    if is_html?
      [
        "//*/link[@rel='shortlink']/@href",
        "//*[@class='story-short-url']/a/@href"
      ].each do |query|
        attribute = dom.xpath(query).first
        target = attribute && attribute.value
        next if !target || target == ''
        short = Addressable::URI.parse(target)
        break if short
      end
    end
    short
  end
end
#site_name ⇒ Object
273 274 275 276 277 278 279 280 281 282 283 284 |
# File 'lib/href_preview/preview.rb', line 273
#
# The name of the site that published the page.
#
# Reads the `og:site_name` meta content, falling back to `dc.publisher`.
# Only HTML responses (per `is_html?`) are inspected.
#
# @return [String, nil]
def site_name
  return @site_name if @site_name
  return unless is_html?

  og_attribute = dom.xpath("//*/meta[@property='og:site_name']/@content").first
  name = og_attribute ? og_attribute.value : nil
  unless name
    dc_attribute = dom.xpath("//*/meta[@name='dc.publisher']/@content").first
    name = dc_attribute.value if dc_attribute
  end
  @site_name = name
end
#title ⇒ String
117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 |
# File 'lib/href_preview/preview.rb', line 117
#
# The title of the page.
#
# Only HTML responses (per `is_html?`) are inspected. The lookups listed below
# are tried in order and the first one that supplies a value wins. They are
# walked in a loop rather than chained as `title = a or b or c`, because that
# expression parses as `(title = a) or b or c` and would discard every
# fallback after the first.
#
# The site name (see `site_name`) is removed from the start and end of the
# title together with adjacent whitespace, '|', '-' and ':' separators. The
# site name is passed through `Regexp.escape` so that names containing regex
# metacharacters are matched literally.
#
# @return [String, nil] the cleaned title, or nil when the response is not
#   HTML or no source supplies a value
def title
  @title ||= begin
    if is_html?
      # [receiver method, XPath, reader used on the matched node]
      lookups = [
        [:dom, "//*/meta[@property='og:title']/@content", :value],
        [:dom, "//*/meta[@name='dc.title']/@content", :value],
        [:article_node, "*[@itemprop='headline']", :text],
        [:dom, "//*/*[(self::h1 or self::h2) and @itemprop='headline']", :text],
        [:dom, "//*/head/title", :text],
        # Unlikely to ever happen
        [:dom, "//*/meta[@name='twitter:title']/@content", :value],
        # Unlikely to ever happen
        [:dom, "//*/meta[@name='sailthru.title']/@content", :value]
      ]

      raw = nil
      lookups.each do |scope, query, reader|
        root = send(scope)
        next unless root # there may be no article node
        node = root.xpath(query).first
        raw = node.send(reader) if node
        break if raw
      end

      if raw
        # With no site name the pattern degenerates to the separator class
        # alone, so leading/trailing separators are still trimmed.
        site = Regexp.escape(site_name.to_s)
        # String#strip does not remove U+00A0, so map it to a plain space.
        # NOTE(review): the previous pattern replaced a plain space with a
        # plain space; presumably a non-breaking space was intended - confirm.
        raw.gsub(/[ \u00A0]/, ' ').
          gsub(/^#{site}[\s\|\-\:]*/, '').
          gsub(/[\s\|\-\:]*#{site}$/, '').
          strip
      end
    end
  end
end
#twitter ⇒ String
288 289 290 291 292 293 294 295 |
# File 'lib/href_preview/preview.rb', line 288
#
# The Twitter handle used by the site.
#
# Read from the `twitter:site` meta content and accepted only when the value
# matches `/^@/`. Only HTML responses (per `is_html?`) are inspected.
#
# @return [String, nil]
def twitter
  return @twitter if @twitter
  return unless is_html?

  attribute = dom.xpath("//*/meta[@name='twitter:site']/@content").first
  handle = attribute && attribute.value
  @twitter = (handle && handle =~ /^@/) ? handle : nil
end
#updated ⇒ Object
424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 |
# File 'lib/href_preview/preview.rb', line 424
#
# The last-modified time declared by the page.
#
# Lookups run in the order listed below; the first meta tag with a non-empty
# content value is handed to `Time.parse`. The second lookup is relative to
# `article_node` and is skipped when there is no article node; all others
# search the whole document.
#
# The document-wide `datemodified` lookup uses the `//*/` prefix like every
# other document-wide query: a bare `meta[...]` path evaluated from the
# document root only considers the root's direct children and so cannot find
# a `<meta>` element nested in the page.
#
# @return [Time, nil] the parsed time, or nil when no tag supplies a value
def updated
  @updated ||= begin
    lookups = [
      [:dom, "//*/meta[@property='article:modified_time']/@content"],
      [:article_node, "meta[@itemprop='datemodified']/@content"],
      [:dom, "//*/meta[@itemprop='datemodified']/@content"],
      [:dom, "//*/meta[@name='dcterms.modified']/@content"]
    ]

    stamp = nil
    lookups.each do |scope, query|
      root = send(scope)
      next unless root
      attribute = root.xpath(query).first
      raw = attribute && attribute.value
      next if !raw || raw == ''
      stamp = Time.parse(raw)
      break if stamp
    end
    stamp
  end
end