Class: Indexable::Middleware

Inherits:
Object
  • Object
show all
Defined in:
lib/indexable/indexable.rb

Constant Summary collapse

# User-Agent patterns that request_from_crawler? treats as crawlers.
# The first five match specific agents by name; the last matches any agent
# string carrying a parenthesised URL (http://, https:// or http(s)://).
# Frozen so the shared list cannot be mutated at runtime.
CRAWLER_USER_AGENTS =
[
  /^Twitterbot/, /^curl/, /Googlebot/, /Mediapartners/, /Adsbot-Google/,
  /\(.*http(s|\(s\))?:\/\/.*\)/
].freeze

Instance Method Summary collapse

Constructor Details

#initialize(app) ⇒ Middleware

Returns a new instance of Middleware.



11
12
13
# File 'lib/indexable/indexable.rb', line 11

# Stores the wrapped application so that #call can delegate to it.
#
# @param app the downstream object; #call invokes +app.call(env)+ on it
def initialize(app)
  @app = app
end

Instance Method Details

#call(env) ⇒ Object



26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
# File 'lib/indexable/indexable.rb', line 26

# Delegates to the wrapped app and, for successful HTML responses requested by
# a crawler, replaces the body with the output of the PhantomJS render script.
#
# The upstream HTML is written to a temporary file which is handed, together
# with the request URL, to +render_page.js+ (located next to this file).
#
# @param env [Hash] the Rack environment of the current request
# @return [Array] the +[status, headers, body]+ triple; untouched unless the
#   response is a 200 +text/html+ response to a crawler request
def call(env)
  status, headers, content = *@app.call(env)

  # A 200 response may carry no Content-Type at all; +to_s+ keeps the check
  # from raising NoMethodError on nil.
  html_response = status == 200 && headers['Content-Type'].to_s =~ /^text\/html/
  return [status, headers, content] unless html_response && request_from_crawler?(env)

  html =
    if content.respond_to?(:body)
      content.body
    elsif content.respond_to?(:join)
      content.join('')
    else
      # Bodies are only guaranteed to respond to #each.
      parts = []
      content.each { |part| parts << part }
      parts.join('')
    end
  # The upstream body is being discarded, so release whatever it holds.
  content.close if content.respond_to?(:close)

  script = "#{::File.dirname(__FILE__)}/render_page.js"
  file = Tempfile.new(['indexable', '.html'])
  begin
    file.write(html)
    file.close
    url = Rack::Request.new(env).url
    rendered = Phantomjs.new(script, file.path, url).run
  ensure
    # Runs even when writing or rendering raises, so the file never lingers.
    file.close unless file.closed?
    file.unlink
  end

  # The render script reports failure through this sentinel string.
  status = 500 if rendered == "Couldn't render page... orz."

  # The upstream length describes the body that was just replaced.
  if headers['Content-Length'] && rendered.respond_to?(:bytesize)
    headers['Content-Length'] = rendered.bytesize.to_s
  end

  [status, headers, [rendered]]
end

#request_from_crawler?(env) ⇒ Boolean

Detect whether the current request comes from a bot. Based on the logic used by Bustle.com (www.dropbox.com/s/s4oibqsxqpo3hll/bustle%20slizzle.pdf)

Returns:

  • (Boolean)


17
18
19
20
21
22
23
24
# File 'lib/indexable/indexable.rb', line 17

# Decides whether the request described by +env+ should be treated as coming
# from a crawler.
#
# A request without a User-Agent is never a crawler request. Otherwise it
# qualifies when the agent matches one of CRAWLER_USER_AGENTS, when the
# +_escaped_fragment_+ parameter is present, or when +nojs+ equals "true".
#
# @param env [Hash] the Rack environment of the current request
# @return [Boolean]
def request_from_crawler?(env)
  agent = env["HTTP_USER_AGENT"]
  query = Rack::Request.new(env).params
  return false unless agent

  known_crawler = CRAWLER_USER_AGENTS.any? { |pattern| agent.match(pattern) }
  known_crawler || query.has_key?('_escaped_fragment_') || query['nojs'].eql?('true')
end