Class: Newrank

Inherits:

Object

Object
Newrank

show all

Defined in: lib/newrank.rb

Instance Method Summary collapse

#_md5(str) ⇒ Object

Uses the JS MD5 algorithm written by Newrank; the script lives in assets/newrank_md5.js.
#crawl(newrank_id) ⇒ Object

crawl newrank info.
#document(newrank_account) ⇒ Object

get Nokogiri Document.
#fetch_post(uuid) ⇒ Object

crawl posts.
#gen_nonce ⇒ Object

generate parameter nonce.
#gen_xyz(nonce, uuid) ⇒ Object

generate parameter xyz.
#js_context ⇒ Object

js context.
#score_and_uuid(doc) ⇒ Object

find score and uuid.
#wait_for_seconds ⇒ Object

wait a few seconds between requests to avoid sending too many.
#week_data(doc) ⇒ Object

crawl week data.

Instance Method Details

#_md5(str) ⇒ `Object`

Uses the JS MD5 algorithm written by Newrank; the script lives in assets/newrank_md5.js



140
141
142

# File 'lib/newrank.rb', line 140

# Hashes +str+ by invoking the JavaScript function `newrank_md5` inside the
# context returned by #js_context.
#
# @param str [String] text to hash
# @return [Object] whatever the JavaScript call returns
def _md5(str)
  context = js_context
  context.call('newrank_md5', str, bare: true)
end

#crawl(newrank_id) ⇒ `Object`

crawl newrank info

# File 'lib/newrank.rb', line 10

# Collects everything known about one Newrank account.
#
# Non-breaking spaces (U+00A0) are removed from +newrank_id+ before the detail
# page is requested through #document. When no document is available a hash
# of empty defaults is returned.
#
# @param newrank_id [String] account identifier
# @return [Hash] with keys :active_users_count, :score, :introduce,
#   :week_data and :posts_data
def crawl(newrank_id)
  page = document(newrank_id.gsub("\u{a0}", ""))

  if page.nil?
    return {
      active_users_count: 0,
      score: 0,
      introduce: "",
      week_data: [],
      posts_data: {}
    }
  end

  score, uuid = score_and_uuid(page)

  # Fan count is rendered with thousands separators, e.g. "1,234".
  fans_node = page.css(".detail-fans-counts")[0]
  fans = fans_node ? fans_node.text.gsub(",", "").to_i : 0

  intro_node = page.css(".info-detail-head-weixin-fun-introduce")[0]
  intro = intro_node ? intro_node.text : ""

  weekly = week_data(page)

  # Posts can only be requested when the page exposed a uuid.
  posts = uuid.nil? ? nil : fetch_post(uuid)

  {
    active_users_count: fans,
    score: (score || 0),
    introduce: intro,
    week_data: weekly,
    posts_data: (posts || {})
  }
end

#document(newrank_account) ⇒ `Object`

get Nokogiri Document

# File 'lib/newrank.rb', line 70

# Downloads the Newrank detail page for an account and parses it with Nokogiri.
#
# The request is preceded by #wait_for_seconds. The account value is
# form-encoded before being placed in the query string, and the page is
# opened through URI#open (open-uri) because Kernel#open no longer accepts
# URLs on Ruby 3.0+. The block form closes the downloaded IO once parsing
# is done.
#
# @param newrank_account [String] account identifier used as the `account`
#   query parameter
# @return [Nokogiri::HTML::Document] parsed page (UTF-8)
# @raise [OpenURI::HTTPError, StandardError] network and HTTP failures from
#   open-uri are not rescued here
def document(newrank_account)
  wait_for_seconds
  url = 'http://www.newrank.cn/public/info/detail.html?account=' +
        URI.encode_www_form_component(newrank_account)
  options = {
    "User-Agent" => "Mozilla/5.0 (Windows NT 6.2; rv:10.0.1) Gecko/20100101 Firefox/10.0.1",
    :read_timeout => 10
  }
  URI.parse(url).open(options) { |io| Nokogiri::HTML(io, nil, 'utf-8') }
end

#fetch_post(uuid) ⇒ `Object`

crawl posts

# File 'lib/newrank.rb', line 44

# Requests the article list of an account from Newrank's JSON endpoint.
#
# A fresh nonce and its matching `xyz` signature are generated first, then
# the request is delayed by #wait_for_seconds before being posted.
#
# @param uuid [String] account uuid as found on the detail page
# @return [Object] the parsed JSON response body
def fetch_post(uuid)
  nonce = gen_nonce
  signature = gen_xyz(nonce, uuid)

  wait_for_seconds

  endpoint = "http://www.newrank.cn/xdnphb/detail/getAccountArticle"
  form = {uuid: uuid, nonce: nonce, xyz: signature, flag: true}
  headers = {"User-Agent" => "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36"}

  JSON.parse(RestClient.post(endpoint, form, headers))
end

#gen_nonce ⇒ `Object`

generate parameter nonce

# File 'lib/newrank.rb', line 117

# Generates the `nonce` request parameter: nine random lowercase
# hexadecimal digits.
#
# The string is built once. Only one value is ever returned, so generating
# and discarding extra candidates has no effect on the result.
#
# @return [String] nine characters from 0-9a-f
def gen_nonce
  hex_digits = %w[0 1 2 3 4 5 6 7 8 9 a b c d e f]
  Array.new(9) { hex_digits[rand(16)] }.join
end

#gen_xyz(nonce, uuid) ⇒ `Object`

generate parameter xyz

# File 'lib/newrank.rb', line 134

# Generates the `xyz` request parameter: the #_md5 digest of the article
# endpoint path together with its query string (AppKey, flag, uuid, nonce).
#
# @param nonce [String] value produced by #gen_nonce
# @param uuid [String] account uuid
# @return [Object] the value returned by #_md5
def gen_xyz(nonce, uuid)
  _md5("/xdnphb/detail/getAccountArticle?AppKey=joker&flag=true&uuid=#{uuid}&nonce=#{nonce}")
end

#js_context ⇒ `Object`

js context

# File 'lib/newrank.rb', line 145

# Returns the ExecJS context compiled from assets/newrank_md5.js (resolved
# relative to this file). The context is compiled on first use and then
# memoised in @context.
#
# @return [Object] the value returned by ExecJS.compile
def js_context
  @context ||= begin
    script_path = File.join(File.dirname(__FILE__), 'assets/newrank_md5.js')
    ExecJS.compile(File.read(script_path))
  end
end

#score_and_uuid(doc) ⇒ `Object`

find score and uuid

# File 'lib/newrank.rb', line 77

# Extracts the account score and uuid from the first inline JavaScript block
# of the detail page.
#
# The script is parsed with RKelly. The score is read from the
# "new_rank_index_mark" key of the first element of the first array literal;
# the uuid is the "uuid" property of the variable declaration named
# "fgkcdg".
#
# Missing pieces no longer raise: without an array literal the score is 0.0,
# and without the "fgkcdg" declaration or its "uuid" property the uuid is
# nil (rather than a placeholder string), so callers can test it with `nil?`.
#
# @param doc [#css] parsed page (a Nokogiri document)
# @return [Array(Float, String), Array(nil, nil)] score and uuid; both are nil
#   when the page has no `script[type='text/javascript']` element
def score_and_uuid(doc)
  script = doc.css("script[type='text/javascript']")[0]
  return nil, nil if script.nil?

  ast = RKelly::Parser.new.parse(script.text.strip)

  # Find the first array node.
  array_node = ast.pointcut(RKelly::Nodes::ArrayNode).matches.first

  # Find the first element node inside that array and read the score from it.
  element_node = array_node && array_node.pointcut(RKelly::Nodes::ElementNode).matches.first
  json_data = element_node.nil? ? {} : JSON.parse(element_node.to_ecma)
  raw_score = json_data["new_rank_index_mark"]
  score = raw_score ? raw_score.to_f : 0.0

  # Find the declaration that carries the uuid.
  uuid = nil
  object_node = ast.pointcut(RKelly::Nodes::VarDeclNode).matches.find { |node| node.name == "fgkcdg" }
  if object_node
    property = object_node.pointcut(RKelly::Nodes::PropertyNode).matches.find { |n| n.name == '"uuid"' }
    # The literal still carries its surrounding quotes; strip them.
    uuid = property.value.value[1..-2] if property
  end

  [score, uuid]
end

#wait_for_seconds ⇒ `Object`

wait a few seconds between requests to avoid sending too many



112
113
114

# File 'lib/newrank.rb', line 112

# Pauses for a random duration of at least one and less than two seconds,
# used to space out consecutive requests.
#
# @return [Object] the value returned by Kernel#sleep
def wait_for_seconds
  pause = 1 * rand + 1
  sleep(pause)
end

#week_data(doc) ⇒ `Object`

crawl week data

# File 'lib/newrank.rb', line 54

# Extracts the weekly data records embedded in the page's first <script>.
#
# The script is parsed with RKelly; every element of the first array literal
# is converted back to source text and parsed as JSON. An empty array is
# returned when the page has no script element or the script contains no
# array literal.
#
# @param doc [#css] parsed page (a Nokogiri document)
# @return [Array] one parsed JSON value per array element
def week_data(doc)
  script = doc.css("script")[0]
  return [] if script.nil?

  ast = RKelly::Parser.new.parse(script.text.strip)
  array_node = ast.pointcut(RKelly::Nodes::ArrayNode).matches.first
  # A script without any array literal carries no weekly data.
  return [] if array_node.nil?

  array_node.pointcut(RKelly::Nodes::ElementNode).matches.map do |element_node|
    JSON.parse(element_node.to_ecma)
  end
end

Class: Newrank

Instance Method Summary collapse

Instance Method Details

#_md5(str) ⇒ Object

#crawl(newrank_id) ⇒ Object

#document(newrank_account) ⇒ Object

#fetch_post(uuid) ⇒ Object

#gen_nonce ⇒ Object

#gen_xyz(nonce, uuid) ⇒ Object

#js_context ⇒ Object

#score_and_uuid(doc) ⇒ Object

#wait_for_seconds ⇒ Object

#week_data(doc) ⇒ Object