Class: WeiDiskCrawler

Inherits:
Object
  • Object
show all
Defined in:
lib/version.rb,
lib/wcrawler/wei_disk_crawler.rb

Constant Summary collapse

VERSION =
"0.1"
WEI_DISK_PREFIX =
"http://vdisk.weibo.com/u/"

Instance Method Summary collapse

Constructor Details

#initialize(params = {}) ⇒ WeiDiskCrawler

Returns a new instance of WeiDiskCrawler.



10
11
12
13
14
15
# File 'lib/wcrawler/wei_disk_crawler.rb', line 10

# Builds a crawler bound to a single Wei Disk user.
#
# @param params [Hash] construction options; only +:uid+ is read here
# @raise [RuntimeError] with "Missing User ID" when +params[:uid]+ is nil
def initialize(params = {})
  uid = params[:uid]
  raise "Missing User ID" if uid.nil?

  @uid = uid
  # Filled in by the listing methods as pages are crawled.
  @resources = {}
  # 0 is the "not fetched yet" marker that max_page_number checks for.
  @page_total = 0
end

Instance Method Details

#list_all_resources ⇒ Object



26
27
28
29
30
31
# File 'lib/wcrawler/wei_disk_crawler.rb', line 26

# Crawls every listing page, from page 1 up to max_page_number, and
# accumulates what each page yields into @resources.
#
# @return [Hash] the accumulated @resources after the last page
def list_all_resources
  1.upto(max_page_number) do |page|
    # Each page's result is merged in and the merged hash becomes @resources.
    @resources = @resources.merge(list_resources_on_page(page))
  end

  @resources
end

#list_resources_on_page(page_number) ⇒ Object



33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
# File 'lib/wcrawler/wei_disk_crawler.rb', line 33

# Crawls one listing page of the user's Wei Disk and records every resource
# found on it in @resources, keyed by resource name with the first entry of
# the resource's "download_list" as the value.
#
# The work happens in two sequential phases on the same hydra: first the
# listing page is fetched, then one fileopsStatCount request per resource is
# queued and run in parallel. A resource whose response is not a JSON object
# carrying a "download_list" is reported on stderr and skipped, so a single
# bad response no longer aborts the whole page.
#
# @param page_number [Integer] value placed in the "?page=" query parameter
# @return [Hash] @resources, including entries recorded by earlier calls
def list_resources_on_page(page_number)
  hydra = Typhoeus::Hydra.new

  # Phase 1: fetch the listing page and let it finish before queueing anything
  # else, so hydra.run is never re-entered from inside an on_complete callback.
  page_request = Typhoeus::Request.new("#{WEI_DISK_PREFIX}#{@uid}?page=#{page_number}")
  hydra.queue page_request
  hydra.run

  page_doc = Nokogiri::HTML(page_request.response.response_body)
  resource_urls = page_doc.css('td.sort_name_m div.sort_name_pic a').map { |link| link.attr('href') }
  # The resource id is the last path segment of the link; anchors without an
  # href (or with an empty one) yield no id and are ignored.
  resource_ids = resource_urls.compact.map { |url| url.split('/').last }.compact

  # Phase 2: query the stat-count endpoint for every resource in parallel.
  details_by_id = {}
  resource_ids.each do |resource_id|
    # Epoch milliseconds for the "_" query parameter (presumably a cache-buster).
    now_timestamp = (Time.now.to_f * 1000).to_i
    request = Typhoeus::Request.new(
      "http://vdisk.weibo.com/api/weipan/fileopsStatCount?link=#{resource_id}&ops=download&_=#{now_timestamp}",
      :method => :get,
      headers: {
        :Accept => "application/json, text/javascript, */*; q=0.01",
        :Referer => "http://vdisk.weibo.com/s/#{resource_id}",
        :'X-Requested-With' => "XMLHttpRequest",
        :'Connection' => "keep-alive",
        :'x-response-version' => "2"
      }
    )

    request.on_complete do |ajax_response|
      # Parse once here; error pages and empty bodies are not valid JSON.
      details = begin
        JSON.parse(ajax_response.response_body.to_s)
      rescue JSON::ParserError
        nil
      end

      if details.is_a?(Hash) && details['download_list'].respond_to?(:first)
        details_by_id[resource_id] = details
        p "#{details['name']} has been parsed successfully."
      else
        warn "Skipping resource #{resource_id}: unexpected response from fileopsStatCount."
      end
    end

    hydra.queue request
  end
  hydra.run

  # Record in page order rather than completion order so the hash layout is
  # stable from run to run.
  resource_ids.each do |resource_id|
    details = details_by_id[resource_id]
    next if details.nil?

    @resources[details['name']] = details['download_list'].first
  end

  p "#" * 50
  p "Page #{page_number} has been parsed successfully."
  p "#" * 50
  @resources
end

#max_page_number ⇒ Object



17
18
19
20
21
22
23
24
# File 'lib/wcrawler/wei_disk_crawler.rb', line 17

# Returns how many listing pages the user's Wei Disk has, fetching the first
# page once and caching the answer in @page_total.
#
# A page without a pager counts as a single page. When a pager is present,
# the text of its second-to-last link is read as the last page number; if
# that text is not a positive number the result falls back to 1, so 0 (the
# "not fetched yet" marker) is never cached and the crawl never silently
# covers zero pages.
#
# @return [Integer] the page count, always at least 1 once fetched
def max_page_number
  return @page_total unless @page_total.eql? 0

  request = Typhoeus::Request.new("#{WEI_DISK_PREFIX}#{@uid}")
  request.run
  page_doc = Nokogiri::HTML(request.response.response_body)
  return @page_total = 1 if page_doc.css('.vd_page_main .vd_page').empty?

  # String#to_i yields 0 for non-numeric text; clamp so the cache stays valid.
  last_page = page_doc.css('.vd_page a:nth-last-child(2)').text.to_i
  @page_total = [last_page, 1].max
end

#write_resources_to_file(file_name) ⇒ Object



82
83
84
# File 'lib/wcrawler/wei_disk_crawler.rb', line 82

# Crawls all resources and appends them to downloads/<file_name>, relative
# to the current working directory.
#
# The hash is handed to IO#write as-is, so the file receives its Hash#to_s
# representation. The downloads directory is created when missing.
#
# @param file_name [String] name of the file inside the downloads directory
# @return [Integer] number of bytes written, as returned by IO#write
def write_resources_to_file(file_name)
  # Crawl before touching the file system: if the crawl raises, no empty
  # file is left behind.
  resources = list_all_resources

  # Opening in append mode creates the file but not its parent directory.
  Dir.mkdir('downloads') unless File.directory?('downloads')
  File.open("downloads/#{file_name}", 'a') { |file| file.write(resources) }
end