Module: Spider::ActiveRecordMethods::ClassMethods

Defined in:
lib/spider/active_record_methods.rb

Instance Method Summary collapse

Instance Method Details

#create_from_url(url, force = false) ⇒ Object

从 url 中创建对象。需要自身实现了 receive_spider_page 方法才可以正确调用。



14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
# File 'lib/spider/active_record_methods.rb', line 14

# Builds an object for the receiver by scraping +url+ with the first
# matching Spider::Page rule that yields a valid result.
#
# Per the original documentation, the receiver must implement
# +receive_spider_page+ for this call to work correctly (that method is
# not invoked directly here; presumably +publish_to+ relies on it).
#
# Each page class returned by Spider::Page.find_all_by_url is tried in
# order: it is instantiated with +url+, asked to +publish_to+ the receiver,
# and the first element of the result is taken as the candidate object.
# The first candidate answering truthy to +valid?+ wins; its spider page is
# saved and the remaining rules are skipped.
#
# @param url [String] the address to scrape
# @param force [Boolean] when truthy, skip the "url already known" check
# @return [Object] the first valid object produced by a page rule
# @raise [RuntimeError] if the url is already recorded (and +force+ is
#   falsy), if no page rule matches the url, or if every matching rule
#   fails or produces an invalid object
def create_from_url(url, force = false)
  if !force && ::SpiderPage.find_by_url(url)
    raise "此URL已经存在于系统中了。"
  end

  # Look up the page classes whose rules can handle this url.
  pages = Spider::Page.find_all_by_url(url)
  raise "没有找到合适的规则" if pages.empty?

  Spider::Site.logger.info "采集单个#{human_name} #{url},找到 适合的规则 #{pages.inspect}"

  object = nil
  pages.each do |page|
    page.logger.info "使用 #{page} 的规则来尝试采集"

    begin
      spider_page = page.new(url)
      results = spider_page.publish_to(self)
      object = results.first
    rescue StandardError => e
      # Only ordinary errors count as "this rule failed, try the next one".
      # Interrupt, SystemExit, NoMemoryError etc. must propagate so the
      # process can still be stopped while a scrape is in progress.
      logger.error e.message
      logger.error e.backtrace.join("\n")
      object = nil
    end

    if object.try(:valid?)
      page.logger.info "采集成功"
      # Persist the spider page so the url is recorded.
      spider_page.save
      break
    else
      page.logger.info "采集失败: #{object.try(:errors).try(:full_messages).try(:first)}"
      # Drop the invalid candidate so it cannot leak out as the result.
      object = nil
    end
  end

  raise "采集器了找到了规则 #{pages.inspect}, 但是都失败了." unless object

  object
end