Class: Tango::App

Inherits:
Object
  • Object
show all
Defined in:
lib/tango/app.rb

Overview

Tango application

Author:

  • Mckomo

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(config: {}, link_stack: nil, dispatcher: nil, cache: nil, http_client: nil, parser: nil, db_locker: nil, logger: nil) ⇒ Tango::App



20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
# File 'lib/tango/app.rb', line 20

# Build a Tango application, wiring in every collaborator.
#
# Each dependency may be injected; any that is omitted (nil or false) falls
# back to the default constructed below.
#
# @param config      [Hash] application settings; 'target_url' is read here
#   to seed the default link stack
# @param link_stack  [Object, nil] stack of links still to be crawled
# @param dispatcher  [Object, nil] dispatcher for the ETL process
# @param cache       [Object, nil] resource cache
# @param http_client [Object, nil] HTTP client
# @param parser      [Object, nil] document parser
# @param db_locker   [Object, nil] database locker
# @param logger      [Object, nil] logger
def initialize(
  config: {},
  link_stack: nil,
  dispatcher: nil,
  cache: nil,
  http_client: nil,
  parser: nil,
  db_locker: nil,
  logger: nil
)
  # Registries, filled later through register_model / register_operator
  @models = {}
  @operators = {}

  # Application settings
  @config = config

  # Collaborators: take the injected object, otherwise build the default
  @link_stack = link_stack || LinkStack.new(config['target_url'])
  @dispatcher = dispatcher || ETL::Dispatcher.new
  @cache = cache || Resource::Cache.new(Resource::Buffer.new)
  @http_client = http_client || HTTParty
  @parser = parser || Nokogiri::HTML
  @db_locker = db_locker || DatabaseLocker.new(Multidb.databases)
  @logger = logger || Logger.new(STDOUT)
end

Instance Attribute Details

#config ⇒ Object (readonly)

Returns the value of attribute config.



10
11
12
# File 'lib/tango/app.rb', line 10

# Application configuration as stored by the constructor.
#
# @return [Object] the value of @config (a Hash by default; the app reads
#   string keys such as "sleep" from it)
def config
  @config
end

#dispatcher ⇒ Object (readonly)

Returns the value of attribute dispatcher.



10
11
12
# File 'lib/tango/app.rb', line 10

# Dispatcher that #run asks (via find_handler) for the handler of each link.
#
# @return [Object] the value of @dispatcher
def dispatcher
  @dispatcher
end

#link_stack ⇒ Object (readonly)

Returns the value of attribute link_stack.



10
11
12
# File 'lib/tango/app.rb', line 10

# Stack of links that #run shifts from and appends newly found links to.
#
# @return [Object] the value of @link_stack
def link_stack
  @link_stack
end

#logger ⇒ Object (readonly)

Returns the value of attribute logger.



10
11
12
# File 'lib/tango/app.rb', line 10

# Logger the application writes its progress and error messages to.
#
# @return [Object] the value of @logger
def logger
  @logger
end

Instance Method Details

#after ⇒ Object

Filter run after Tango execution



45
46
# File 'lib/tango/app.rb', line 45

# Hook invoked by #run once crawling has finished and buffers are released.
# Intentionally empty here.
#
# @return [nil]
def after; end

#before ⇒ Object

Filter run before Tango execution



41
42
# File 'lib/tango/app.rb', line 41

# Hook invoked by #run after the cache is loaded and before crawling starts.
# Intentionally empty here.
#
# @return [nil]
def before; end

#register_model(symbol, model) ⇒ Object

Register new resource model



52
53
54
55
56
57
58
59
60
61
# File 'lib/tango/app.rb', line 52

# Register a resource model under the given key.
#
# A model that reports itself as non-persistent has its table truncated
# on registration.
#
# @param symbol [Object] key the model is stored under in @models
# @param model  [#persistent?, #table_name] model to register
# @return [Object, nil] result of the TRUNCATE statement, or nil when the
#   model is persistent
def register_model(symbol, model)
  @models[symbol] = model

  # Persistent models keep their rows
  return if model.persistent?

  # Quote the identifier through the adapter so table names that are reserved
  # words or contain unusual characters still yield a valid statement.
  connection = ActiveRecord::Base.connection
  connection.execute("TRUNCATE #{connection.quote_table_name(model.table_name)}")
end

#register_operator(symbol, operator) ⇒ Object

Register new resource operator



67
68
69
70
71
72
73
74
75
76
# File 'lib/tango/app.rb', line 67

# Register a resource operator under the given key.
#
# The operator is stored in @operators and hooked into the resource cache:
# the block handed to the cache forwards each resource to operator.load.
#
# @param symbol   [Object] key the operator is stored and registered under
# @param operator [#load] operator to register
# @return [Object] whatever @cache.register returns
def register_operator(symbol, operator)
  @operators[symbol] = operator

  # Let the cache hand resources for this key over to the operator
  @cache.register(symbol) { |resource| operator.load(resource) }
end

#run ⇒ Integer

Run ETL process



87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
# File 'lib/tango/app.rb', line 87

# Run the ETL process.
#
# Switches to the currently unlocked database, loads the cache, runs the
# `before` hook, then crawls every link on the link stack: each link is
# matched to a handler, downloaded, parsed and handed to that handler.
# Afterwards the buffers are released, the `after` hook runs, the database
# used for this run is locked and the logger is closed.
#
# Per-link problems (no handler, download error, unexpected response code,
# handler exception) are logged and the crawl continues with the next link.
# The politeness sleep (@config["sleep"] seconds, 0 when unset) happens only
# after a handler has been triggered, not for links that were skipped.
#
# NOTE(review): the generated API docs list an Integer return type, but the
# last expression is `@logger.close` - confirm what callers expect.
#
# @return [Object] result of closing the logger
def run
  # Save beginning time
  start_time = Time.now

  @logger.info "Running Tango v.#{Tango::VERSION} ..."
  @logger.info "Target: #{@link_stack.host}."

  # Resolve the unlocked database once, so the database we switch to, report
  # and finally lock is guaranteed to be the same one.
  database = @db_locker.unlocked
  Multidb.use(database)
  @logger.info "Using database '#{database}'."

  # Load the resource cache
  @logger.info "Loading cache ..."
  load_cache

  # Run before filter
  @logger.info "Running before callback ..."
  before

  # Init counter of successfully crawled links
  links_counter = 0
  @logger.info "Tango starts crawling ..."

  # Start crawling website
  while @link_stack.has_links?
    # Get a link from the stack
    link = @link_stack.shift

    # Skip iteration if no handler found
    handler_klass = @dispatcher.find_handler(link)
    unless handler_klass
      @logger.error "No handler for link: #{link}."
      next
    end

    # Try to get contents of the link
    url = @link_stack.host + link
    begin
      response = @http_client.get(url)
    rescue StandardError => e
      @logger.error "Could not download contents of #{url} link."
      @logger.error e.message
      next
    end

    # Continue only when response has code 200 or 201
    unless [200, 201].include?(response.code)
      @logger.error "Response code for link #{link} is #{response.code}. Only codes 200 and 201 are accepted."
      next
    end

    # Parse response contents and init handler
    document = @parser.parse(response.body)
    handler = handler_klass.new(link, document, @cache)

    # Append links fetched from handler
    @link_stack.append(handler.links)

    # Try to fire the handler
    begin
      handler.trigger
    rescue StandardError => e
      @logger.error "Link: #{link}. Handler had some troubles."
      @logger.error e.message
      # backtrace is nil for exceptions that were never raised
      @logger.error Array(e.backtrace).join("\n")
    else
      links_counter += 1
      @logger.debug "Link: #{link}. Handler triggered successfully."
    end

    # Sleep to give crawled server time to breathe
    sleep(@config["sleep"] || 0)
  end

  # Release buffers
  @logger.info "Releasing buffers ..."
  @cache.buffer.release_all

  # Run after filter
  @logger.info "Running after callback ..."
  after

  # Lock database used in this Tango iteration
  @db_locker.lock(database)

  # Get time of script execution ending
  end_time = Time.now

  @logger.info "Tango crawled #{links_counter}/#{@link_stack.shifted} links successfully."
  @logger.info "Start time: #{start_time}, end time: #{end_time}, time elapsed: #{end_time - start_time} seconds."

  # Close logger
  @logger.close
end