Class: Retriever::Fetch

Inherits:
Object
  • Object
show all
Defined in:
lib/retriever/fetch.rb

Direct Known Subclasses

FetchFiles, FetchSEO, FetchSitemap, PageIterator

Constant Summary collapse

HR =
'###############################'

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(url, options) ⇒ Fetch

Given a target URL and RR options, creates a fetch object. There is no direct output; this is a parent class that the other fetch classes build on.



17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
# File 'lib/retriever/fetch.rb', line 17

# Builds the crawl state shared by the fetch subclasses.
#
# @param url target URL, handed to Retriever::Target
# @param options RR options, passed straight through to setup_options
def initialize(url, options)
  @result = []
  @iterator = false
  # Per-outcome request counters, all starting at zero.
  @connection_tally = { success: 0, error: 0, error_client: 0, error_server: 0 }
  # setup_options runs before anything that reads @progress, @file_re,
  # @fileharvest or @output -- presumably it assigns them; confirm there.
  setup_options(options)
  setup_progress_bar if @progress
  @t = Retriever::Target.new(url, @file_re)
  # File harvests with no explicit output name default to "rr-" plus the
  # second dot-separated label of the target host.
  @output ||= "rr-#{@t.host.split('.')[1]}" if @fileharvest
  @already_crawled = setup_bloom_filter
end

Instance Attribute Details

#max_pagesObject (readonly)

Returns the value of attribute max_pages.



13
14
15
# File 'lib/retriever/fetch.rb', line 13

# Read-only accessor.
#
# @return the value currently stored in @max_pages
def max_pages
  @max_pages
end

#resultObject (readonly)

Returns the value of attribute result.



13
14
15
# File 'lib/retriever/fetch.rb', line 13

# Read-only accessor for the collected results.
#
# @return the value stored in @result (initialized to an empty Array by
#   the constructor)
def result
  @result
end

#tObject (readonly)

Returns the value of attribute t.



13
14
15
# File 'lib/retriever/fetch.rb', line 13

# Read-only accessor for the crawl target.
#
# @return the value stored in @t (the constructor assigns a
#   Retriever::Target built from the URL)
def t
  @t
end

Instance Method Details

#dumpObject

prints current data collection to STDOUT



48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
# File 'lib/retriever/fetch.rb', line 48

# Prints the current data collection to STDOUT: a banner, the connection
# tally (verbose mode only), the target URL, a label for the active mode,
# the object count, and then each collected entry on its own line.
#
# @return [nil]
def dump
  puts HR
  puts "Connection Tally:\n#{@connection_tally}\n#{HR}" if @verbose
  puts "Target URL: #{@t.target}"

  # Only the first active mode is labelled; no label is printed when none is set.
  mode_label =
    if @sitemap
      'Sitemap'
    elsif @fileharvest
      "File harvest by type: #{@fileharvest}"
    elsif @seo
      'SEO Metrics'
    end
  puts mode_label if mode_label

  puts "Data Dump -- Object Count: #{@result.size}"
  puts HR
  @result.each { |entry| puts entry }
  puts
end

#errlog(msg) ⇒ Object



39
40
41
# File 'lib/retriever/fetch.rb', line 39

# Aborts by raising a RuntimeError whose message is msg prefixed with "ERROR: ".
#
# @param msg text to embed in the exception message
# @raise [RuntimeError] always
def errlog(msg)
  raise RuntimeError, "ERROR: #{msg}"
end

#good_response?(resp, url) ⇒ Boolean

returns true if resp is ok to continue

Returns:

  • (Boolean)


88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
# File 'lib/retriever/fetch.rb', line 88

# Decides whether a fetched response should be processed as an HTML page.
#
# Side effects visible in this method:
# * a redirect whose location matches t.host_re is pushed onto
#   @temp_link_stack unless @already_crawled already includes it;
# * an unsuccessful response increments @connection_tally[:error] plus the
#   matching :error_server / :error_client counter;
# * a successful response that is not HTML/XHTML has its URL inserted into
#   @already_crawled;
# * an accepted response increments @connection_tally[:success].
#
# @param resp response object exposing #response_header; nil/false is rejected
# @param url the URL that was requested (used for logging and bookkeeping)
# @return [Boolean] true only for a successful, non-redirected response whose
#   CONTENT_TYPE header contains text/html or application/xhtml+xml
def good_response?(resp, url)
  return false unless resp
  hdr = resp.response_header
  if hdr.redirection?
    loc = hdr.location
    lg("#{url} Redirected to #{loc}")
    if t.host_re =~ loc
      @temp_link_stack.push(loc) unless @already_crawled.include?(loc)
      lg('--Added to stack for later')
      return false
    end
    lg("Redirection outside of target host. No - go. #{loc}")
    return false
  end
  # Do not continue after an unsuccessful connection.
  unless hdr.successful?
    lg("UNSUCCESSFUL CONNECTION -- #{url}")
    @connection_tally[:error] += 1
    @connection_tally[:error_server] += 1 if hdr.server_error?
    @connection_tally[:error_client] += 1 if hdr.client_error?
    return false
  end
  # Do not continue unless the body is HTML or XHTML. The '+' must be escaped:
  # unescaped it is a quantifier on the preceding 'l', so the literal media
  # type "application/xhtml+xml" would never match.
  unless hdr['CONTENT_TYPE'] =~ %r{(text/html|application/xhtml\+xml)}
    @already_crawled.insert(url)
    lg("Page Not text/html -- #{url}")
    return false
  end
  @connection_tally[:success] += 1
  true
end

#lg(msg) ⇒ Object



43
44
45
# File 'lib/retriever/fetch.rb', line 43

# Verbose-mode logger: prints msg prefixed with "### " to STDOUT when
# @verbose is set, and does nothing otherwise.
#
# @param msg text to print
# @return [nil]
def lg(msg)
  return unless @verbose

  puts "### #{msg}"
end

#startObject



33
34
35
36
37
# File 'lib/retriever/fetch.rb', line 33

# Primes the crawl state: stores the result of crawl_page_one, builds the
# link stack, and resets the temporary link stack.
#
# NOTE(review): statement order is presumably significant -- create_link_stack
# runs only after @page_one is assigned; confirm against its definition.
#
# @return [Array] the new, empty @temp_link_stack (value of the last assignment)
def start
  @page_one = crawl_page_one
  @link_stack = create_link_stack
  # Holding area that good_response? pushes same-host redirect targets onto.
  @temp_link_stack = []
end

#writeObject

writes current data collection out to CSV in current directory



68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
# File 'lib/retriever/fetch.rb', line 68

# Writes the current data collection to "<@output>.csv" (a relative @output
# resolves against the current directory) and prints a short summary to
# STDOUT.
#
# Each element of @result is appended as one CSV row.
# NOTE(review): entries are assumed to be row-like (Array); confirm that
# every caller populates @result that way.
#
# @return [false, nil] false when no output name is configured, nil otherwise
def write
  return false unless @output

  csv_path = "#{@output}.csv"
  CSV.open(csv_path, 'w') do |csv|
    # SEO runs get a header row ahead of the data rows.
    csv << ['URL', 'Page Title', 'Meta Description', 'H1', 'H2'] if @seo
    @result.each { |entry| csv << entry }
  end
  puts HR
  puts "File Created: #{csv_path}"
  puts "Object Count: #{@result.size}"
  puts HR
  puts
end