Class: LinkScrapper

Inherits: Object

Defined in: lib/link_scrapper.rb

Overview

Class for crawling a domain, gathering its links, and checking each one.

Instance Method Summary

  #get_links ⇒ Object
    gather link data
  #get_search_uri ⇒ Object
    gather search uri
  #save_results ⇒ Object
    save results to CSV files

Constructor Details

#initialize(settings) ⇒ LinkScrapper

Returns a new instance of LinkScrapper.



# File 'lib/link_scrapper.rb', line 10

def initialize(settings)

  # available default settings
  #   domain:  domain to be searched ('ue' prompts for user entry)
  #   verbose: prints progress output as the script runs
  #   results: 'hash' or 'csv'

  # init link store hashes
  @settings = settings
  @search_index = 0
  @search_iteration = 0
  @links = []
  @link_parents = {}
  @checked_links = {}
  @error_links = {}
  @external_links = {}

  # gather search domain
  if ARGV[0]
    @search_domain = ARGV[0].dup
  elsif @settings[:domain] == 'ue'
    puts "Please enter a domain to search: (Default: #{SEARCH_DOMAIN})"
    @search_domain = gets.chomp
  elsif @settings[:domain]
    @search_domain = @settings[:domain]
  end

  # override with default domain if entry is left empty
  @search_domain = SEARCH_DOMAIN if @search_domain == ''

  # get and store local domain string
  @local_domain = @search_domain.match(/\w+\.\w+(?=\/|\s|$)/)

  # configure initial search uri
  # (note: this aliases @search_domain, so the protocol and trailing
  # slash added below end up on both strings)
  @search_uri = @search_domain

  # verify domain entry includes protocol
  @search_uri.insert(0, 'http://') if @search_uri !~ /^https?:/

  # verify trailing forward slash
  @search_uri << '/' unless @search_uri.end_with?('/')

  # start scan
  get_links
end
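
A minimal usage sketch (hypothetical: it assumes SEARCH_DOMAIN is defined near the top of lib/link_scrapper.rb and that the file requires net/http, uri, and csv, as the code implies):

require_relative 'lib/link_scrapper'

# note: a command-line argument (ARGV[0]) takes precedence over :domain
scrapper = LinkScrapper.new(domain: 'https://example.com/', verbose: true, results: 'csv')

Since the constructor ends by calling get_links, instantiation starts the crawl immediately; there is no separate run step.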

Instance Method Details

#get_links ⇒ Object

gather link data



# File 'lib/link_scrapper.rb', line 151

def get_links

  # init skip bit
  @skip = 0

  # define search uri if undefined
  get_search_uri

  # check for existing uri hash index
  if @checked_links[@search_uri.to_sym]
    @skip = 1
  end

  # run link scan if @skip bit is not set
  if @skip == 0

    # let user know which uri is currently active
    puts @search_uri if @settings[:verbose]

    # gather page request response
    begin
      t1 = Time.now
      response = Net::HTTP.get_response(URI.parse(URI::DEFAULT_PARSER.escape(@search_uri.strip)))
      t2 = Time.now
      delta = t2 - t1

      # store response page body
      body = response.body

      # store response code
      code = response.code

      # extract all links within page
      links_array = body.scan(/<a[^>]+href\s*=\s*["']([^"']+)["'][^>]*>(.*?)<\/a>/mi)

      # update anchors and indirect links to use direct links
      links_array.each { |val|
        if (val[0][0,2] == "//" || val[0][0] == "/" || val[0][0,3] == "../") && val[0] !~ /^https?:/
          # strip the relative prefix, then resolve against the search domain
          val[0][0,3] = "" if val[0][0,3] == "../"
          val[0][0,2] = "" if val[0][0,2] == "//"
          val[0][0]   = "" if val[0][0] == "/"
          val[0] = "#{@search_domain}#{val[0]}"
        end
        @link_parents[val[0].chomp.to_sym] = @search_uri.strip
      }

      # combine found links with links array
      @links.concat(links_array)

      # remove duplicates
      @links.uniq!

    rescue => ex
      # request failed; record a timeout-style code so the link is still logged
      code = 408
    end

    # store results in checked hash
    @checked_links[@search_uri.to_sym] = {res: code, time: delta, parent: @link_parents[@search_uri.to_sym]}

  end

  # move on to the next found link; the crawl terminates inside
  # get_search_uri once the link list is exhausted
  @search_iteration += 1
  get_links

end
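
To see what the href scan above captures, here is a standalone sketch (the sample HTML is made up):

html = <<~HTML
  <a href="/about">About</a>
  <a href='https://other.example/page' class="ext">External</a>
HTML

pairs = html.scan(/<a[^>]+href\s*=\s*["']([^"']+)["'][^>]*>(.*?)<\/a>/mi)
p pairs
# => [["/about", "About"], ["https://other.example/page", "External"]]

Each match yields an [href, anchor_text] pair, which is why @links holds two-element arrays and get_search_uri reads @links[@search_index][0].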

#get_search_uri ⇒ Object

gather search uri



# File 'lib/link_scrapper.rb', line 61

def get_search_uri
  # do not override initial domain setting
  if @search_iteration > 0
    # set search uri
    if !@links[@search_index].nil?
      @search_uri = @links[@search_index][0].chomp
    else
      # save results and exit
      if @settings[:results] == 'csv'
        save_results
      else
        # dump the collected hashes before exiting
        p checked_links: @checked_links, error_links: @error_links, external_links: @external_links
      end
      exit
    end

    # check for direct link
    if @search_uri =~ /^htt(p|ps):/

      # if external link, record its response once and go to next link
      if @search_uri.index(@local_domain[0]).nil?
        if !@external_links[@search_uri.to_sym]
          begin
            t1 = Time.now
            response = Net::HTTP.get_response(URI.parse(URI::DEFAULT_PARSER.escape(@search_uri)))
            t2 = Time.now
            delta = t2 - t1
            code = response.code
          rescue => ex
            code = 408
          end
          @external_links[@search_uri.to_sym] = {res: code, time: delta}
        end
        @skip = 1
      end
    else

      # skip links to static asset files
      if @search_uri =~ /\.(flv|gif|jpg|png|mp3|mp4|m4v|pdf|zip|txt)$/i
        @skip = 1
      end

      # check for mailto and tel links
      if @search_uri[0,7] == 'mailto:' || @search_uri[0,4] == 'tel:'
        @skip = 1
      else
        # strip indirect-link prefixes ('.', './', '../')
        @search_uri[0,1] = '' if @search_uri[0,1] == '.'
        @search_uri[0,2] = '' if ['./', '..'].include?(@search_uri[0,2])
        @search_uri[0,3] = '' if @search_uri[0,3] == '../'
        # check for relative link (leave protocol-agnostic '//' intact
        # so it can be resolved below)
        if @search_uri[0] == '/' && @search_uri[0,2] != '//'
          @search_uri[0] = ''
        end
        # verify uri portion is valid (protocol-agnostic '//' is allowed)
        if @search_uri !~ %r{^([\w%#?]|//)}
          @search_index += 1
          @skip = 1
          @error_links[@search_uri] = ''
          puts "invalid uri #{@search_uri}" if @settings[:verbose]
          return
        end
        # define uri string
        if @search_uri[0,2] != '//'
          @search_uri = "#{@search_domain}#{@search_uri}"
        else
          # handle protocol agnostic link requests
          if @search_domain[0,6] == 'https:'
            @search_uri = "https:#{@search_uri}"
          else
            @search_uri = "http:#{@search_uri}"
          end
        end
      end
    end
    # increment search index value
    @search_index += 1
  end
end
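
The prefix handling above can be tried in isolation; a rough sketch of the same rewriting rules (the sample links and domain are hypothetical):

search_domain = 'http://example.com/'

['../about', './contact', '/team', '//cdn.example.com/app.js'].each do |link|
  uri = link.dup
  if uri[0, 2] == '//'
    # protocol-agnostic: inherit the crawl protocol
    uri = "http:#{uri}"
  else
    # strip one relative prefix, then resolve against the search domain
    uri.sub!(%r{\A(\.\./|\./|/)}, '')
    uri = "#{search_domain}#{uri}"
  end
  puts "#{link} -> #{uri}"
end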

#save_results ⇒ Object

save results to CSV files



# File 'lib/link_scrapper.rb', line 225

def save_results
  # save search results (uri, response code, response time, parent uri)
  CSV.open('results.csv', 'wb') do |csv|
    @checked_links.each do |link|
      csv << [link[0], link[1][:res], link[1][:time], link[1][:parent]]
    end
  end
  # save list of external links
  CSV.open('external-links.csv', 'wb') do |csv|
    @external_links.each do |link|
      csv << [link[0], link[1][:res], link[1][:time], link[1][:parent]]
    end
  end
  # save list of invalid links
  CSV.open('invalid.csv', 'wb') do |csv|
    @error_links.each do |link|
      csv << link
    end
  end
end
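
Reading the output back is straightforward; a quick sketch:

require 'csv'

# each row of results.csv: uri, response code, response time, parent uri
CSV.foreach('results.csv') do |uri, code, time, parent|
  puts "#{code} #{uri} (found on #{parent})"
end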