Class: Wmap::HostTracker::PrimaryHost

Inherits:
Wmap::HostTracker show all
Includes:
Singleton, Utils
Defined in:
lib/wmap/host_tracker/primary_host.rb

Overview

Class to differentiate the primary host-name from its potential aliases. This is needed to minimize confusion in the final site inventory list, which otherwise contains a large number of duplicates (aliases). More specifically, a filter can be built with this class to track the primary URL of a website.

Constant Summary

Constants included from Utils::UrlMagic

Utils::UrlMagic::Max_http_timeout, Utils::UrlMagic::User_agent

Constants included from Utils::DomainRoot

Utils::DomainRoot::File_ccsld, Utils::DomainRoot::File_cctld, Utils::DomainRoot::File_gtld, Utils::DomainRoot::File_tld

Instance Attribute Summary collapse

Attributes inherited from Wmap::HostTracker

#alias, #max_parallel

Instance Method Summary collapse

Methods included from Utils

#cidr_2_ips, #file_2_hash, #file_2_list, #get_nameserver, #get_nameservers, #host_2_ip, #host_2_ips, #is_cidr?, #is_fqdn?, #is_ip?, #list_2_file, #reverse_dns_lookup, #sort_ips, #valid_dns_record?, #zone_transferable?

Methods included from Utils::Logger

#wlog

Methods included from Utils::UrlMagic

#create_absolute_url_from_base, #create_absolute_url_from_context, #host_2_url, #is_site?, #is_ssl?, #is_url?, #landing_location, #make_absolute, #normalize_url, #open_page, #redirect_location, #response_code, #response_headers, #url_2_host, #url_2_path, #url_2_port, #url_2_site, #urls_on_same_domain?

Methods included from Utils::DomainRoot

#get_domain_root, #get_domain_root_by_ccsld, #get_domain_root_by_cctld, #get_domain_root_by_tlds, #get_sub_domain, #is_domain_root?, #print_ccsld, #print_cctld, #print_gtld

Methods inherited from Wmap::HostTracker

#add, #bulk_add, #bulk_delete, #count, #delete, #dump_sub_domains, #file_add, #file_delete, #get_a_records, #get_root_domains, #host_aliases, #host_known?, #ip_known?, #load_known_hosts_from_file, #local_host_2_ip, #local_ip_2_host, #print_host, #print_known_hosts, #refresh, #refresh_all, #save_known_hosts_to_file!, #search, #sub_domain_known?, #top_hostname

Constructor Details

#initialize(params = {}) ⇒ PrimaryHost

Initialize the instance variables



22
23
24
25
26
27
28
29
30
31
32
# File 'lib/wmap/host_tracker/primary_host.rb', line 22

# Build the tracker state from the optional settings and load the hosts table.
#
# @param params [Hash] optional settings
# @option params [Boolean] :verbose (false) stored in @verbose
# @option params [String] :data_dir data directory; defaults to the `data/`
#   directory three levels above this file
# @option params [String] :hosts_file hosts file path; defaults to
#   `prime_hosts` appended to the data directory
def initialize(params = {})
  default_data_dir = File.dirname(__FILE__) + '/../../../data/'
  @verbose = params.fetch(:verbose, false)
  @data_dir = params.fetch(:data_dir, default_data_dir)
  # The default path is computed up front, whether or not :hosts_file is given
  default_hosts_file = @data_dir + 'prime_hosts'
  @hosts_file = params.fetch(:hosts_file, default_hosts_file)
  # Create an empty hosts file on first use so the load below has a file to read
  File.exist?(@hosts_file) || File.write(@hosts_file, "")
  @known_hosts = load_known_hosts_from_file(@hosts_file)
  @known_ips = {}
  de_duplicate
end

Instance Attribute Details

#data_dir ⇒ Object

Returns the value of attribute data_dir.



19
20
21
# File 'lib/wmap/host_tracker/primary_host.rb', line 19

# Reader for the @data_dir instance variable.
#
# @return [Object] the current value of @data_dir (nil when unset)
def data_dir; @data_dir; end

#hosts_file ⇒ Object

Returns the value of attribute hosts_file.



19
20
21
# File 'lib/wmap/host_tracker/primary_host.rb', line 19

# Reader for the @hosts_file instance variable.
#
# @return [Object] the current value of @hosts_file (nil when unset)
def hosts_file; @hosts_file; end

#known_hosts ⇒ Object

Returns the value of attribute known_hosts.



19
20
21
# File 'lib/wmap/host_tracker/primary_host.rb', line 19

# Reader for the @known_hosts instance variable.
#
# @return [Object] the current value of @known_hosts (nil when unset)
def known_hosts; @known_hosts; end

#known_ips ⇒ Object

Returns the value of attribute known_ips.



19
20
21
# File 'lib/wmap/host_tracker/primary_host.rb', line 19

# Reader for the @known_ips instance variable.
#
# @return [Object] the current value of @known_ips (nil when unset)
def known_ips; @known_ips; end

#verbose ⇒ Object

Returns the value of attribute verbose.



19
20
21
# File 'lib/wmap/host_tracker/primary_host.rb', line 19

# Reader for the @verbose instance variable.
#
# @return [Object] the current value of @verbose (nil when unset)
def verbose; @verbose; end

Instance Method Details

#de_duplicate ⇒ Object Also known as: deduplicate

Procedures to remove the redundant entries in the primary hosts data repository



103
104
105
106
107
108
109
110
111
112
# File 'lib/wmap/host_tracker/primary_host.rb', line 103

# Remove redundant entries from @known_hosts: when several keys map to the
# same value, only the first key (in insertion order) is kept.
#
# Every distinct value encountered is also recorded in @known_ips.
# Duplicate detection uses a per-call table rather than @known_ips itself,
# so calling this method again is safe. Checking against the persistent
# @known_ips would find every value already recorded on a second call and
# delete the whole table.
#
# @return [Hash] the de-duplicated @known_hosts table
def de_duplicate
  @known_ips ||= {}
  seen = {}
  # Walk a snapshot of the keys so entries can be deleted during the loop.
  @known_hosts.keys.each do |name|
    ip = @known_hosts[name]
    if seen.key?(ip)
      @known_hosts.delete(name)
    else
      seen[ip] = true
      @known_ips[ip] = true
    end
  end
  @known_hosts
end

#prime(host) ⇒ Object

Method to replace hostname with known primary hostname



116
117
118
119
120
121
122
123
124
125
126
127
128
129
# File 'lib/wmap/host_tracker/primary_host.rb', line 116

# Replace a hostname with the known primary hostname for its IP.
#
# The IP is looked up with local_host_2_ip first, then host_2_ip if that
# returns nil. If the IP is a key of @known_ips, the entry stored under
# that IP in @known_hosts is returned. In every other case (unknown IP,
# missing table entry, invalid hostname, lookup error) the input host is
# returned unchanged.
#
# @param host [String] hostname to resolve; must satisfy is_fqdn?
# @return [String] the primary hostname, or +host+ itself as a fallback
def prime(host)
  raise "Unknown hostname format: #{host}" unless is_fqdn?(host)
  ip = local_host_2_ip(host)
  ip = host_2_ip(host) if ip.nil?
  return host unless @known_ips.key?(ip)
  # Fall back to the input rather than returning nil when the table has no entry
  @known_hosts[ip] || host
rescue StandardError => e
  # StandardError only: Interrupt, SystemExit and the like must propagate
  puts "Exception on method #{__method__}: #{e}" if @verbose
  host
end

#update_from_site_redirections! ⇒ Object

Procedure to identify the primary host-name from the redirection URLs in the site store. The assumption is that a site redirection always points to the well-known primary site.



75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
# File 'lib/wmap/host_tracker/primary_host.rb', line 75

# Update the primary host table from the redirection URLs in the site store.
#
# Points the Wmap::SiteTracker singleton at the "sites" file under
# @data_dir, loads it and collects its redirection URLs. The host of each
# URL that passes is_url? and is_fqdn? is added, unless its resolved IP is
# already a key of @known_hosts. The table is then saved.
#
# @return [Object, nil] the result of save!, or nil when a StandardError
#   was raised along the way
def update_from_site_redirections!
  puts "Invoke internal procedures to update the primary host-name table from the site store."
  tracker = Wmap::SiteTracker.instance
  tracker.sites_file = @data_dir + "sites"
  tracker.load_site_stores_from_file
  tracker.get_redirection_urls.each do |url|
    next unless is_url?(url)
    host = url_2_host(url)
    next unless is_fqdn?(host)
    ip = host_2_ip(host)
    # Duplication check: skip hosts whose IP is already a key of the table
    add(host) unless @known_hosts.key?(ip)
  end
  save!
rescue StandardError => e
  # StandardError only: Interrupt, SystemExit and the like must propagate
  puts "Exception on method #{__method__}: #{e}" if @verbose
  nil
end

#update_from_site_store! ⇒ Object Also known as: update!

Procedure to identify the primary host-name from the SSL certificates in the site store. The assumption is that the CN used in the certificate application is the primary hostname, i.e. the one used by the users.



35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
# File 'lib/wmap/host_tracker/primary_host.rb', line 35

# Update the primary host table from the SSL certificate CN fields of the
# SSL-enabled sites in the site store, then save the table.
#
# There is no rescue here: any exception raised along the way propagates
# to the caller.
#
# @return [nil]
def update_from_site_store!
  puts "Invoke internal procedures to update the primary host-name table from the site store."
  # Step 1 - collect the distinct certificate CNs of sites not yet known
  common_names = {}
  cert_checker = Wmap::UrlChecker.new(:data_dir=>@data_dir)
  site_tracker = Wmap::SiteTracker.instance
  site_tracker.sites_file = @data_dir + "sites"
  site_tracker.load_site_stores_from_file
  site_tracker.get_ssl_sites.each do |site|
    puts "Exam SSL enabled site entry #{site} ..."
    # Skip the certificate lookup when the site's host is already in the table
    next if @known_hosts.key?(url_2_host(site))
    puts "Pull SSL cert details on site: #{site}"
    cn = cert_checker.get_cert_cn(site)
    common_names[cn] = true unless cn.nil?
  end
  # Add every CN that is a valid FQDN and not yet in the table
  common_names.each_key do |cn|
    next unless is_fqdn?(cn)
    next if @known_hosts.key?(cn)
    self.add(cn)
    puts "New entry added: #{cn}\t#{@known_hosts[cn]}"
  end
  # Step 2 - Save the cache into the file
  self.save!
  nil
end