Class: WebAnalyticsDiscovery::TNS

Inherits:
Object
  • Object
show all
Includes:
GrabberUtils
Defined in:
lib/web_analytics_discovery/grabber/tns.rb

Constant Summary collapse

MAX_TRIES =
5

Constants included from GrabberUtils

GrabberUtils::AVG_DAYS_IN_MONTH, GrabberUtils::CACHE_DIR, GrabberUtils::USER_AGENT

Instance Method Summary collapse

Methods included from GrabberUtils

#download, #download_file, #mangle_url

Constructor Details

#initialize ⇒ TNS

Returns a new instance of TNS.



11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
# File 'lib/web_analytics_discovery/grabber/tns.rb', line 11

# Checks that the external command-line tools used for TNS report
# discovery can be launched.
#
# Each tool is started once with a version flag. Only the ability to start
# the process is checked: the output is discarded and the exit status is
# not inspected.
#
# Raises RuntimeError when starting `xlsx2csv` or `unzip` fails with
# Errno::ENOENT.
def initialize
	# This one requires xlsx2csv utility
	begin
		`xlsx2csv --version`
	rescue Errno::ENOENT
		raise 'xlsx2csv not available: unable to run TNS report discovery'
	end

	# And an unzip utility
	begin
		`unzip -v`
	rescue Errno::ENOENT
		raise 'unzip not available: unable to run TNS report discovery'
	end
end

Instance Method Details

#ensure_convert(unzipped) ⇒ Object



94
95
96
97
98
99
100
101
# File 'lib/web_analytics_discovery/grabber/tns.rb', line 94

# Converts an unpacked .xlsx report into a tab-separated file inside
# CACHE_DIR, reusing an already converted file when one is present.
#
# unzipped - path of the .xlsx file to convert.
#
# Returns the path of the .tsv file: CACHE_DIR, the basename of +unzipped+
# and a ".tsv" suffix.
# Raises RuntimeError when xlsx2csv cannot be started or reports failure.
def ensure_convert(unzipped)
	converted = "#{CACHE_DIR}/#{File.basename(unzipped)}.tsv"
	# File.exist? is used because File.exists? no longer exists in Ruby 3.2+
	unless File.exist?(converted)
		# Argument-list form bypasses the shell, so quotes or other shell
		# metacharacters in the paths are passed to xlsx2csv untouched.
		converted_ok = system('xlsx2csv', '-d', 'tab', '-s', '1', unzipped, :out => converted)
		unless converted_ok
			# Drop the partial output; otherwise the next call would pick it up
			# as a valid cached conversion.
			File.delete(converted) if File.exist?(converted)
			raise 'Unable to convert TNS report to .tsv'
		end
	end
	return converted
end

#ensure_unpack(zipped) ⇒ Object



85
86
87
88
89
90
91
92
# File 'lib/web_analytics_discovery/grabber/tns.rb', line 85

# Extracts the .xlsx member of a downloaded zip archive into CACHE_DIR,
# reusing an already extracted file when one is present.
#
# zipped - path of the zip archive.
#
# Returns the path of the extracted file: CACHE_DIR, a "tns_" prefix, the
# basename of +zipped+ and an ".xlsx" suffix.
# Raises RuntimeError when unzip cannot be started or reports failure.
def ensure_unpack(zipped)
	unzipped = "#{CACHE_DIR}/tns_#{File.basename(zipped)}.xlsx"
	# File.exist? is used because File.exists? no longer exists in Ruby 3.2+
	unless File.exist?(unzipped)
		# Argument-list form bypasses the shell: the '*.xlsx' pattern reaches
		# unzip literally instead of being expanded against the current
		# directory, and quotes in the archive path cannot break the command.
		unpacked_ok = system('unzip', '-pq', zipped, '*.xlsx', :out => unzipped)
		unless unpacked_ok
			# Drop the partial output; otherwise the next call would pick it up
			# as a valid cached extraction.
			File.delete(unzipped) if File.exist?(unzipped)
			raise 'Unable to unpack TNS report'
		end
	end
	return unzipped
end

#find_id(url) ⇒ Object



107
108
109
# File 'lib/web_analytics_discovery/grabber/tns.rb', line 107

# Derives the report lookup id for a URL: its host name, lower-cased.
#
# parse_report stores lower-cased keys, so the host is lower-cased here as
# well; otherwise a URL such as "http://Mail.RU/" would never match.
#
# url - URL string of the site.
#
# Returns the lower-cased host, or nil when URI.parse finds no host in
# +url+ (for example when the scheme is missing).
# Raises URI::InvalidURIError when +url+ cannot be parsed.
def find_id(url)
	host = URI.parse(url).host
	host && host.downcase
end

#parse_report ⇒ Object

Parsing TNS report involves the following stages:

  1. Download a non-empty “directory” page from their web site for the current year (keep requesting older years if we keep getting empty output, bail out on HTTP error).

  2. Download the first (most recent) report listed on that “directory” page.

  3. Unpack (unzip) the downloaded report file; it’s a zip that contains multiple files, including a single .xlsx file with raw data.

  4. Convert the .xlsx file into something more readable (CSV) with an external utility.

  5. Parse the resulting CSV report into memory (it’s relatively short - as of 2014-10, TNS lists only ~500 sites).



44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# File 'lib/web_analytics_discovery/grabber/tns.rb', line 44

# Fetches the most recent TNS report and loads it into @report.
#
# Chains query_directory, download_file, ensure_unpack and ensure_convert
# to obtain a tab-separated file, then reads it line by line. Lines with
# fewer than five columns, or with an empty first or second column, are
# skipped. For every remaining line the second column (lower-cased, with a
# trailing " (сайт)" removed) becomes the key, and the third column
# multiplied by 1000 becomes the value.
#
# Returns the Hash that was stored in @report (site name => Integer).
def parse_report
	report_url = query_directory
	zipped = download_file(report_url)
	unzipped = ensure_unpack(zipped)
	converted = ensure_convert(unzipped)

	# Filled locally and published only once parsing succeeds, so a failure
	# half-way through never leaves a partial @report that run_id would
	# treat as already parsed.
	report = {}

	# File.foreach closes the file when done. The encoding is pinned to UTF-8
	# because the suffix regexp below is UTF-8 and must not depend on the
	# locale's default external encoding.
	File.foreach(converted, :encoding => 'UTF-8') { |l|
		c = l.chomp.split(/\t/)

		# Skip headers
		next if c.size < 5

		# Skip table column headers
		next if c[0].empty?

		# Skip generic audience info columns
		next if c[1].empty?

		# Downcase URL and calculate proper monthly visitors
		# (presumably the report lists thousands - confirm against a report).
		# round, not to_i: 1.001 * 1000 is 1000.9999999999999 as a float and
		# truncation would lose a visitor.
		visitors = (c[2].to_f * 1000).round
		url = c[1].downcase.gsub(/ \(сайт\)$/, '')

		report[url] = visitors
	}

	@report = report
end

#query_directory ⇒ Object



73
74
75
76
77
78
79
80
81
82
83
# File 'lib/web_analytics_discovery/grabber/tns.rb', line 73

# Finds the download URL of the first report linked from the TNS
# "directory" page.
#
# Starts with the current year and steps back one year per attempt, for at
# most MAX_TRIES attempts, until a downloaded page contains a report link.
#
# Returns the absolute URL of that report.
# Raises RuntimeError when none of the pages contains a report link.
def query_directory
	newest_year = Date.today.year
	(0...MAX_TRIES).each do |step|
		year = newest_year - step
		listing = download("http://www.tns-global.ru/services/media/media-audience/internet/information/?arrFilter_pf%5BYEAR%5D=#{year}&set_filter=%D0%9F%D0%BE%D0%BA%D0%B0%D0%B7%D0%B0%D1%82%D1%8C&set_filter=Y")

		# No report link on this year's page - try the previous year
		next unless listing =~ /<a href="(\/services\/media\/media-audience\/internet\/information\/\?download=\d+&date=.*?)">/

		# The captured href is site-relative, so prefix the host
		return "http://www.tns-global.ru#{$1}"
	end
	raise 'Unable to query report directory - not a single report found'
end

#run(url) ⇒ Object



103
104
105
# File 'lib/web_analytics_discovery/grabber/tns.rb', line 103

# Looks up report data for the site behind +url+.
#
# Turns the URL into a report id with find_id and hands that id to run_id.
#
# Returns whatever run_id returns for the derived id.
def run(url)
	site_id = find_id(url)
	run_id(site_id)
end

#run_id(id) ⇒ Object



111
112
113
114
115
# File 'lib/web_analytics_discovery/grabber/tns.rb', line 111

# Looks up the visitors figure for a report id.
#
# Calls parse_report first when @report has not been set yet, so the report
# is loaded on first use.
#
# id - key to look up in @report.
#
# Returns a Hash with :id and :visitors_mon, or nil when @report has no
# entry for +id+.
def run_id(id)
	parse_report if !@report

	monthly = @report[id]
	return nil unless monthly

	{:id => id, :visitors_mon => monthly}
end