Class: CommonCrawlIndex::Client

Inherits:
Object
  • Object
show all
Defined in:
lib/common-crawl-index.rb

Constant Summary collapse

# NOTE(review): presumably the size in bytes of the index header; the
# constructor reads bytes 0..7 (eight bytes) to obtain the block size and
# index block count -- confirm against the code that uses this constant.
HEADER_OFFSET =
8
# Class-wide defaults. The constructor falls back to these values whenever
# the corresponding argument is nil; Client.config replaces this hash with a
# merged copy.
@@settings =
{
  :access_key_id => nil,
  :secret_access_key => nil,
  :cc_index_path => "s3://aws-publicdatasets/common-crawl/projects/url-index/url-index.1356128792"
}

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(access_key_id = nil, secret_access_key = nil, cc_index_path = nil) ⇒ Client

Returns a new instance of Client.

Raises:

  • (ArgumentError)


20
21
22
23
24
25
26
27
28
29
30
31
32
33
# File 'lib/common-crawl-index.rb', line 20

# Builds a client for the index stored at an S3 path.
#
# Any argument left as nil falls back to the matching entry in @@settings.
#
# @param access_key_id [String, nil] AWS access key id
# @param secret_access_key [String, nil] AWS secret access key
# @param cc_index_path [String, nil] index location, e.g. "s3://bucket/key/parts"
# @raise [ArgumentError] if the first path segment does not match /^s3/
def initialize(access_key_id=nil, secret_access_key=nil, cc_index_path = nil)
  key_id = access_key_id || @@settings[:access_key_id]
  secret = secret_access_key || @@settings[:secret_access_key]
  @s3 = AWS::S3.new(:access_key_id => key_id, :secret_access_key => secret)

  @cc_index_path = cc_index_path || @@settings[:cc_index_path]

  # "s3://bucket/a/b" splits into ["s3:", "", "bucket", "a", "b"]:
  # segment 0 is the protocol, segment 1 is the empty string between the two
  # slashes, segment 2 is the bucket and everything after it is the object key.
  segments = @cc_index_path.chomp.split(File::SEPARATOR)
  proto = segments[0]
  @bucket_name = segments[2]
  unless proto =~ /^s3/
    raise ArgumentError, "#{__FILE__}: Unknown S3 Protocol #{proto}"
  end
  @object_name = File.join(segments.drop(3))

  # The first eight bytes are unpacked as two native-endian unsigned 32-bit
  # integers. `read` is defined elsewhere in this class.
  header = read(0..7)
  @block_size, @index_block_count = header.unpack("LL")
end

Class Method Details

.config(settings = {}) ⇒ Object



14
15
16
# File 'lib/common-crawl-index.rb', line 14

# Overrides the class-wide default settings.
#
# The given hash is merged over the current @@settings (a new hash is built;
# the previous one is not mutated) and the result becomes the new defaults.
#
# @param settings [Hash] keys to override, e.g. :access_key_id
# @return [Hash] the merged settings now in effect
def self.config(settings = {})
  merged = @@settings.merge(settings)
  @@settings = merged
end

.denormalize_url(normalized_url, has_scheme = true) ⇒ Object



53
54
55
56
57
58
59
60
61
62
63
64
# File 'lib/common-crawl-index.rb', line 53

# Converts a normalized index key back into a regular URL.
#
# A normalized key has its host labels reversed and, optionally, the scheme
# appended after the last ":" (for example "com.example.www/path:http").
#
# @param normalized_url [String] the normalized key
# @param has_scheme [Boolean] whether the key is expected to end in ":<scheme>"
# @return [String] the URL with the scheme in front and the host labels
#   restored to their usual order
def self.denormalize_url(normalized_url, has_scheme = true)
  scheme = "http"
  remainder = normalized_url

  if has_scheme
    colon_index = normalized_url.rindex(":")
    # Only strip a suffix when a ":" is actually present. Without this guard
    # a key that has no ":" raised NoMethodError (nil - 1); it now falls back
    # to the default "http" scheme and the whole key.
    if colon_index
      scheme = normalized_url[colon_index+1..-1]
      # Exclusive range, so a leading ":" (index 0) yields an empty remainder
      # instead of accidentally slicing 0..-1, which is the whole string.
      remainder = normalized_url[0...colon_index]
    end
  end

  url_with_scheme = scheme + "://" + remainder
  uri = Addressable::URI.parse(url_with_scheme)
  # Host labels are stored reversed; flip them back.
  uri.host = uri.host.split(".").reverse.join(".")
  uri.to_s
end

.normalize_url(url, append_scheme = true) ⇒ Object



43
44
45
46
47
48
49
50
51
# File 'lib/common-crawl-index.rb', line 43

# Converts a URL into the normalized key form.
#
# The host labels are reversed, the leading "<scheme>://" is removed and,
# when requested, ":<scheme>" is appended at the end
# (for example "http://www.example.com/path" becomes
# "com.example.www/path:http").
#
# @param url [String] URL to normalize; it is parsed with Addressable::URI
# @param append_scheme [Boolean] whether to append ":<scheme>" to the result
# @return [String] the normalized key
def self.normalize_url(url, append_scheme = true)
  parsed = Addressable::URI.parse(url)
  reversed_host = parsed.host.split(".").reverse.join(".")
  parsed.host = reversed_host

  rendered = parsed.to_s
  # Drop everything up to and including the first "//".
  start = rendered.index("//") + 2
  key = rendered[start..-1]

  append_scheme ? key + ":" + parsed.scheme : key
end

Instance Method Details

#find_by_prefix(url, exact_match = false, &proc_block) ⇒ Object



35
36
37
38
39
40
41
# File 'lib/common-crawl-index.rb', line 35

# Looks up index entries for the given URL.
#
# Starting at block 0, get_next_block_id is called repeatedly until the
# returned id is no longer below @index_block_count. That id is then handed
# to get_matching_urls_from_data_blocks together with the URL, the
# exact_match flag and the caller's block. Both helpers are defined
# elsewhere in this class.
#
# @param url [String] URL (or prefix) to search for
# @param exact_match [Boolean] forwarded to get_matching_urls_from_data_blocks
# @param proc_block [Proc] optional block forwarded to get_matching_urls_from_data_blocks
# @return [Object] whatever get_matching_urls_from_data_blocks returns
def find_by_prefix(url, exact_match = false, &proc_block)
  block_id = 0
  block_id = get_next_block_id(url, block_id) until block_id >= @index_block_count
  get_matching_urls_from_data_blocks(block_id, url, exact_match, &proc_block)
end