Class: SiteClassifier::Extractor

Inherits:
Object
  • Object
show all
Includes:
HTTParty
Defined in:
lib/site_classifier/extractor.rb

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(url, tags, word_hash, description, lang) ⇒ Extractor

Returns a new instance of Extractor.



7
8
9
10
11
12
13
# File 'lib/site_classifier/extractor.rb', line 7

# Builds an extractor from data that has already been pulled out of a page.
#
# @param url [String] address of the page the data belongs to
# @param tags [Array<String>] keyword tags for the page
# @param word_hash [Hash] word => occurrence count, exposed as #word_frequency
# @param description [String, nil] page description, if any
# @param lang [#to_s, nil] language code; stored downcased. nil is accepted
#   and stored as an empty string instead of raising NoMethodError.
def initialize(url, tags, word_hash, description, lang)
  @url = url
  @tags = tags
  @description = description
  @word_frequency = word_hash
  # to_s keeps a missing language from blowing up the constructor.
  @lang = lang.to_s.downcase
end

Instance Attribute Details

#description ⇒ Object

Returns the value of attribute description.



5
6
7
# File 'lib/site_classifier/extractor.rb', line 5

# Page description handed to the constructor. parse_site passes the content
# of the page's meta description tag, or nil when it could not be read.
def description
  @description
end

#lang ⇒ Object

Returns the value of attribute lang.



5
6
7
# File 'lib/site_classifier/extractor.rb', line 5

# Language code for the page. The constructor stores it downcased;
# validate_lang resets codes EasyTranslate does not list to "auto", and
# most_significant may overwrite it with a detected language.
def lang
  @lang
end

#tags ⇒ Object

Returns the value of attribute tags.



5
6
7
# File 'lib/site_classifier/extractor.rb', line 5

# Keyword tags handed to the constructor. parse_site derives them from the
# page's meta keywords tag (split on commas, stripped, downcased) and passes
# an empty array when none are available.
def tags
  @tags
end

#url ⇒ Object

Returns the value of attribute url.



5
6
7
# File 'lib/site_classifier/extractor.rb', line 5

# URL this extractor was built for, exactly as passed to the constructor.
def url
  @url
end

#word_frequency ⇒ Object

Returns the value of attribute word_frequency.



5
6
7
# File 'lib/site_classifier/extractor.rb', line 5

# Word => occurrence-count hash handed to the constructor. parse_site only
# fills it when the page has no keyword tags, keeping words of at least four
# characters that occur at least twice; otherwise it is an empty hash.
def word_frequency
  @word_frequency
end

Class Method Details

.parse_site(url = "") ⇒ Object



65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
# File 'lib/site_classifier/extractor.rb', line 65

# Fetches +url+, parses the response as HTML and builds an Extractor from it.
#
# Extracted data:
# * language - first two characters of the <html> element's +xml:lang+
#   attribute, or of +lang+ when +xml:lang+ is missing or blank; "auto" when
#   neither is usable. (Previously a missing +xml:lang+ wiped out a valid
#   +lang+ value.)
# * tags - comma-separated meta keywords, stripped and downcased, with blank
#   entries dropped.
# * description - content of the meta description tag, read independently of
#   the keywords tag so that a page without keywords still gets one.
# * word frequency - only when there are no tags: counts of words with at
#   least four characters taken from <p> elements (falling back to <div>
#   elements when the paragraphs yield nothing), keeping words seen twice or
#   more.
#
# Errors raised while fetching the page are not rescued here.
#
# @param url [String, nil] address of the page to analyse
# @return [Extractor, nil] nil when +url+ is nil or blank
def self.parse_site(url = "")
  return if url.nil? || url.to_s.strip.empty?

  html = Nokogiri::HTML(self.get(url).parsed_response)

  # Attribute of the first node matching +selector+; nil when the node or
  # the attribute is absent, so no rescue is needed for missing markup.
  attribute_of = lambda do |selector, attribute|
    node = html.search(selector).first
    node && node[attribute]
  end

  # xml:lang keeps precedence over lang when both are present, but a blank
  # value no longer overrides a usable one.
  lang_codes = %w[xml:lang lang].map do |attribute|
    attribute_of.call("html", attribute).to_s.strip.slice(0..1)
  end
  page_lang = lang_codes.find { |code| !code.empty? } || "auto"

  keywords = attribute_of.call('meta[name="keywords"]', "content").to_s
  tags = keywords.split(",").map { |tag| tag.strip.downcase }.reject(&:empty?)
  description = attribute_of.call('meta[name="description"]', "content")

  word_hash = {}
  if tags.empty?
    words = []
    %w[p div].each do |selector|
      words = html.search(selector).flat_map { |node| node.text.split }
      words = words.reject { |word| word.size < 4 }
      break unless words.empty?
    end
    word_hash = words.each_with_object(Hash.new(0)) { |word, counts| counts[word] += 1 }
    word_hash.reject! { |_word, count| count < 2 }
  end

  self.new(url, tags, word_hash, description, page_lang)
end

Instance Method Details

#most_significant ⇒ Object

Extract most significant tags



25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
# File 'lib/site_classifier/extractor.rb', line 25

# Extract most significant tags.
#
# Candidates are the keyword tags when there are any, otherwise the keys of
# word_frequency. A candidate is kept when it occurs in the description; the
# comparison is case-insensitive on both sides (previously only the
# description was downcased, so mixed-case words could never match). Kept
# candidates are singularized. When there is no description, or nothing
# matched, every word_frequency key is returned instead.
#
# The language is normalized through validate_lang before returning. When
# SiteClassifier.translate_tags? is true the result is passed through
# EasyTranslate towards English, detecting the source language first if it
# is still "auto"; any StandardError raised during translation falls back to
# the untranslated list.
#
# @return [Object] the untranslated Array of tags, or whatever
#   EasyTranslate.translate returns (presumably the translated Array -
#   confirm against the gem)
def most_significant
  candidates = tags.any? ? tags : word_frequency.keys

  most_sig = []
  unless description.nil?
    # Downcase once instead of once per candidate.
    haystack = description.downcase
    most_sig = candidates.select { |tag| haystack.include?(tag.downcase) }.map { |tag| tag.singularize }
  end
  most_sig = word_frequency.keys if most_sig.empty?

  validate_lang
  return most_sig unless SiteClassifier.translate_tags?

  begin
    api_key = SiteClassifier.configuration.google_translate_api_key
    if lang == "auto"
      @lang = EasyTranslate.detect(most_sig.first, key: api_key)
    end
    EasyTranslate.translate(most_sig, from: lang, to: :en, key: api_key)
  rescue StandardError
    # Translation is best effort: keep the original tags on any failure.
    most_sig
  end
end

#to_hash ⇒ Object



55
56
57
58
59
60
61
62
63
# File 'lib/site_classifier/extractor.rb', line 55

# Summarizes the extractor as a Hash with the keys :most_significant,
# :language, :url, :tags and :description.
#
# @return [Hash]
def to_hash
  # Computed first on purpose: most_significant may rewrite the language
  # (normalization / detection), and :language must reflect that.
  significant = most_significant
  summary = { most_significant: significant }
  summary[:language] = self.lang
  summary[:url] = url
  summary[:tags] = tags
  summary[:description] = description
  summary
end

#validate_lang ⇒ Object

Normalize site language



16
17
18
19
20
21
22
# File 'lib/site_classifier/extractor.rb', line 16

# Normalize site language: keeps the current code when it is one of the
# EasyTranslate::LANGUAGES keys, otherwise resets the language to "auto".
#
# @return [String] the language code in effect after normalization
def validate_lang
  return @lang if EasyTranslate::LANGUAGES.key?(@lang)

  self.lang = "auto"
end