Class: SiteClassifier::Extractor
- Inherits:
-
Object
- Object
- SiteClassifier::Extractor
- Includes:
- HTTParty
- Defined in:
- lib/site_classifier/extractor.rb
Instance Attribute Summary collapse
-
#description ⇒ Object
Returns the value of attribute description.
-
#lang ⇒ Object
Returns the value of attribute lang.
-
#tags ⇒ Object
Returns the value of attribute tags.
-
#url ⇒ Object
Returns the value of attribute url.
-
#word_frequency ⇒ Object
Returns the value of attribute word_frequency.
Class Method Summary collapse
-
.parse_site(url = "") ⇒ Object
Instance Method Summary collapse
-
#initialize(url, tags, word_hash, description, lang) ⇒ Extractor
constructor
A new instance of Extractor.
-
#most_significant ⇒ Object
Extract most significant tags.
- #to_hash ⇒ Object
-
#validate_lang ⇒ Object
Normalize site language.
Constructor Details
#initialize(url, tags, word_hash, description, lang) ⇒ Extractor
Returns a new instance of Extractor.
7 8 9 10 11 12 13 |
# File 'lib/site_classifier/extractor.rb', line 7
# Builds an Extractor from page data that has already been extracted.
#
# @param url [Object] stored as-is in @url
# @param tags [Object] stored as-is in @tags (parse_site passes an Array of
#   lower-cased keyword strings)
# @param word_hash [Object] stored as-is in @word_frequency (parse_site
#   passes a Hash of word => occurrence count)
# @param description [Object] stored as-is in @description; may be nil
# @param lang [#to_s] language code; stored lower-cased
def initialize(url, tags, word_hash, description, lang)
  @url = url
  @tags = tags
  @description = description
  @word_frequency = word_hash
  # to_s keeps a nil language from raising; the resulting "" is later
  # replaced by "auto" when validate_lang runs.
  @lang = lang.to_s.downcase
end
Instance Attribute Details
#description ⇒ Object
Returns the value of attribute description.
5 6 7 |
# File 'lib/site_classifier/extractor.rb', line 5
# Returns the value of attribute description.
def description
  @description
end
#lang ⇒ Object
Returns the value of attribute lang.
5 6 7 |
# File 'lib/site_classifier/extractor.rb', line 5
# Returns the value of attribute lang.
def lang
  @lang
end
#tags ⇒ Object
Returns the value of attribute tags.
5 6 7 |
# File 'lib/site_classifier/extractor.rb', line 5
# Returns the value of attribute tags.
def tags
  @tags
end
#url ⇒ Object
Returns the value of attribute url.
5 6 7 |
# File 'lib/site_classifier/extractor.rb', line 5
# Returns the value of attribute url.
def url
  @url
end
#word_frequency ⇒ Object
Returns the value of attribute word_frequency.
5 6 7 |
# File 'lib/site_classifier/extractor.rb', line 5
# Returns the value of attribute word_frequency.
def word_frequency
  @word_frequency
end
Class Method Details
.parse_site(url = "") ⇒ Object
65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 |
# File 'lib/site_classifier/extractor.rb', line 65
# Fetches +url+, parses the response as HTML and builds an Extractor from it.
#
# Collected data:
# * language - first two characters of the <html> element's "xml:lang"
#   attribute, else of its "lang" attribute, else "auto";
# * tags - comma-separated content of <meta name="keywords">, stripped and
#   lower-cased, with empty entries dropped;
# * description - content of <meta name="description">, or nil;
# * word frequency - only computed when no tags were found: words of four or
#   more characters taken from <p> elements (or from <div> elements when the
#   paragraphs yield none), keeping words that occur at least twice.
#
# @param url [String, nil] address of the page to fetch
# @return [Extractor, nil] nil when +url+ is nil or empty
def self.parse_site(url = "")
  return if url.nil? || url == ""

  html = Nokogiri::HTML(self.get(url).parsed_response)

  # Attribute of the first node matching +selector+; nil when either the
  # node or the attribute is missing, so no rescue is needed.
  attribute_of = lambda do |selector, attribute|
    node = html.search(selector).first
    node && node[attribute]
  end

  # "xml:lang" keeps precedence over "lang", but a missing attribute must not
  # blank out a language supplied by the other one.
  page_lang = %w[xml:lang lang]
              .map { |name| attribute_of.call("html", name).to_s.slice(0..1) }
              .find { |code| !code.empty? } || "auto"

  # Keywords and description are read independently: a page without a
  # keywords meta tag can still provide a description.
  keywords = attribute_of.call('meta[name="keywords"]', "content")
  tags = keywords.to_s.split(",").map { |tag| tag.strip.downcase }.reject(&:empty?)
  description = attribute_of.call('meta[name="description"]', "content")

  word_hash = {}
  if tags.empty?
    # Whitespace-separated words of at least four characters under +selector+.
    words_in = lambda do |selector|
      html.search(selector).flat_map { |node| node.text.split }.reject { |word| word.size < 4 }
    end

    words = words_in.call("p")
    words = words_in.call("div") if words.empty?

    word_hash = words.each_with_object(Hash.new(0)) { |word, counts| counts[word] += 1 }
    word_hash.reject! { |_word, count| count < 2 }
  end

  self.new(url, tags, word_hash, description, page_lang)
end
Instance Method Details
#most_significant ⇒ Object
Extract most significant tags
25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 |
# File 'lib/site_classifier/extractor.rb', line 25
# Extract most significant tags.
#
# Candidates are the tags (or, when there are none, the word-frequency keys)
# that occur in the lower-cased description; each match is singularized. If
# there is no description or nothing matches, all word-frequency keys are
# used. The language is normalized via validate_lang and, when translation
# is enabled, the result is translated to English with EasyTranslate.
#
# @return [Object] the EasyTranslate.translate result when translation is
#   enabled and succeeds; otherwise the untranslated Array of tags
def most_significant
  most_sig = []

  unless description.nil?
    haystack = description.downcase # computed once instead of once per candidate
    candidates = tags.any? ? tags : word_frequency.keys
    most_sig = candidates.select { |tag| haystack.include?(tag) }.map { |tag| tag.singularize }
  end
  most_sig = word_frequency.keys if most_sig.empty?

  validate_lang

  # NOTE(review): the predicate name was lost in this listing; translate_tags?
  # is a reconstruction - confirm against the SiteClassifier module.
  return most_sig unless SiteClassifier.translate_tags?

  begin
    if lang == "auto"
      @lang = EasyTranslate.detect(most_sig.first, key: SiteClassifier.configuration.google_translate_api_key)
    end
    EasyTranslate.translate(most_sig, from: lang, to: :en, key: SiteClassifier.configuration.google_translate_api_key)
  rescue StandardError
    # Translation is best-effort: on any failure fall back to the raw tags.
    most_sig
  end
end
#to_hash ⇒ Object
55 56 57 58 59 60 61 62 63 |
# File 'lib/site_classifier/extractor.rb', line 55
# Summarizes the extractor as a Hash.
#
# @return [Hash] with the keys :most_significant, :language, :url, :tags and
#   :description
def to_hash
  {
    # Evaluated first on purpose: most_significant may rewrite the language
    # before :language is read below.
    most_significant: most_significant,
    language: lang,
    url: url,
    tags: tags,
    description: description
  }
end
#validate_lang ⇒ Object
Normalize site language
16 17 18 19 20 21 22 |
# File 'lib/site_classifier/extractor.rb', line 16
# Normalize site language.
#
# Keeps @lang when it is a key of EasyTranslate::LANGUAGES; otherwise resets
# the language to "auto".
#
# @return [String] the language in effect after normalization
def validate_lang
  # key? checks membership directly instead of building an Array of all keys.
  if EasyTranslate::LANGUAGES.key?(@lang)
    @lang
  else
    self.lang = "auto"
  end
end