Class: WebStat::Fetch
- Inherits:
-
Object
- Object
- WebStat::Fetch
- Defined in:
- lib/web_stat/fetch.rb
Direct Known Subclasses
Instance Attribute Summary collapse
-
#header ⇒ Object
Returns the value of attribute header.
-
#html ⇒ Object
Returns the value of attribute html.
-
#nokogiri ⇒ Object
Returns the value of attribute nokogiri.
-
#status ⇒ Object
Returns the value of attribute status.
-
#url ⇒ Object
Returns the value of attribute url.
-
#userdic ⇒ Object
Returns the value of attribute userdic.
Instance Method Summary collapse
-
#content ⇒ Object
Get main section.
-
#eyecatch_image_path ⇒ Object
Get temporary path of image.
-
#get_last_modified ⇒ Object
Return the later of the Date and Last-Modified response headers.
-
#get_url(url) ⇒ Object
Get url.
-
#save_local_path(url) ⇒ Object
Get local path to save url.
-
#site_name ⇒ Object
Get name of domain.
-
#stat(userdics: nil) ⇒ Object
Get information about @url.
-
#title ⇒ String
Get title.
Instance Attribute Details
#header ⇒ Object
Returns the value of attribute header.
3 4 5 |
# File 'lib/web_stat/fetch.rb', line 3 def header @header end |
#html ⇒ Object
Returns the value of attribute html.
3 4 5 |
# File 'lib/web_stat/fetch.rb', line 3 def html @html end |
#nokogiri ⇒ Object
Returns the value of attribute nokogiri.
3 4 5 |
# File 'lib/web_stat/fetch.rb', line 3 def nokogiri @nokogiri end |
#status ⇒ Object
Returns the value of attribute status.
3 4 5 |
# File 'lib/web_stat/fetch.rb', line 3 def status @status end |
#url ⇒ Object
Returns the value of attribute url.
3 4 5 |
# File 'lib/web_stat/fetch.rb', line 3 def url @url end |
#userdic ⇒ Object
Returns the value of attribute userdic.
3 4 5 |
# File 'lib/web_stat/fetch.rb', line 3 def userdic @userdic end |
Instance Method Details
#content ⇒ Object
Get main section
36 37 38 |
# File 'lib/web_stat/fetch.rb', line 36
# Get the main section of the page as sanitized text.
#
# The <body> node of @nokogiri is serialized to a string, handed to
# Readability::Document, and the resulting content is passed through
# Sanitize.clean.
#
# @return [Object] whatever Sanitize.clean returns for the extracted content
def content
  body_html = @nokogiri.at('body').to_s
  readable_html = Readability::Document.new(body_html).content
  Sanitize.clean(readable_html)
end
#eyecatch_image_path ⇒ Object
Get temporary path of image
41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 |
# File 'lib/web_stat/fetch.rb', line 41
# Get the URL (or path) of the image that best represents the page.
#
# Lookup order:
# 1. the first configured "eyecatch_image_xpaths" entry whose first match
#    responds to #value;
# 2. a "thumbnail_regex" rule matching @url, which wins over everything else
#    and returns the rewritten URL immediately;
# 3. the first <img> inside the Readability-extracted content;
# 4. the first <img> anywhere in the document.
#
# A result that starts with "/" is resolved against @url, so both
# root-relative ("/img/a.png") and protocol-relative ("//cdn/a.png")
# references become absolute URLs and the port of @url is preserved.
#
# @return [String, nil] image URL, or nil when no image was found
def eyecatch_image_path
  path = nil
  WebStat::Configure.get["eyecatch_image_xpaths"].each do |xpath|
    node = @nokogiri.xpath(xpath).first
    next unless node.respond_to?(:value)

    path = node.value
    break
  end

  # If there is a thumbnail rule, apply it.
  WebStat::Configure.get["thumbnail_regex"].each do |_provider, rule|
    return @url.gsub(rule[0], rule[1]) if @url.match(rule[0])
  end

  # Only run the Readability extraction when the xpath lookup found nothing.
  if path.nil? || path.empty?
    readable_html = Readability::Document.new(@nokogiri.at('body').to_s).content
    image = ::Nokogiri::HTML(readable_html).xpath('//img').first
    path = image.attr('src') if image
  end

  if path.nil? || path.empty?
    image = @nokogiri.xpath('//img').first
    path = image.attr('src') if image
  end

  return path unless path && path.start_with?('/')

  begin
    URI.join(@url, path).to_s
  rescue URI::Error
    # The reference could not be resolved; fall back to plain concatenation.
    base = URI.parse(@url)
    "#{base.scheme}://#{base.host}#{path}"
  end
end
#get_last_modified ⇒ Object
Return the later of the Date and Last-Modified response headers.
122 123 124 125 126 127 128 129 130 131 132 133 134 135 |
# File 'lib/web_stat/fetch.rb', line 122
# Return the later of the "date" and "last-modified" response headers.
#
# @header is initialised to an empty Hash when it has not been set yet.
# Each header is parsed once; a value that cannot be parsed is treated as
# absent instead of raising. When both timestamps are equal the "date"
# value is returned.
#
# @return [DateTime, nil] nil when neither header yields a usable timestamp
def get_last_modified
  @header ||= {}
  date, last_modified = %w[date last-modified].map do |name|
    begin
      DateTime.parse(@header[name]) if @header.key?(name)
    rescue ArgumentError, TypeError
      # Malformed or nil header value: ignore it.
      nil
    end
  end

  return date || last_modified unless date && last_modified

  date >= last_modified ? date : last_modified
end
#get_url(url) ⇒ Object
Get url
90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 |
# File 'lib/web_stat/fetch.rb', line 90
# Fetch url and return its body, recording @header and @status on the way.
#
# The page is always requested through Mechanize (robots.txt is honoured)
# so that the response headers are available. When the "use_chromedirver"
# setting is truthy the body is taken from WebStat::WebDriverHelper and
# @status is set to 200; if that is disabled or fails, the Mechanize
# response body and status code are used instead.
#
# @param url [String] address to fetch
# @return [String] response body
# @raise [Mechanize::RobotsDisallowedError] when robots.txt disallows url
def get_url(url)
  mech = Mechanize.new { |agent| agent.user_agent = WebStat::Configure.get["user_agent"] }
  # Enable to read Robots.txt
  mech.robots = true
  begin
    raise Mechanize::RobotsDisallowedError.new(url) if mech.agent.robots_disallowed?(url)

    document = mech.get(url, [], nil, { 'Accept-Language' => 'ja' })
    @header = document.header

    rendered = false
    if WebStat::Configure.get["use_chromedirver"]
      begin
        body = WebStat::WebDriverHelper.get_source(url)
        @status = 200
        rendered = true
      rescue StandardError
        # Best effort: any web-driver failure falls back to the Mechanize response.
        rendered = false
      end
    end

    unless rendered
      source_encoding = document.encoding if document.respond_to?(:encoding)
      if document.instance_of?(Mechanize::File) || source_encoding.nil?
        # No known encoding: hand the body back untouched.
        body = document.body
      else
        # Replace bytes that are invalid in the declared charset instead of raising.
        body = document.body.encode('UTF-8', source_encoding, invalid: :replace, undef: :replace)
      end
      @status = document.code
    end
  rescue Mechanize::ResponseCodeError => e
    body = e.page.body
    @status = e.page.code
  end
  body
end
#save_local_path(url) ⇒ Object
Get local path to save url
72 73 74 75 76 77 78 79 80 81 82 83 84 85 |
# File 'lib/web_stat/fetch.rb', line 72
# Download url and store it under /tmp, returning the local path.
#
# The file name is the SHA1 hex digest of url. The body is written with
# IO#write in binary mode so that the stored bytes are exactly the bytes
# received (IO#puts would append a newline to binary data).
#
# @param url [String, nil] address of the resource to download
# @return [String, nil] local file path, or nil when url is nil or does not
#   start with "http"
def save_local_path(url)
  return nil if url.nil? || !url.start_with?("http")

  tmp_file = "/tmp/#{Digest::SHA1.hexdigest(url)}"
  agent = Mechanize.new { |client| client.user_agent = WebStat::Configure.get["user_agent"] }
  image = agent.get(url)
  File.open(tmp_file, "wb") do |file|
    if image.class == Mechanize::File
      file.write(image.body)
    elsif image.respond_to?(:body_io)
      file.write(image.body_io.read)
    end
  end
  tmp_file
end
#site_name ⇒ Object
Get name of domain
23 24 25 26 27 28 29 30 31 32 33 34 |
# File 'lib/web_stat/fetch.rb', line 23
# Get the site name from the page title.
#
# The title is split once on the configured "regex_to_sprit_title" pattern
# and the last part is used. If splitting fails for any reason the raw
# title is used instead. A missing or blank result yields "No Sitename".
#
# @return [String] stripped site name, never blank
def site_name
  raw_title = @nokogiri.title
  candidate =
    begin
      raw_title.split(/#{WebStat::Configure.get["regex_to_sprit_title"]}/, 2).last
    rescue StandardError
      raw_title
    end

  candidate = candidate.to_s.strip
  candidate.empty? ? "No Sitename" : candidate
end
#stat(userdics: nil) ⇒ Object
Get information about @url
139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 |
# File 'lib/web_stat/fetch.rb', line 139
# Collect information about @url into a Hash.
#
# The extracted content is scrubbed, stripped of whitespace and URLs, and
# passed to CLD for language detection. Tags are extracted from the title
# and content with a user dictionary chosen in this order: the entry of
# userdics for the detected language code, the "other" entry of userdics,
# then the configured "userdic" setting. A userdics entry is only used when
# the file it points to exists.
#
# @param userdics [Hash, nil] dictionary paths keyed by language code (or "other")
# @return [Hash] keys :title, :site_name, :content, :language_code, :status,
#   :url, :last_modified_at, :eyecatch_image_path and :tags
def stat(userdics: nil)
  # Evaluate the extraction once; the original recomputed it for every use.
  page_content = content
  page_title = title

  clean_content = page_content.scrub('').gsub(/[\n\t\r ]/, "").gsub(/\s{2,}/, "\s").gsub(URI.regexp, "")
  language_code = CLD.detect_language(clean_content)[:code]

  userdic = nil
  if userdics
    userdic = [language_code, "other"]
              .map { |key| userdics[key] }
              .find { |dictionary| dictionary && File.exist?(dictionary) }
  end
  userdic ||= WebStat::Configure.get["userdic"]

  tag = WebStat::Tag.new("#{page_title} #{page_content}", userdic: userdic)
  {
    title: page_title,
    site_name: site_name,
    content: clean_content,
    language_code: language_code,
    status: @status,
    url: @url,
    last_modified_at: get_last_modified,
    eyecatch_image_path: save_local_path(eyecatch_image_path),
    tags: tag.nouns
  }
end
#title ⇒ String
Get title
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 |
# File 'lib/web_stat/fetch.rb', line 6
# Get the page title.
#
# The document title is split once on the configured "regex_to_sprit_title"
# pattern and the first part is used. When that part is shorter than
# "min_length_of_meta_title", the content of the first <h1> is used
# instead. If any of this fails (no title, no <h1>, ...) the raw document
# title is used. A missing or blank result yields "No Title".
#
# @return [String] stripped title, never blank
def title
  raw_title = @nokogiri.title
  candidate =
    begin
      head = raw_title.split(/#{WebStat::Configure.get["regex_to_sprit_title"]}/, 2).first
      if head.length < WebStat::Configure.get["min_length_of_meta_title"]
        head = @nokogiri.css("h1").first.content
      end
      head
    rescue StandardError
      # Covers a nil/empty title and a missing <h1>: keep the raw title.
      raw_title
    end

  candidate = candidate.to_s.strip
  candidate.empty? ? "No Title" : candidate
end