Class: WebStat::Fetch
- Inherits:
-
Object
- Object
- WebStat::Fetch
- Defined in:
- lib/web_stat/fetch.rb
Direct Known Subclasses
Constant Summary collapse
# Per-provider thumbnail rules, keyed by provider name.
# Each value is a pair: [pattern matched against the page URL,
# replacement template handed to String#gsub to build the thumbnail URL].
# The dots in the host name are escaped so they only match a literal ".",
# and the pattern is anchored to the start of the string (not of a line).
THUMBNAIL_REGEXS = {
  :youtube => [
    %r{\Ahttps://www\.youtube\.com/watch\?v=([^&]+)},
    'http://img.youtube.com/vi/\1/default.jpg'
  ]
}.freeze
Instance Attribute Summary collapse
-
#html ⇒ Object
Returns the value of attribute html.
-
#nokogiri ⇒ Object
Returns the value of attribute nokogiri.
-
#status ⇒ Object
Returns the value of attribute status.
-
#url ⇒ Object
Returns the value of attribute url.
-
#userdic ⇒ Object
Returns the value of attribute userdic.
Instance Method Summary collapse
-
#content ⇒ Object
Get main section.
-
#eyecatch_image_path ⇒ Object
Get temporary path of image.
-
#get_url(url) ⇒ Object
Get url.
-
#save_local_path(url) ⇒ Object
Get local path to save url.
-
#site_name ⇒ Object
Get name of domain.
-
#stat(userdics: nil) ⇒ Object
Get information about @url.
-
#title ⇒ String
Get title.
Instance Attribute Details
#html ⇒ Object
Returns the value of attribute html.
9 10 11 |
# File 'lib/web_stat/fetch.rb', line 9 def html @html end |
#nokogiri ⇒ Object
Returns the value of attribute nokogiri.
9 10 11 |
# File 'lib/web_stat/fetch.rb', line 9 def nokogiri @nokogiri end |
#status ⇒ Object
Returns the value of attribute status.
9 10 11 |
# File 'lib/web_stat/fetch.rb', line 9 def status @status end |
#url ⇒ Object
Returns the value of attribute url.
9 10 11 |
# File 'lib/web_stat/fetch.rb', line 9 def url @url end |
#userdic ⇒ Object
Returns the value of attribute userdic.
9 10 11 |
# File 'lib/web_stat/fetch.rb', line 9 def userdic @userdic end |
Instance Method Details
#content ⇒ Object
Get main section
42 43 44 |
# File 'lib/web_stat/fetch.rb', line 42
# Get main section.
# Passes the document's <body> node through Readability, then runs the
# extracted content through Sanitize.clean and returns the result.
def content
  body_node = @nokogiri.at('body')
  readable_html = Readability::Document.new(body_node).content
  Sanitize.clean(readable_html)
end
#eyecatch_image_path ⇒ Object
Get temporary path of image
47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 |
# File 'lib/web_stat/fetch.rb', line 47
# Get the URL of the image that represents the page.
#
# Lookup order:
# 1. A THUMBNAIL_REGEXS rule matching @url (result is built from the URL alone).
# 2. The first configured "eyecatch_image_xpaths" expression whose first
#    result responds to #value.
# 3. The first <img> inside the Readability-extracted content.
# 4. The first <img> anywhere in the document.
#
# Root-relative ("/x.png") and protocol-relative ("//host/x.png") results are
# resolved against @url; anything else is returned unchanged.
#
# @return [String, nil] image URL/path, or nil when nothing was found
def eyecatch_image_path
  # If there is a thumbnail rule, apply it; it does not depend on the markup.
  THUMBNAIL_REGEXS.each_value do |pattern, template|
    return @url.gsub(pattern, template) if @url.match(pattern)
  end

  path = nil
  WebStat::Configure.get["eyecatch_image_xpaths"].each do |xpath|
    node = @nokogiri.xpath(xpath).first
    next unless node.respond_to?(:value)
    path = node.value
    break
  end

  # Only pay for the Readability parse when the configured xpaths found nothing.
  if path.nil? || path.empty?
    readable = ::Nokogiri::HTML(Readability::Document.new(@nokogiri.at('body')).content)
    [readable, @nokogiri].each do |doc|
      break unless path.nil? || path.empty?
      img = doc.xpath('//img').first
      path = img.attr('src') if img
    end
  end

  return path if path.nil?

  base = URI.parse(@url)
  if path.start_with?("//")
    # Protocol-relative reference: it already names its host, only the scheme is missing.
    "#{base.scheme}:#{path}"
  elsif path.start_with?("/")
    # Keep an explicit port so pages served on a non-default port resolve correctly.
    authority = base.host.to_s
    authority = "#{authority}:#{base.port}" if base.port && base.port != base.default_port
    "#{base.scheme}://#{authority}#{path}"
  else
    path
  end
end
#get_url(url) ⇒ Object
Get url
96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 |
# File 'lib/web_stat/fetch.rb', line 96
# Get url.
# Fetches the page at url and returns its body, recording the response
# status in @status. The user agent comes from the "user_agent" config entry.
#
# @param url [String] address to fetch
# @return [String] body of the fetched document (or of the error page when
#   Mechanize reports a response-code error)
# @raise [Mechanize::RobotsDisallowedError] when robots.txt disallows url
def get_url(url)
  client = Mechanize.new do |m|
    m.user_agent = WebStat::Configure.get["user_agent"]
  end
  # Honor robots.txt
  client.robots = true

  begin
    raise Mechanize::RobotsDisallowedError.new(url) if client.agent.robots_disallowed?(url)

    if WebStat::Configure.get["use_chromedirver"]
      source = WebStat::WebDriverHelper.get_source(url)
      @status = 200
      source
    else
      page = client.get(url, [], nil, { 'Accept-Language' => 'ja'})
      # Plain Mechanize::File bodies are returned untouched; everything else
      # is transcoded to UTF-8 from the encoding the document reports.
      source =
        if page.class == Mechanize::File
          page.body
        else
          page.body.encode('UTF-8', page.encoding)
        end
      @status = page.code
      source
    end
  rescue Mechanize::ResponseCodeError => error
    # Error responses still carry a page; surface its body and status code.
    failed_page = error.page
    source = failed_page.body
    @status = failed_page.code
    source
  end
end
#save_local_path(url) ⇒ Object
Get local path to save url
78 79 80 81 82 83 84 85 86 87 88 89 90 91 |
# File 'lib/web_stat/fetch.rb', line 78
# Download url into a file under /tmp and return that file's path.
# The file name is the SHA1 hex digest of url.
#
# @param url [String, nil] address of the resource to download
# @return [String, nil] path of the written file, or nil when url is nil or
#   does not start with "http"
def save_local_path(url)
  return nil if url.nil? || !url.start_with?("http")

  tmp_file = "/tmp/#{Digest::SHA1.hexdigest(url)}"
  agent = Mechanize.new { |a| a.user_agent = WebStat::Configure.get["user_agent"] }
  image = agent.get(url)

  # Prefer the IO-backed body when the response offers one; fall back to the
  # in-memory body otherwise.
  data =
    if image.respond_to?(:body_io)
      image.body_io.read
    elsif image.respond_to?(:body)
      image.body
    end

  # write (not puts): puts would append a newline and corrupt binary data.
  File.open(tmp_file, "wb") { |file| file.write(data) if data }
  tmp_file
end
#site_name ⇒ Object
Get name of domain
29 30 31 32 33 34 35 36 37 38 39 40 |
# File 'lib/web_stat/fetch.rb', line 29
# Get name of domain.
# Splits the document title into at most two parts on the regex stored under
# the "regex_to_sprit_title" config key and keeps the last part. If that
# fails for any reason, the whole document title is used instead.
#
# @return [String] the stripped name, or "No Sitename" when there is none
def site_name
  name =
    begin
      @nokogiri.title.split(/#{WebStat::Configure.get["regex_to_sprit_title"]}/, 2).last
    rescue
      @nokogiri.title
    end

  return "No Sitename" if name.nil?

  name.strip
end
#stat(userdics: nil) ⇒ Object
Get information about @url
125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 |
# File 'lib/web_stat/fetch.rb', line 125
# Get information about @url.
#
# @param userdics [Hash, nil] maps a language code (or "other") to the path
#   of a user dictionary file; the entry for the detected language is used
#   when its file exists, then the "other" entry, then the "userdic" config
#   entry
# @return [Hash] with keys :title, :site_name, :content, :language_code,
#   :status, :url, :eyecatch_image_path and :tags
def stat(userdics: nil)
  # title and content are each used several times below; compute them once.
  page_title = title
  page_content = content

  clean_content = page_content.scrub('').gsub(/[\n\t\r ]/, "").gsub(/\s{2,}/, "\s").gsub(URI.regexp, "")
  language_code = CLD.detect_language(clean_content)[:code]

  userdic = nil
  if userdics
    # File.exist? (File.exists? no longer exists in current Ruby versions).
    userdic = [language_code, "other"]
              .map { |key| userdics[key] }
              .find { |dic| dic && File.exist?(dic) }
  end
  userdic ||= WebStat::Configure.get["userdic"]

  tag = WebStat::Tag.new("#{page_title} #{page_content}", userdic: userdic)
  {
    title: page_title,
    site_name: site_name,
    content: clean_content,
    language_code: language_code,
    status: @status,
    url: @url,
    eyecatch_image_path: save_local_path(eyecatch_image_path),
    tags: tag.nouns
  }
end
#title ⇒ String
Get title
12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 |
# File 'lib/web_stat/fetch.rb', line 12
# Get title.
# Splits the document title into at most two parts on the regex stored under
# the "regex_to_sprit_title" config key and keeps the first part. When that
# part is shorter than the "min_length_of_meta_title" config value, the
# content of the first <h1> is used instead. If any of this fails, the whole
# document title is used.
#
# @return [String] the stripped title, or "No Title" when there is none
def title
  heading =
    begin
      separator = /#{WebStat::Configure.get["regex_to_sprit_title"]}/
      candidate = @nokogiri.title.split(separator, 2).first
      if candidate.length < WebStat::Configure.get["min_length_of_meta_title"]
        candidate = @nokogiri.css("h1").first.content
      end
      candidate
    rescue
      @nokogiri.title
    end

  return "No Title" if heading.nil?

  heading.strip
end