Class: WebStat::Fetch
- Inherits:
-
Object
- Object
- WebStat::Fetch
- Defined in:
- lib/web_stat/fetch.rb
Direct Known Subclasses
Instance Attribute Summary collapse
-
#header ⇒ Object
Returns the value of attribute header.
-
#html ⇒ Object
Returns the value of attribute html.
-
#nokogiri ⇒ Object
Returns the value of attribute nokogiri.
-
#status ⇒ Object
Returns the value of attribute status.
-
#url ⇒ Object
Returns the value of attribute url.
-
#userdic ⇒ Object
Returns the value of attribute userdic.
Instance Method Summary collapse
-
#content ⇒ Object
Get main section.
-
#eyecatch_image_path ⇒ Object
Get the URL (or path) of the page's eyecatch image.
-
#get_last_modified ⇒ Object
Return the later of the Date and Last-Modified response headers.
-
#get_url(url) ⇒ Object
Get url.
-
#save_local_path(url) ⇒ Object
Download the URL and return the local path it was saved to.
-
#site_name ⇒ Object
Get the site name from the page title.
-
#stat(userdics: nil) ⇒ Object
Get information about @url.
-
#title ⇒ String
Get title.
-
#youtube_decscription ⇒ Object
Get the description of a YouTube video.
Instance Attribute Details
#header ⇒ Object
Returns the value of attribute header.
3 4 5 |
# File 'lib/web_stat/fetch.rb', line 3

# Reader for the @header instance variable.
#
# @return [Object] the value currently stored in @header (nil when unset)
def header
  @header
end
#html ⇒ Object
Returns the value of attribute html.
3 4 5 |
# File 'lib/web_stat/fetch.rb', line 3

# Reader for the @html instance variable.
#
# @return [Object] the value currently stored in @html (nil when unset)
def html
  @html
end
#nokogiri ⇒ Object
Returns the value of attribute nokogiri.
3 4 5 |
# File 'lib/web_stat/fetch.rb', line 3

# Reader for the @nokogiri instance variable.
#
# @return [Object] the value currently stored in @nokogiri (nil when unset)
def nokogiri
  @nokogiri
end
#status ⇒ Object
Returns the value of attribute status.
3 4 5 |
# File 'lib/web_stat/fetch.rb', line 3

# Reader for the @status instance variable.
#
# @return [Object] the value currently stored in @status (nil when unset)
def status
  @status
end
#url ⇒ Object
Returns the value of attribute url.
3 4 5 |
# File 'lib/web_stat/fetch.rb', line 3

# Reader for the @url instance variable.
#
# @return [Object] the value currently stored in @url (nil when unset)
def url
  @url
end
#userdic ⇒ Object
Returns the value of attribute userdic.
3 4 5 |
# File 'lib/web_stat/fetch.rb', line 3

# Reader for the @userdic instance variable.
#
# @return [Object] the value currently stored in @userdic (nil when unset)
def userdic
  @userdic
end
Instance Method Details
#content ⇒ Object
Get main section
36 37 38 39 40 41 42 |
# File 'lib/web_stat/fetch.rb', line 36

# Get the main text of the page.
#
# When @url matches the configured "youtube" id-extraction pattern, the
# result of #youtube_decscription is returned. Otherwise the <body> of
# @nokogiri is run through Readability and the result through Sanitize.clean.
#
# @return [Object] the value produced by whichever branch ran
def content
  # The pattern lookup stays inside the `&.` call so it is skipped when @url is nil.
  return youtube_decscription if @url&.match(WebStat::Configure.get["id_extraction_regexs"]["youtube"])

  body_html = @nokogiri.at('body').to_s
  readable_html = Readability::Document.new(body_html).content
  Sanitize.clean(readable_html)
end
#eyecatch_image_path ⇒ Object
Get the URL (or path) of the page's eyecatch image.
57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 |
# File 'lib/web_stat/fetch.rb', line 57

# Get the URL of the image that best represents the page.
#
# Lookup order:
# 1. If @url matches one of the configured "id_extraction_regexs", the
#    provider's "thumbnail_regex" replacement is applied to @url and returned.
# 2. The first configured "eyecatch_image_xpaths" entry whose first node
#    responds to #value.
# 3. The src of the first <img> in the Readability-extracted content.
# 4. The src of the first <img> anywhere in the document.
#
# A non-empty result from steps 2-4 is resolved against @url with URI.join,
# so root-relative ("/a.png"), protocol-relative ("//cdn/a.png") and
# document-relative ("img/a.png") values all become absolute URLs and the
# port of @url is kept. A value URI cannot parse is returned untouched.
#
# @return [String, nil] image URL, or nil / "" when nothing usable was found
def eyecatch_image_path
  # If there is a thumbnail rule, apply it.
  WebStat::Configure.get["id_extraction_regexs"].each do |provider, regex_string|
    next unless @url.match(regex_string)

    return @url.gsub(%r{#{regex_string}.*$}, WebStat::Configure.get["thumbnail_regex"][provider])
  end

  path = nil
  WebStat::Configure.get["eyecatch_image_xpaths"].each do |xpath|
    # Evaluate each XPath only once.
    node = @nokogiri.xpath(xpath).first
    next unless node.respond_to?(:value)

    path = node.value
    break
  end

  if path.nil? || path.empty?
    readability_content = ::Nokogiri::HTML(Readability::Document.new(@nokogiri.at('body').to_s).content)
    # Prefer an image from the extracted article, then any image on the page.
    [readability_content, @nokogiri].each do |doc|
      break unless path.nil? || path.empty?

      image = doc.xpath('//img').first
      path = image.attr('src') if image
    end
  end

  return path if path.nil? || path.empty?

  begin
    URI.join(@url, path).to_s
  rescue URI::Error
    # Keep the raw value rather than failing on an src URI cannot parse.
    path
  end
end
#get_last_modified ⇒ Object
Return the later of the Date and Last-Modified response headers.
140 141 142 143 144 145 146 147 148 149 150 151 152 153 |
# File 'lib/web_stat/fetch.rb', line 140

# Return the most recent timestamp found in the "date" and "last-modified"
# entries of @header.
#
# @header is initialised to an empty hash when it is nil. Each header is
# parsed once; a value DateTime cannot parse (or a nil value) is ignored
# instead of raising, so one malformed header does not abort the caller.
#
# @return [DateTime, nil] the later of the two timestamps, the only one
#   available, or nil when neither header yields a valid date
def get_last_modified
  @header ||= {}

  timestamps = %w[date last-modified].map do |name|
    next unless @header.key?(name)

    begin
      DateTime.parse(@header[name])
    rescue ArgumentError, TypeError
      # Date::Error is an ArgumentError; TypeError covers a nil header value.
      nil
    end
  end

  timestamps.compact.max
end
#get_url(url) ⇒ Object
Get url
108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 |
# File 'lib/web_stat/fetch.rb', line 108

# Fetch +url+ and return the response body.
#
# The page is requested through Mechanize with robots.txt handling enabled
# and an 'Accept-Language: ja' header; the response headers are stored in
# @header. When the "use_chromedirver" setting is truthy the body is taken
# from WebStat::WebDriverHelper.get_source and @status is set to 200. If the
# setting is falsy, or anything in that step raises, the Mechanize body is
# used and @status is the response code reported by Mechanize.
#
# A Mechanize::ResponseCodeError is not propagated: the error page's body
# and code are used instead.
#
# @param url [String] address to fetch
# @return [String] response body
# @raise [Mechanize::RobotsDisallowedError] when robots.txt disallows +url+
def get_url(url)
  agent = Mechanize.new do |client|
    client.user_agent = WebStat::Configure.get["user_agent"]
  end
  # Enable to read Robots.txt
  agent.robots = true

  begin
    raise Mechanize::RobotsDisallowedError.new(url) if agent.agent.robots_disallowed?(url)

    page = agent.get(url, [], nil, { 'Accept-Language' => 'ja'})
    @header = page.header

    begin
      # Raising here routes the "driver disabled" case through the same
      # fallback as a driver failure.
      raise 'not_use_chromedirver' unless WebStat::Configure.get["use_chromedirver"]

      body = WebStat::WebDriverHelper.get_source(url)
      @status = 200
    rescue StandardError
      # A plain Mechanize::File body is used as-is; anything else is transcoded.
      body = page.instance_of?(Mechanize::File) ? page.body : page.body.encode('UTF-8', page.encoding)
      @status = page.code
    end
  rescue Mechanize::ResponseCodeError => error
    error_page = error.page
    body = error_page.body
    @status = error_page.code
  end

  body
end
#save_local_path(url) ⇒ Object
Download the URL and return the local path it was saved to.
88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 |
# File 'lib/web_stat/fetch.rb', line 88

# Download +url+ and store it in a file under /tmp named after the SHA1 of
# the URL.
#
# The payload is written with IO#write. IO#puts would append a newline to
# any body that does not already end in one, which corrupts binary images.
#
# @param url [String, nil] address of the resource to download
# @return [String] path of the written file
# @return [nil] when +url+ is nil or does not start with "http"
# @return [false] when the download or the write raises
def save_local_path(url)
  # \A anchors at the start of the string; ^ would also match after a newline.
  return nil if url.nil? || !url.match(/\Ahttp/)

  tmp_file = "/tmp/#{Digest::SHA1.hexdigest(url)}"
  agent = Mechanize.new { |client| client.user_agent = WebStat::Configure.get["user_agent"] }
  image = agent.get(url)

  File.open(tmp_file, "w+b") do |file|
    if image.class == Mechanize::File
      file.write(image.body)
    elsif image.respond_to?(:body_io)
      file.write(image.body_io.read)
    end
  end

  tmp_file
rescue StandardError
  false
end
#site_name ⇒ Object
Get the site name from the page title.
23 24 25 26 27 28 29 30 31 32 33 34 |
# File 'lib/web_stat/fetch.rb', line 23

# Get the site name from the document title.
#
# The title of @nokogiri is split in two on the configured
# "regex_to_sprit_title" pattern and the last part is used. If anything in
# that step raises, the unsplit title is used instead.
#
# @return [String] the stripped site name, or "No Sitename" when it is nil
def site_name
  name =
    begin
      page_title = @nokogiri.title
      page_title.split(/#{WebStat::Configure.get["regex_to_sprit_title"]}/, 2).last
    rescue StandardError
      @nokogiri.title
    end
  return "No Sitename" if name.nil?

  name.strip
end
#stat(userdics: nil) ⇒ Object
Get information about @url
157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 |
# File 'lib/web_stat/fetch.rb', line 157

# Collect information about @url.
#
# The user dictionary handed to WebStat::Tag is chosen in this order: the
# +userdics+ entry for the detected language code, the +userdics+ entry
# under "other", then the configured "userdic". An entry is used only if
# the file it names exists.
#
# #title and #content are evaluated once and reused, because #content may
# run a Readability pass or a YouTube API request. A nil #content is treated
# as an empty string.
#
# @param userdics [Hash{String=>String}, nil] dictionary paths keyed by
#   language code, with an optional "other" fallback
# @return [Hash] with keys :title, :site_name, :content, :language_code,
#   :status, :url, :last_modified_at, :eyecatch_image_path and :tags
def stat(userdics: nil)
  page_title = title
  raw_content = content.to_s

  # NOTE(review): the first character class is reproduced as rendered here,
  # ending in a plain space; confirm against the original file that it was
  # not a full-width space.
  clean_content = raw_content.scrub('').gsub(/[\n\t\r ]/, "").gsub(/\s{2,}/, "\s").gsub(URI.regexp, "")
  language_code = CLD.detect_language(clean_content)[:code]

  userdic = nil
  if userdics
    # File.exist? replaces File.exists?, which was removed in Ruby 3.2.
    userdic = [language_code, "other"]
              .map { |key| userdics[key] }
              .find { |path| path && File.exist?(path) }
  end
  userdic ||= WebStat::Configure.get["userdic"]

  tag = WebStat::Tag.new("#{page_title} #{raw_content}", userdic: userdic)

  {
    title: page_title,
    site_name: site_name,
    content: clean_content,
    language_code: language_code,
    status: @status,
    url: @url,
    last_modified_at: get_last_modified,
    eyecatch_image_path: save_local_path(eyecatch_image_path),
    tags: tag.nouns
  }
end
#title ⇒ String
Get title
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 |
# File 'lib/web_stat/fetch.rb', line 6

# Get the page title.
#
# The title of @nokogiri is split in two on the configured
# "regex_to_sprit_title" pattern and the first part is used. If that part is
# shorter than "min_length_of_meta_title", the content of the first <h1> is
# used instead. If anything in those steps raises, the unsplit title is used.
#
# @return [String] the stripped title, or "No Title" when it is nil
def title
  heading =
    begin
      page_title = @nokogiri.title
      candidate = page_title.split(/#{WebStat::Configure.get["regex_to_sprit_title"]}/, 2).first
      if candidate.length < WebStat::Configure.get["min_length_of_meta_title"]
        @nokogiri.css("h1").first.content
      else
        candidate
      end
    rescue StandardError
      @nokogiri.title
    end
  return "No Title" if heading.nil?

  heading.strip
end
#youtube_decscription ⇒ Object
Get describe of youtube movie.
45 46 47 48 49 50 51 52 53 54 |
# File 'lib/web_stat/fetch.rb', line 45

# Get the description of the YouTube video that @url points to.
#
# The video id is extracted from @url with the configured "youtube"
# id-extraction pattern and looked up through the YouTube Data API using the
# configured API key.
#
# @return [String, nil] the snippet description; nil when @url is nil or
#   does not match the pattern, or when the API returns no item for the id
def youtube_decscription
  regex_string = WebStat::Configure.get["id_extraction_regexs"]["youtube"]
  return nil unless @url&.match(regex_string)

  id = @url.gsub(%r{#{regex_string}.*$}, '\1')
  youtube = Google::Apis::YoutubeV3::YouTubeService.new
  youtube.key = WebStat::Configure.get["api_keys"]["youtube"]
  response = youtube.list_videos(:snippet, id: id)

  # The items list is empty for an unknown, private or deleted video.
  video = response.items&.first
  video&.snippet&.description
end