Class: WebStat::Fetch

Inherits:

Object

Object
WebStat::Fetch

show all

Defined in: lib/web_stat/fetch.rb

Direct Known Subclasses

FetchAsHtml, FetchAsWeb

Instance Attribute Summary collapse

#header ⇒ Object

Returns the value of attribute header.
#html ⇒ Object

Returns the value of attribute html.
#nokogiri ⇒ Object

Returns the value of attribute nokogiri.
#status ⇒ Object

Returns the value of attribute status.
#url ⇒ Object

Returns the value of attribute url.
#userdic ⇒ Object

Returns the value of attribute userdic.

Instance Method Summary collapse

#content ⇒ Object

Get main section.
#eyecatch_image_path ⇒ Object

Get temporary path of image.
#get_last_modified ⇒ Object

Return Date or last modified header.
#get_url(url) ⇒ Object

Get url.
#save_local_path(url) ⇒ Object

Get local path to save url.
#site_name ⇒ Object

Get name of domain.
#stat(userdics: nil) ⇒ Object

Get information about @url.
#title ⇒ String

Get title.

Instance Attribute Details

#header ⇒ `Object`

Returns the value of attribute header.



3
4
5

# File 'lib/web_stat/fetch.rb', line 3

# Reader for @header.
# get_url stores the fetched document's header here, and get_last_modified
# looks up the "date" and "last-modified" keys in it.
def header
  @header
end

#html ⇒ `Object`

Returns the value of attribute html.



3
4
5

# File 'lib/web_stat/fetch.rb', line 3

# Reader for @html.
# NOTE(review): none of the methods shown here assign @html — presumably the
# raw page markup set by an initializer or subclass; confirm.
def html
  @html
end

#nokogiri ⇒ `Object`

Returns the value of attribute nokogiri.



3
4
5

# File 'lib/web_stat/fetch.rb', line 3

# Reader for @nokogiri, the object the other methods query through
# #at, #xpath, #css and #title.
# NOTE(review): presumably a parsed Nokogiri document; it is not assigned in
# the methods shown here — confirm where it is built.
def nokogiri
  @nokogiri
end

#status ⇒ `Object`

Returns the value of attribute status.



3
4
5

# File 'lib/web_stat/fetch.rb', line 3

# Reader for @status.
# get_url sets it to 200 after a successful WebDriver fetch, otherwise to the
# code reported by the Mechanize document (or by the error page on a
# Mechanize::ResponseCodeError).
def status
  @status
end

#url ⇒ `Object`

Returns the value of attribute url.



3
4
5

# File 'lib/web_stat/fetch.rb', line 3

# Reader for @url, the page address that eyecatch_image_path resolves image
# paths against and that stat reports under :url.
def url
  @url
end

#userdic ⇒ `Object`

Returns the value of attribute userdic.



3
4
5

# File 'lib/web_stat/fetch.rb', line 3

# Reader for @userdic.
# NOTE(review): not assigned in the methods shown here; presumably a user
# dictionary path handed to the tagger — confirm.
def userdic
  @userdic
end

Instance Method Details

#content ⇒ `Object`

Get main section



36
37
38

# File 'lib/web_stat/fetch.rb', line 36

# Extract the main section of the page.
#
# The document's <body> is serialized, handed to Readability to isolate the
# main content, and the result is passed through Sanitize.clean.
#
# @return [Object] whatever Sanitize.clean returns for the extracted content
def content
  body_markup = @nokogiri.at('body').to_s
  main_section = Readability::Document.new(body_markup).content
  Sanitize.clean(main_section)
end

#eyecatch_image_path ⇒ `Object`

Get temporary path of image

# File 'lib/web_stat/fetch.rb', line 41

# Determine the URL of the page's eyecatch (representative) image.
#
# Lookup order:
#   1. a configured "thumbnail_regex" rule whose pattern matches @url
#      (the rewritten URL is returned immediately);
#   2. the first configured "eyecatch_image_xpaths" expression whose first
#      node responds to #value;
#   3. the first <img> inside the Readability-extracted main content;
#   4. the first <img> anywhere in the document.
#
# Paths starting with "/" are expanded against @url: "//host/x" inherits the
# page's scheme, and "/x" is prefixed with the page's scheme, host and any
# non-default port. Every other value is returned untouched.
#
# @return [String, nil] the image URL/path, or nil when nothing was found
def eyecatch_image_path
  # A thumbnail rule wins over anything found in the markup, so check it
  # first and skip the document queries entirely when one applies.
  WebStat::Configure.get["thumbnail_regex"].each do |_provider, rule|
    return @url.gsub(rule[0], rule[1]) if @url.match(rule[0])
  end

  path = nil
  WebStat::Configure.get["eyecatch_image_xpaths"].each do |xpath|
    node = @nokogiri.xpath(xpath).first
    next unless node.respond_to?(:value)

    path = node.value
    break
  end

  # Only pay for the Readability pass when the configured XPaths gave nothing.
  if path.nil? || path.empty?
    readable = ::Nokogiri::HTML(Readability::Document.new(@nokogiri.at('body').to_s).content)
    image = readable.xpath('//img').first
    path = image.attr('src') if image
  end
  if path.nil? || path.empty?
    image = @nokogiri.xpath('//img').first
    path = image.attr('src') if image
  end

  return path if path.nil? || !path.start_with?("/")

  base = URI.parse(@url)
  # Protocol-relative reference: only the scheme comes from the page URL.
  return "#{base.scheme}:#{path}" if path.start_with?("//")

  origin = "#{base.scheme}://#{base.host}"
  # Keep an explicit port (e.g. :3000); omit it when it is the scheme default.
  origin = "#{origin}:#{base.port}" if base.port && base.port != base.default_port
  "#{origin}#{path}"
end

#get_last_modified ⇒ `Object`

Return Date or last modified header.

Parameters:

None — the method takes no arguments and reads the response headers stored in @header.

Returns:

DateTime

# File 'lib/web_stat/fetch.rb', line 122

# Pick the later of the "date" and "last-modified" response headers.
#
# Initializes @header to an empty Hash when it is unset. Only headers that
# are present are parsed; when both parse to the same instant the "date"
# value is the one returned.
#
# @return [DateTime, nil] the most recent timestamp, or nil when neither
#   header is present
def get_last_modified
  @header ||= {}
  timestamps = %w[date last-modified]
               .select { |name| @header.has_key?(name) }
               .map { |name| DateTime.parse(@header[name]) }
  # Array#max keeps the first element on a tie, so "date" wins when equal.
  timestamps.max
end

#get_url(url) ⇒ `Object`

Get url

Parameters:

url (String)
Returns: body (String) — the body of the fetched document

# File 'lib/web_stat/fetch.rb', line 90

# Fetch +url+ and return its body, recording @header and @status on the way.
#
# The request is made with Mechanize using the configured user agent and
# with robots.txt handling enabled. When the "use_chromedirver" setting is
# truthy the body is taken from WebStat::WebDriverHelper.get_source and
# @status is set to 200; if that setting is off, or the helper raises, the
# Mechanize document is used instead (re-encoded to UTF-8 unless it is a
# plain Mechanize::File) and @status is the document's code.
#
# @param url [String] address to fetch
# @return [String] the document body; for a Mechanize::ResponseCodeError the
#   error page's body is returned and @status is that page's code
# @raise [Mechanize::RobotsDisallowedError] when robots.txt disallows +url+
def get_url(url)
  mech = Mechanize.new { |client| client.user_agent = WebStat::Configure.get["user_agent"] }
  # Honor robots.txt.
  mech.robots = true
  begin
    raise Mechanize::RobotsDisallowedError.new(url) if mech.agent.robots_disallowed?(url)

    document = mech.get(url, [], nil, { 'Accept-Language' => 'ja'})
    @header = document.header

    # Try the WebDriver first; any failure there silently falls back to the
    # Mechanize response below.
    rendered = false
    begin
      if WebStat::Configure.get["use_chromedirver"]
        body = WebStat::WebDriverHelper.get_source(url)
        @status = 200
        rendered = true
      end
    rescue
      rendered = false
    end

    unless rendered
      body =
        if document.class == Mechanize::File
          document.body
        else
          document.body.encode('UTF-8', document.encoding)
        end
      @status = document.code
    end
  rescue Mechanize::ResponseCodeError => e
    body = e.page.body
    @status = e.page.code
  end
  body
end

#save_local_path(url) ⇒ `Object`

Get local path to save url

Parameters:

url (String)

# File 'lib/web_stat/fetch.rb', line 72

# Download +url+ into a temporary file and return that file's path.
#
# The file name is the SHA1 hex digest of +url+ under /tmp, so the same URL
# always maps to the same path. The payload is written with #write so the
# bytes on disk are exactly the downloaded bytes (IO#puts would append a
# newline to binary data that does not already end in one).
#
# @param url [String, nil] address of the resource to download
# @return [String, nil] path of the saved file, or nil when +url+ is nil or
#   does not start with "http"
def save_local_path(url)
  return nil if url.nil? || !url.start_with?("http")

  tmp_file = "/tmp/#{Digest::SHA1.hexdigest(url)}"
  agent = Mechanize.new { |client| client.user_agent = WebStat::Configure.get["user_agent"] }
  image = agent.get(url)
  File.open(tmp_file, "w+b") do |file|
    if image.class == Mechanize::File
      file.write(image.body)
    elsif image.respond_to?(:body_io)
      # Objects exposing body_io hand over their payload as an IO.
      file.write(image.body_io.read)
    end
  end
  tmp_file
end

#site_name ⇒ `Object`

Get name of domain

# File 'lib/web_stat/fetch.rb', line 23

# Derive the site name from the document title.
#
# The title is split once on the configured "regex_to_sprit_title" pattern
# and the trailing part is taken as the site name. If splitting raises
# (e.g. the title is nil) the whole title is used instead. A nil or blank
# result yields the "No Sitename" placeholder, so callers never receive an
# empty string.
#
# @return [String] the stripped site name, or "No Sitename"
def site_name
  full_title = @nokogiri.title
  name =
    begin
      full_title.split(/#{WebStat::Configure.get["regex_to_sprit_title"]}/, 2).last
    rescue
      full_title
    end
  name = name.to_s.strip
  name.empty? ? "No Sitename" : name
end

#stat(userdics: nil) ⇒ `Object`

Get information about @url

Parameters:

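userdics (Hash) —

a user dictionary path for each language code, with an optional "other" entry used when the detected language has no dictionary of its own. Example: { "ja" => "/path/to/ja.dic", "other" => "/path/to/other.dic" }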

# File 'lib/web_stat/fetch.rb', line 139

# Collect the statistics of the page at @url.
#
# The main content is cleaned (invalid bytes, line breaks, tabs and
# full-width spaces removed, whitespace runs collapsed, URIs stripped) and
# its language detected with CLD. Tags are extracted from the title plus the
# uncleaned content using the first dictionary that applies:
#   1. userdics[language_code], if that file exists;
#   2. userdics["other"], if that file exists;
#   3. the configured "userdic".
#
# @param userdics [Hash, nil] dictionary path per language code, with an
#   optional "other" fallback entry
# @return [Hash] with keys :title, :site_name, :content, :language_code,
#   :status, :url, :last_modified_at, :eyecatch_image_path and :tags
def stat(userdics: nil)
  # title and content each re-parse the document, so compute them once.
  page_title = title
  page_content = content
  clean_content = page_content.scrub('')
                              .gsub(/[\n\t\r　]/, "")
                              .gsub(/\s{2,}/, " ")
                              .gsub(URI::DEFAULT_PARSER.make_regexp, "")
  language_code = CLD.detect_language(clean_content)[:code]

  # File.exist? is used because File.exists? no longer exists on Ruby 3.2+.
  dictionary_key = userdics && [language_code, "other"].find do |key|
    userdics.has_key?(key) && File.exist?(userdics[key])
  end
  userdic = dictionary_key ? userdics[dictionary_key] : WebStat::Configure.get["userdic"]
  tag = WebStat::Tag.new("#{page_title} #{page_content}", userdic: userdic)

  {
    title: page_title,
    site_name: site_name,
    content: clean_content,
    language_code: language_code,
    status: @status,
    url: @url,
    last_modified_at: get_last_modified,
    eyecatch_image_path: save_local_path(eyecatch_image_path),
    tags: tag.nouns
  }
end

#title ⇒ `String`

Get title

Returns:

(String) —

title

# File 'lib/web_stat/fetch.rb', line 6

# Get the page title.
#
# The document title is split once on the configured "regex_to_sprit_title"
# pattern and the leading part is used. When that part is shorter than
# "min_length_of_meta_title", the content of the first <h1> replaces it. If
# any of this raises (nil title, no <h1>, ...) the full document title is
# used instead. A nil or blank result yields the "No Title" placeholder, so
# callers never receive an empty string.
#
# @return [String] the stripped title, or "No Title"
def title
  full_title = @nokogiri.title
  candidate =
    begin
      head = full_title.split(/#{WebStat::Configure.get["regex_to_sprit_title"]}/, 2).first
      if head.length < WebStat::Configure.get["min_length_of_meta_title"]
        # Too short to be meaningful: prefer the first heading.
        head = @nokogiri.css("h1").first.content
      end
      head
    rescue
      full_title
    end
  candidate = candidate.to_s.strip
  candidate.empty? ? "No Title" : candidate
end

Class: WebStat::Fetch

Direct Known Subclasses

Instance Attribute Summary collapse

Instance Method Summary collapse

Instance Attribute Details

#header ⇒ Object

#html ⇒ Object

#nokogiri ⇒ Object

#status ⇒ Object

#url ⇒ Object

#userdic ⇒ Object

Instance Method Details

#content ⇒ Object

#eyecatch_image_path ⇒ Object

#get_last_modified ⇒ Object

#get_url(url) ⇒ Object

#save_local_path(url) ⇒ Object

#site_name ⇒ Object

#stat(userdics: nil) ⇒ Object

#title ⇒ String

#header ⇒ `Object`

#html ⇒ `Object`

#nokogiri ⇒ `Object`

#status ⇒ `Object`

#url ⇒ `Object`

#userdic ⇒ `Object`

#content ⇒ `Object`

#eyecatch_image_path ⇒ `Object`

#get_last_modified ⇒ `Object`

#get_url(url) ⇒ `Object`

#save_local_path(url) ⇒ `Object`

#site_name ⇒ `Object`

#stat(userdics: nil) ⇒ `Object`

#title ⇒ `String`