Class: Hertools::WebsiteParser
- Inherits:
-
Object
- Object
- Hertools::WebsiteParser
- Defined in:
- lib/hertools/website_parser.rb
Overview
Summary: Helps retrieve information about a website from its URL.
Instance Method Summary collapse
-
#crawl_title_and_favicon_file(url, options = {}) ⇒ Object
Summary: Gets the title and favicon file of a web page from its URL.
Instance Method Details
#crawl_title_and_favicon_file(url, options = {}) ⇒ Object
Summary: Gets the title and favicon file of a web page from its URL. Arguments: url: the URL of a web page; options:
html_parser: %w[httparty nokogiri net_http]
root_path: an existing directory
19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 |
# File 'lib/hertools/website_parser.rb', line 19

# Fetches the page at +url+, extracts its title and favicon URL, and stores
# both in a per-site directory named "<root path>/<domain name>_<MD5 of url>":
#
# * website_info.txt          - a single "Title: ..." entry
# * <MD5 of favicon url><ext> - the downloaded favicon bytes
#
# Progress and failures are reported with +puts+. Any StandardError raised
# along the way is printed and turned into a +false+ return value.
#
# @param url [String] address of the web page to inspect
# @param options [Hash] parser settings, handed to the options validator
#   (presumably html_parser and root_path - verify against that helper)
# @return [Boolean] true when the files were written, false otherwise
def crawl_title_and_favicon_file(url, options = {})
  puts '>>> Parsing the arguments <<<'
  unless parse_url(url)
    puts 'Failed because the bad url!'
    return false
  end
  # NOTE(review): the validator's name was lost from the extracted listing;
  # `parse_options` is assumed by analogy with `parse_url` - confirm.
  unless parse_options(options)
    puts 'Failed because the bad options!'
    return false
  end

  puts '>>> Analysing the http response <<<'
  response, res = fetch_crawl_content
  puts "HttpCode: #{response.code}"

  if res.nil? || res.to_s.empty?
    puts 'No content!'
    @title = @domain_name
    @favicon_url = "#{@index_url}/favicon.ico"
    puts "Use the default favicon url: #{@favicon_url}."
  else
    @title = extract_crawl_title(res)
    puts "Title: #{@title}"
    @favicon_url = resolve_crawl_favicon_url(extract_crawl_favicon_href(res))
  end

  # @favicon_url always holds at least the default here, so there is no
  # "nothing found" failure case to handle before storing.
  store_crawl_result
rescue StandardError => e
  puts e
  puts 'Failed because the unexpected exception!'
  false
end

# Downloads @url with the parser named by @html_parser; returns [response, content].
private def fetch_crawl_content
  case @html_parser
  when 'nokogiri'
    # The HEAD request is only used to report the status code.
    response = HTTParty.head(@url)
    document = begin
      # Block form closes the handle; @url matches the other branches.
      URI.open(@url) { |io| Nokogiri::HTML(io, nil, 'UTF-8') }
    rescue StandardError => e
      puts e
      nil
    end
    [response, document]
  when 'httparty'
    response = HTTParty.get(@url)
    [response, response.body]
  else
    response = Net::HTTP.get_response(URI(@url))
    # to_s guards a missing body; scrub keeps the later regexes from raising
    # on bytes that are not valid UTF-8.
    [response, response.body.to_s.force_encoding('utf-8').scrub]
  end
end

# Returns the page title, falling back to @domain_name when none is found.
private def extract_crawl_title(res)
  raw_title = if nokogiri?
                res.xpath('//head/title').first&.content.to_s
              else
                # Non-greedy and multi-line so wrapped titles and <title> tags
                # carrying attributes are matched as well.
                HTMLEntities.new.decode(res[%r{<title\b[^>]*>(.*?)</title>}mi, 1].to_s)
              end
  title = raw_title.gsub(/\s+/, ' ').strip
  return title unless title.empty?

  puts 'Not found the title!'
  puts 'Use the domain name as the title.'
  @domain_name
end

# Returns the raw href of the page's icon link, or '' when there is none.
private def extract_crawl_favicon_href(res)
  if nokogiri?
    icon_link = res.xpath('//head/link[@rel="icon" or @rel="shortcut icon"]').first
    icon_link ? icon_link[:href].to_s : ''
  else
    # Inspect one <link> tag at a time so the href always belongs to the icon
    # link, whatever the attribute order or line layout.
    icon_tag = res.scan(/<link\b[^>]*>/i).find do |tag|
      tag.match?(/\brel\s*=\s*["'](?:shortcut\s+)?icon["']/i)
    end
    icon_tag.to_s[/\bhref\s*=\s*["']([^"']+)/i, 1].to_s
  end
end

# Turns an icon href into an absolute http(s) URL, or the default favicon URL.
private def resolve_crawl_favicon_url(href)
  default_url = "#{@index_url}/favicon.ico"
  href = href.strip
  if href.empty?
    puts 'Not found the favicon url!'
    puts "Use the default favicon url: #{default_url}."
    return default_url
  end

  puts "FaviconUrl: #{href}"
  absolute = begin
    # Handles absolute, protocol-relative, root-relative and page-relative hrefs.
    URI.join(@url, href)
  rescue URI::Error
    nil
  end
  # Anything that is not http(s) (data: URIs, unparsable hrefs) cannot be downloaded.
  unless absolute.is_a?(URI::HTTP)
    puts "Use the default favicon url: #{default_url}."
    return default_url
  end

  resolved = absolute.to_s
  puts "Fixed favicon url: #{resolved}" unless resolved == href
  resolved
end

# Writes the title file and the downloaded favicon into the site directory.
private def store_crawl_result
  identifier = Digest::MD5.hexdigest(@url)
  file_directory_path = "#{@root_path}/#{@domain_name}_#{identifier}"
  puts "FileDirectory: #{file_directory_path}"
  Dir.mkdir(file_directory_path) unless File.directory?(file_directory_path)
  unless File.directory?(file_directory_path)
    puts 'Failed to create the directory!'
    return false
  end

  unless @title.empty?
    info_file_path = "#{file_directory_path}/website_info.txt"
    puts "InfoFilePath: #{info_file_path}"
    File.binwrite(info_file_path, "Title: #{@title}")
  end

  # Take the extension from the URL path only, so query strings and
  # extension-less URLs cannot leak into the file name.
  extension = File.extname(URI(@favicon_url).path.to_s)
  extension = '.ico' if extension.empty?
  favicon_file_path = "#{file_directory_path}/#{Digest::MD5.hexdigest(@favicon_url)}#{extension}"
  puts "FaviconFilePath: #{favicon_file_path}"
  # Download before writing so a failed download leaves no empty file behind.
  favicon_bytes = URI.open(@favicon_url, &:read)
  File.binwrite(favicon_file_path, favicon_bytes)

  puts 'Finished!'
  true
end