Class: WebsiteCloner::Parser
- Inherits: Object
- Defined in: lib/website_cloner/parser.rb
Instance Method Summary
- #initialize(downloader) ⇒ Parser (constructor): returns a new instance of Parser.
- #organize_files ⇒ Object: moves downloaded files into css/, js/, and assets/ directories, renames extensionless subpages, and updates internal references.
- #parse_and_download(content, url) ⇒ Object: downloads a page's assets, rewrites its links to local paths, and returns newly discovered internal pages.
Constructor Details
#initialize(downloader) ⇒ Parser
Returns a new instance of Parser.
# File 'lib/website_cloner/parser.rb', line 6

def initialize(downloader)
  @downloader = downloader
  @file_mapping = {}
end
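A minimal construction sketch, assuming a WebsiteCloner::Downloader collaborator. Only the methods this file actually calls on it (output_dir and download_asset) are known, so the constructor arguments below are assumptions:

require 'website_cloner'

# Hypothetical downloader setup: the parser only needs an object that
# responds to #output_dir and #download_asset(url, type).
downloader = WebsiteCloner::Downloader.new('https://example.com', 'output')
parser = WebsiteCloner::Parser.new(downloader)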
Instance Method Details
#organize_files ⇒ Object
# File 'lib/website_cloner/parser.rb', line 90

def organize_files
  Dir.glob(File.join(@downloader.output_dir, '**', '*')).each do |file|
    next if File.directory?(file)

    relative_path = file.sub(@downloader.output_dir + '/', '')
    dirname = File.dirname(relative_path)
    basename = File.basename(relative_path)

    if dirname.match?(/^[0-9a-f]+$/)
      # Asset stored under a hex-hash directory: decode the name and
      # sort it into css/, js/, or assets/ by extension.
      new_basename = URI.decode_www_form_component(basename).gsub('%20', '-')
      new_path = case
                 when new_basename.end_with?('.css')
                   File.join(@downloader.output_dir, 'css', new_basename.gsub(/^[0-9a-f]+_/, ''))
                 when new_basename.end_with?('.js')
                   File.join(@downloader.output_dir, 'js', new_basename.gsub(/^[0-9a-f]+_/, ''))
                 else
                   File.join(@downloader.output_dir, 'assets', new_basename.gsub(/^[0-9a-f]+_/, ''))
                 end
      FileUtils.mv(file, new_path)
      @file_mapping["/#{relative_path}"] = new_path.sub(@downloader.output_dir + '/', '')
    elsif !basename.include?('.') && !dirname.start_with?('css', 'js', 'assets')
      # This is likely a subpage without an extension
      new_path = "#{file}.html"
      FileUtils.mv(file, new_path)
      @file_mapping["/#{relative_path}"] = new_path.sub(@downloader.output_dir + '/', '')
    end
  end

  update_references
end
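To make the renaming rules concrete, here is a standalone sketch of the path logic for a single file. The hex directory name and file name are invented for illustration; the real names come from the downloader:

require 'uri'

output_dir = 'output'
# Suppose the downloader saved a stylesheet under a hex-named directory:
relative_path = '3f2a9c/3f2a9c_main%20styles.css'
basename = File.basename(relative_path)            # => "3f2a9c_main%20styles.css"

decoded = URI.decode_www_form_component(basename)  # => "3f2a9c_main styles.css"
cleaned = decoded.gsub('%20', '-')                 # no-op here: %20 was already decoded above
new_path = File.join(output_dir, 'css', cleaned.gsub(/^[0-9a-f]+_/, ''))
# => "output/css/main styles.css"

# organize_files would then record the move so update_references can
# rewrite old URLs:
#   @file_mapping["/3f2a9c/3f2a9c_main%20styles.css"] = "css/main styles.css"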
#parse_and_download(content, url) ⇒ Object
# File 'lib/website_cloner/parser.rb', line 11

def parse_and_download(content, url)
  doc = Nokogiri::HTML(content)
  base_uri = URI.parse(url)

  # Ensure the path is valid and not empty
  path = base_uri.path.empty? || base_uri.path == '/' ? '/index.html' : base_uri.path

  # Calculate the depth of the directory structure
  depth = [path.count('/') - 1, 0].max
  prefix = '../' * depth

  new_pages = []

  # Download and update image sources
  doc.css('img').each do |img|
    src = img['src']
    next if src.nil? || src.empty?

    new_src = @downloader.download_asset(src, 'image')
    img['src'] = prefix + new_src # Add the correct prefix for assets in subdirs
    @file_mapping[src] = new_src

    # Remove srcset attribute to prevent loading from CDN
    img.remove_attribute('srcset')
    img.remove_attribute('imagesrcset')

    # Update sizes attribute if present
    img['sizes'] = '100vw' if img['sizes']
  end

  # Download and update stylesheet links
  doc.css('link[rel="stylesheet"]').each do |link|
    href = link['href']
    next if href.nil? || href.empty?

    new_href = @downloader.download_asset(href, 'css')
    link['href'] = prefix + new_href # Add the correct prefix for assets in subdirs
    @file_mapping[href] = new_href
  end

  # Download and update script sources
  doc.css('script').each do |script|
    src = script['src']
    next if src.nil? || src.empty?

    new_src = @downloader.download_asset(src, 'js')
    script['src'] = prefix + new_src # Add the correct prefix for assets in subdirs
    @file_mapping[src] = new_src
  end

  # Handle internal links starting with '/'
  doc.css('a').each do |a|
    href = a['href']
    next if href.nil? || href.empty?

    # Target only internal links that start with '/'
    if href.start_with?('/')
      # Add the new URL to new_pages for downloading before modification
      new_pages << URI.join(base_uri, href).to_s

      if href == '/'
        # Special handling for homepage
        a['href'] = prefix + 'index.html'
      else
        # Remove leading '/' for saving the local file
        href.sub!(/^\//, '')
        # Append '.html' if it's missing and not a file download (like .pdf)
        href += '.html' unless href =~ /\.\w+$/
        # Update the href attribute
        a['href'] = href
      end
    end
  end

  # Save the updated HTML
  save_html(doc.to_html, url)

  new_pages
end
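Since the method returns the absolute URLs of internal links it rewrote, it is designed to drive a crawl loop. A hedged end-to-end sketch follows; fetching pages via Net::HTTP and the downloader constructor arguments are assumptions, not part of this file:

require 'net/http'
require 'uri'
require 'website_cloner'

downloader = WebsiteCloner::Downloader.new('https://example.com', 'output') # hypothetical signature
parser = WebsiteCloner::Parser.new(downloader)

queue   = ['https://example.com/']
visited = []

until queue.empty?
  url = queue.shift
  next if visited.include?(url)
  visited << url

  html = Net::HTTP.get(URI(url))
  # parse_and_download saves the rewritten page locally and returns the
  # absolute URLs of internal links, which feed later crawl iterations.
  queue.concat(parser.parse_and_download(html, url))
end

# Once every page is processed, tidy the output tree and fix references.
parser.organize_files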