Class: Url2mhtml
- Inherits: Object
- Inheritance chain: Object → Url2mhtml
- Defined in:
- lib/url2mhtml.rb
Defined Under Namespace
Classes: ContentInfo
Constant Summary
- VERSION = '0.0.2'
Class Method Summary
- .append_relative_contents(page, content_info_list) ⇒ Object
- .capture(uri) ⇒ Object
- .create_mail(title, parts) ⇒ Object
- .create_mail_part(content) ⇒ Object
- .get_agent ⇒ Object
- .get_content(uri, is_root) ⇒ Object
- .get_contents(uri, is_root, content_info_list) ⇒ Object
- .resolve_relative_uri(base_uri, target_uri) ⇒ Object
Class Method Details
.append_relative_contents(page, content_info_list) ⇒ Object
36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 |
# File 'lib/url2mhtml.rb', line 36
#
# Fetches the sub-resources referenced by an HTML page and appends them to
# +content_info_list+.
#
# References are collected from <img src>, the +background+ attribute of
# <body>/<th>/<td>, the page's frames and iframes, stylesheet <link href>
# and <script src>. Each reference is resolved against +page.uri+; only
# results starting with "http" or "ftp" are kept, fragments ("#...") are
# stripped and duplicates removed. Every remaining URI is fetched through
# get_contents unless +content_info_list+ already holds an entry for it.
#
# page              - fetched page; must respond to uri, search, frames
#                     and iframes.
# content_info_list - Array of already-fetched content entries (each
#                     responding to +uri+); extended by get_contents.
#
# Returns the Array of candidate URIs that were considered.
def self.append_relative_contents(page, content_info_list)
  base_uri = page.uri

  references = page.search('//img').map { |node| node['src'] }
  %w[//body //th //td].each do |xpath|
    references.concat(page.search(xpath).map { |node| node['background'] })
  end
  references.concat(page.frames.map { |frame| frame.uri })
  references.concat(page.iframes.map { |frame| frame.uri })
  references.concat(page.search('link[@rel="stylesheet"]').map { |node| node['href'] })
  references.concat(page.search('script').map { |node| node['src'] })

  # Elements lacking the attribute yield nil; drop them before resolution,
  # otherwise URI.join raises.
  resolved = references.compact.map { |ref| resolve_relative_uri(base_uri, ref) }
  fetchable = resolved.grep(/^(http|ftp)/)
  target_content_urls = fetchable.map { |u| u.gsub(/#.*/, '') }.uniq

  target_content_urls.each do |uri|
    # Checked immediately before each fetch because the recursive
    # get_contents calls keep adding entries to content_info_list.
    next if content_info_list.any? { |content| content.uri.to_s == uri }
    get_contents(uri, false, content_info_list)
  end
end
.capture(uri) ⇒ Object
99 100 101 102 103 104 105 |
# File 'lib/url2mhtml.rb', line 99
#
# Captures +uri+ together with the resources it references and returns the
# result of calling +encoded+ on the mail object built by create_mail.
#
# The mail subject is the title of the entry flagged as root, or
# 'no title' when no such entry exists.
#
# uri - address of the page to capture; passed to get_contents as the root.
def self.capture(uri)
  page_content_list = get_contents(uri, true, [])

  # Enumerable#find invokes its optional argument with #call, so a plain
  # fallback object cannot be passed there; resolve the default explicitly.
  root_content = page_content_list.find { |content| content.is_root }
  title = root_content ? root_content.title : 'no title'

  mail_parts = page_content_list.map { |content| create_mail_part(content) }
  create_mail(title, mail_parts).encoded
end
.create_mail(title, parts) ⇒ Object
83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 |
# File 'lib/url2mhtml.rb', line 83
#
# Builds the outer multipart/related message that wraps all captured parts.
#
# title - value used for the Subject header.
# parts - collection of part objects, pushed onto the message in order.
#
# Returns the assembled TMail::Mail object.
def self.create_mail(title, parts)
  envelope = TMail::Mail.new

  # Headers.
  envelope.from = 'url2MHTML'
  envelope.subject = title
  envelope.date = Time.now
  envelope.mime_version = '1.0'
  envelope['X-MimeOLE'] = 'url2MHTML'

  # Preamble text, then the parts themselves.
  envelope.body = "This is a multi-part message in MIME format.\n"
  parts.each do |part|
    envelope.parts.push(part)
  end

  envelope.content_type = 'multipart/related; type="text/html"'
  envelope
end
.create_mail_part(content) ⇒ Object
67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 |
# File 'lib/url2mhtml.rb', line 67
#
# Wraps one fetched resource in a TMail::Mail part.
#
# The part's Content-Location is +content.uri+ and its content type is
# +content.type+. When the type contains "html" the body is stored as-is
# with an '8bit' transfer encoding; anything else is base64-encoded.
#
# content - object responding to uri, type and body.
#
# Returns the TMail::Mail part.
def self.create_mail_part(content)
  part = TMail::Mail.new
  part['content-location'] = content.uri
  part.content_type = content.type

  if /html/.match(content.type)
    part.transfer_encoding = '8bit'
    part.body = content.body
  else
    part.transfer_encoding = 'base64'
    # pack('m') already breaks its output into 60-character lines, so no
    # further wrapping is required. The previous re-wrap substituted "\1"
    # without a capture group and would have erased any text it matched.
    part.body = [content.body].pack('m').chomp
  end

  part
end
.get_agent ⇒ Object
9 10 11 12 13 14 |
# File 'lib/url2mhtml.rb', line 9
#
# Creates a new WWW::Mechanize agent whose user agent alias is set to
# 'Windows IE 6'.
#
# Returns the configured agent.
def self.get_agent
  browser = WWW::Mechanize.new
  browser.user_agent_alias = 'Windows IE 6'
  browser
end
.get_content(uri, is_root) ⇒ Object
16 17 18 19 20 21 22 23 24 25 |
# File 'lib/url2mhtml.rb', line 16
#
# Fetches a single resource and describes it as a ContentInfo.
#
# uri     - address to fetch with a fresh agent from get_agent.
# is_root - whether this resource is the page the capture started from.
#
# The title is taken from the fetched page only for a root resource whose
# content-type response header contains "html"; otherwise it is 'no title'.
#
# Returns a two-element Array: the ContentInfo and the fetched object.
def self.get_content(uri, is_root)
  fetched = get_agent.get(uri)
  mime_type = fetched.response['content-type']
  payload = fetched.body

  page_title = 'no title'
  page_title = fetched.title if is_root && /html/.match(mime_type)

  [ContentInfo.new(uri, mime_type, payload, is_root, page_title), fetched]
end
.get_contents(uri, is_root, content_info_list) ⇒ Object
27 28 29 30 31 32 33 34 |
# File 'lib/url2mhtml.rb', line 27
#
# Fetches +uri+, appends its ContentInfo to +content_info_list+ and, when
# the content type contains "html", also collects the resources that the
# page references via append_relative_contents.
#
# uri               - address to fetch.
# is_root           - forwarded to get_content.
# content_info_list - accumulator that receives the fetched entries.
#
# Returns +content_info_list+.
def self.get_contents(uri, is_root, content_info_list)
  info, fetched = get_content(uri, is_root)
  content_info_list << info

  if /html/.match(info.type)
    append_relative_contents(fetched, content_info_list)
  end

  content_info_list
end
.resolve_relative_uri(base_uri, target_uri) ⇒ Object
63 64 65 |
# File 'lib/url2mhtml.rb', line 63
#
# Resolves +target_uri+ against +base_uri+ and returns the absolute URI as
# a String.
#
# base_uri   - String or URI the reference is relative to.
# target_uri - reference to resolve (String, URI or nil). Leading and
#              trailing whitespace, as commonly found in HTML attribute
#              values, is removed first; nil is treated as an empty
#              reference, which resolves to +base_uri+ itself.
#
# Raises URI::InvalidURIError if the reference is still not a valid URI.
def self.resolve_relative_uri(base_uri, target_uri)
  URI.join(base_uri.to_s, target_uri.to_s.strip).to_s
end