Class: Url2mhtml

Inherits:
Object
  • Object
show all
Defined in:
lib/url2mhtml.rb

Defined Under Namespace

Classes: ContentInfo

Constant Summary collapse

VERSION =
'0.0.2'

Class Method Summary collapse

Class Method Details

.append_relative_contents(page, content_info_list) ⇒ Object



36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# File 'lib/url2mhtml.rb', line 36

# Collects every sub-resource referenced by +page+ (images, background
# images, frames, iframes, stylesheets and scripts) and fetches the ones
# that are not yet present in +content_info_list+.
#
# @param page [Object] a fetched HTML page; must respond to +uri+, +search+,
#   +frames+ and +iframes+ (a Mechanize page in the original code)
# @param content_info_list [Array<ContentInfo>] accumulator of everything
#   fetched so far; it is appended to (through +get_contents+)
# @return [Array<String>] the de-duplicated absolute http/ftp URLs that were
#   found on the page
def self.append_relative_contents(page, content_info_list)
  base_uri = page.uri

  # Gather the raw attribute values in the same order as before:
  # images, backgrounds, frames, iframes, stylesheets, scripts.
  raw_uris = page.search('//img').map { |node| node['src'] }
  ['//body', '//th', '//td'].each do |xpath|
    raw_uris.concat(page.search(xpath).map { |node| node['background'] })
  end
  raw_uris.concat(page.frames.map { |frame| frame.uri })
  raw_uris.concat(page.iframes.map { |frame| frame.uri })
  raw_uris.concat(page.search('link[@rel="stylesheet"]').map { |node| node['href'] })
  raw_uris.concat(page.search('script').map { |node| node['src'] })

  # Elements lacking the attribute yield nil, which URI.join cannot handle.
  resolved = raw_uris.compact.map { |raw| resolve_relative_uri(base_uri, raw) }

  # Keep only fetchable schemes and ignore fragments, which do not identify
  # a separate resource.
  target_content_urls = resolved.grep(/^(http|ftp)/).map { |url| url.gsub(/#.*/, '') }.uniq

  target_content_urls.each do |url|
    # Checked at fetch time rather than up front: a recursive get_contents
    # call for an earlier URL may already have pulled this one in.
    next if content_info_list.any? { |content| content.uri.to_s == url }

    get_contents(url, false, content_info_list)
  end
end

.capture(uri) ⇒ Object



99
100
101
102
103
104
105
# File 'lib/url2mhtml.rb', line 99

# Fetches +uri+ together with the resources it references and returns the
# whole set encoded as a single MHTML (multipart/related) message.
#
# @param uri [String] address of the root page to capture
# @return [Object] whatever +encoded+ returns on the mail object built by
#   +create_mail+
def self.capture(uri)
  page_content_list = get_contents(uri, true, [])

  # Enumerable#find's optional argument must respond to #call, so a plain
  # fallback object cannot be passed there; handle the "no root" case here.
  root_content = page_content_list.find { |content| content.is_root }
  title = root_content ? root_content.title : 'no title'

  mail_parts = page_content_list.map { |content| create_mail_part(content) }
  create_mail(title, mail_parts).encoded
end

.create_mail(title, parts) ⇒ Object



83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
# File 'lib/url2mhtml.rb', line 83

# Builds the outer multipart/related message that wraps all captured parts.
#
# @param title [String] used as the message subject
# @param parts [Array] already-built mail parts, attached in the given order
# @return [TMail::Mail] the assembled message
def self.create_mail(title,parts)
  message = TMail::Mail.new

  # Envelope headers.
  message.from = 'url2MHTML'
  message.subject = title
  message.date = Time.now
  message.mime_version = '1.0'
  message['X-MimeOLE'] = 'url2MHTML'

  # Preamble text of the multipart body.
  message.body = "This is a multi-part message in MIME format.\n"

  parts.each do |part|
    message.parts << part
  end

  # Set after the parts have been attached, as in the original ordering.
  message.content_type = 'multipart/related; type="text/html"'
  message
end

.create_mail_part(content) ⇒ Object



67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
# File 'lib/url2mhtml.rb', line 67

# Wraps one fetched resource in a MIME part.
#
# Content whose type mentions "html" is stored verbatim with 8bit transfer
# encoding; everything else is base64 encoded.
#
# @param content [ContentInfo] must respond to +uri+, +type+ and +body+
# @return [TMail::Mail] the part, with its Content-Location set to the
#   resource URI
def self.create_mail_part(content)
  part = TMail::Mail.new

  part['content-location'] = content.uri
  part.content_type = content.type

  if /html/.match(content.type)
    part.transfer_encoding = '8bit'
    part.body = content.body
  else
    part.transfer_encoding = 'base64'
    # pack('m') already breaks the output into 60-character lines, which is
    # within the 76-character MIME limit. The former extra
    # gsub(/.{76}/, "\\1\n") could never match those lines, and it referenced
    # a capture group that does not exist, so a match would have erased data.
    part.body = [content.body].pack('m').chomp
  end
  part
end

.get_agent ⇒ Object



9
10
11
12
13
14
# File 'lib/url2mhtml.rb', line 9

# Creates a fresh Mechanize agent configured with the 'Windows IE 6'
# user-agent alias.
#
# @return [WWW::Mechanize] a new agent on every call
def self.get_agent
  browser = WWW::Mechanize.new
  browser.user_agent_alias = 'Windows IE 6'
  browser
end

.get_content(uri, is_root) ⇒ Object



16
17
18
19
20
21
22
23
24
25
# File 'lib/url2mhtml.rb', line 16

def self.get_content(uri,is_root)
  got_content = get_agent.get(uri)
  type=got_content.response['content-type']
  body=got_content.body
   
  title= ( is_root && /html/.match(type) ) ? got_content.title : 'no title'
   
  =ContentInfo.new(uri,type,body,is_root,title)
  return ,got_content
end

.get_contents(uri, is_root, content_info_list) ⇒ Object



27
28
29
30
31
32
33
34
# File 'lib/url2mhtml.rb', line 27

# Fetches +uri+, records it in +content_info_list+ and, when the resource is
# HTML, also fetches the resources that page references.
#
# @param uri [String] address of the resource to fetch
# @param is_root [Boolean] true for the top-level page of a capture
# @param content_info_list [Array<ContentInfo>] accumulator, modified in place
# @return [Array<ContentInfo>] the same +content_info_list+ that was passed in
def self.get_contents(uri, is_root, content_info_list)
  content_info, got_content = get_content(uri, is_root)

  # Record this resource before descending so that pages referencing each
  # other can see it has already been fetched.
  content_info_list << content_info

  append_relative_contents(got_content, content_info_list) if /html/.match(content_info.type)
  content_info_list
end

.resolve_relative_uri(base_uri, target_uri) ⇒ Object



63
64
65
# File 'lib/url2mhtml.rb', line 63

# Resolves +target_uri+ against +base_uri+ and returns the absolute result.
#
# @param base_uri [#to_s] address of the document containing the reference
# @param target_uri [String, URI] the possibly relative reference
# @return [String] the absolute URI as a string
def self.resolve_relative_uri(base_uri,target_uri)
  base = base_uri.to_s
  absolute = URI.join(base, target_uri)
  absolute.to_s
end