Module: Searchy

Included in:
Altavista, Bing, Google, GoogleGroups, GoogleProfiles, LinkedIn, Naymz, PGP, Usenet, Yahoo
Defined in:
lib/esearchy/searchy.rb

Instance Method Summary collapse

Instance Method Details

#clean(&block) ⇒ Object



250
251
252
# File 'lib/esearchy/searchy.rb', line 250

# Removes collected e-mail addresses that the caller's block rejects.
#
# The block is forwarded to Array#delete_if, so it receives each entry of
# @emails in turn; entries for which it returns a truthy value are deleted
# in place.
#
# @yield [email] one entry of @emails
# @return [Array] @emails after the deletion
def clean(&block)
  # Forward the block itself. `&block.call` would invoke the block once with
  # no arguments and try to use its *result* as the block.
  @emails.delete_if(&block)
end

#fix(list) ⇒ Object



242
243
244
245
246
247
248
# File 'lib/esearchy/searchy.rb', line 242

# Rewrites obfuscated addresses ("user at host dot com", "user_at_host.com",
# "user @ host.com") into plain "user@host.com" form.
#
# The strings are modified in place. Matching is case-insensitive and accepts
# any single whitespace character around "at", "@" and "dot", mirroring the
# `\s` / `/i` forms that #search_emails extracts.
#
# @param list [Array<String>] mutable strings to normalize
# @return [Array<String>] the same +list+ object
def fix(list)
  list.each do |e|
    e.gsub!(/\sat\s|_at_|\s@\s/i, "@")
    e.gsub!(/\sdot\s/i, ".")
  end
end

#hash_url(url) ⇒ Object



238
239
240
# File 'lib/esearchy/searchy.rb', line 238

# Builds a hex SHA2 digest from the current time and +url+.
#
# Because the current time (as a float) is part of the digested text, two
# calls with the same +url+ normally yield different digests.
#
# @param url [#to_s] value mixed into the digest
# @return [String] hexadecimal digest
def hash_url(url)
  salted = "#{Time.now.to_f}--#{url}"
  Digest::SHA2.hexdigest(salted)
end

#maxhits=(value) ⇒ Object



254
255
256
# File 'lib/esearchy/searchy.rb', line 254

# Writer for the hit limit; the value is stored in @totalhits.
#
# @param value [Object] new value for @totalhits
# @return [Object] +value+
def maxhits=(value)
  @totalhits = value
end

#print_emails(list) ⇒ Object

HELPER METHODS ———————————————————————————



214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
# File 'lib/esearchy/searchy.rb', line 214

# Logs every address in +list+ that is not already present in @emails.
#
# Addresses containing the first dot-separated label of @query (with any "@"
# removed) are highlighted differently from the rest: ANSI red vs. green on
# non-Windows platforms, Wcol colours 12 vs. 2 on mingw/mswin (colour 7 is
# set again afterwards; presumably the console default, confirm against Wcol).
#
# @param list [Array<String>] candidate addresses
# @return [Array<String>] +list+
def print_emails(list)
  on_windows = RUBY_PLATFORM =~ /mingw|mswin/
  needle = nil
  list.each do |email|
    next if @emails.include?(email)

    # Built once, on first use. The label is escaped so that regex
    # metacharacters in the query are matched literally instead of raising
    # RegexpError or changing the meaning of the pattern.
    needle ||= Regexp.new(Regexp.escape(@query.gsub("@", "").split('.')[0].to_s))
    on_target = email.match(needle)

    if on_windows
      Wcol.color(on_target ? 12 : 2)
      ESearchy::LOG.puts email
      Wcol.color(7)
    else
      ESearchy::LOG.puts((on_target ? "\033[31m" : "\033[32m") + email + "\033[0m")
    end
  end
end

#search_depth ⇒ Object



258
259
260
261
262
263
264
265
# File 'lib/esearchy/searchy.rb', line 258

# Runs the per-format document searches over whichever URL lists are set.
#
# @r_pdfs, @r_txts and @r_officexs are handed to their search method when
# present; @r_docs is only searched on mingw/mswin platforms.
def search_depth
  search_pdfs(@r_pdfs) if @r_pdfs
  search_txts(@r_txts) if @r_txts
  search_office_xml(@r_officexs) if @r_officexs
  search_docs(@r_docs) if RUBY_PLATFORM =~ /mingw|mswin/ && @r_docs
end

#search_docs(urls) ⇒ Object



71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
# File 'lib/esearchy/searchy.rb', line 71

# Downloads each Word document in +urls+ on its own thread, extracts its text
# and passes that text to #search_emails.
#
# Text extraction depends on the platform:
# * mingw/mswin: Word through WIN32OLE; if that raises, antiword at
#   C:\antiword\antiword.exe; otherwise the first 20 raw lines of the file.
# * linux/darwin: antiword when found in one of three well-known bin
#   directories; otherwise the first 20 raw lines of the file.
#
# +urls+ is emptied. The method returns only after every thread in @threads
# has been joined.
#
# NOTE: the connection always goes to port 80 without TLS, whatever scheme or
# port the URL carries.
#
# @param urls [Array<String>] document URLs (consumed)
# @return [Array<Thread>] @threads
def search_docs(urls)
  until urls.empty?
    # Pop in the spawning thread: popping inside the worker lets this loop
    # spin and start surplus threads that then receive nil.
    @threads << Thread.new(urls.pop) do |url|
      begin
        web = URI.parse(url.gsub(' ','+'))
        ESearchy::LOG.puts "Searching in DOC: #{web.to_s}\n"
        Net::HTTP.new(web.host,80).start do |conn|
          # request_uri keeps the "?" between path and query string.
          response = conn.request(Net::HTTP::Get.new(web.request_uri))
          case response
          when Net::HTTPSuccess, Net::HTTPRedirection
            name = Searchy::TEMP + "#{hash_url(web.to_s)}.doc"
            begin
              File.open(name, "wb") { |file| file.write(response.body) }
              if RUBY_PLATFORM =~ /mingw|mswin/
                begin
                  word = WIN32OLE.new('word.application')
                  word.documents.open(name)
                  word.selection.wholestory
                  search_emails(word.selection.text.chomp)
                  word.activedocument.close( false )
                  word.quit
                rescue
                  if File.exist?("C:\\antiword\\antiword.exe")
                    search_emails(`C:\\antiword\\antiword.exe "#{name}" -f -s`)
                  else
                    # Crude fallback: scan the first raw lines of the binary
                    # file; only finds addresses without capital letters.
                    ESearchy::LOG.puts "M$ Word|Antiword are not installed. Using the Ghetto way."
                    # File.readlines closes the file, so it can be deleted below.
                    search_emails( File.readlines(name)[0..19].to_s )
                  end
                end
              elsif RUBY_PLATFORM =~ /linux|darwin/
                begin
                  if File.exist?("/usr/bin/antiword") ||
                     File.exist?("/usr/local/bin/antiword") ||
                     File.exist?("/opt/local/bin/antiword")
                    search_emails(`antiword "#{name}" -f -s`)
                  else
                    # Crude fallback: scan the first raw lines of the binary
                    # file; only finds addresses without capital letters.
                    ESearchy::LOG.puts "Antiword is not installed. Using the Ghetto way."
                    search_emails( File.readlines(name)[0..19].to_s )
                  end
                rescue
                  ESearchy::LOG.puts "Something went wrong parsing the .doc\n"
                end
              else
                ESearchy::LOG.puts "This platform is not currently supported."
              end
            ensure
              # Portable cleanup (no shell `rm`), also performed when parsing fails.
              File.delete(name) if File.exist?(name)
            end
          else
            # Raises the exception matching the response; handled below.
            response.error!
          end
        end
      rescue URI::InvalidURIError
        # A malformed URL is reported instead of killing the thread and
        # re-raising out of the join below.
        ESearchy::LOG.puts "Error: Invalid URL.\n"
      rescue Net::HTTPFatalError
        ESearchy::LOG.puts "Error: Something went wrong with the HTTP request.\n"
      rescue Net::HTTPServerException
        ESearchy::LOG.puts "Error: Not longer there. 404 Not Found.\n"
      rescue
        ESearchy::LOG.puts "Error: < .. SocketError .. >\n"
      end
    end
  end
  @threads.each {|t| t.join } if @threads != nil
end

#search_emails(string) ⇒ Object



14
15
16
17
18
19
20
21
22
23
24
25
26
27
# File 'lib/esearchy/searchy.rb', line 14

# Extracts e-mail addresses, plain and lightly obfuscated, from +string+,
# logs the new ones and merges them (normalized by #fix) into @emails.
#
# Recognized shapes, all case-insensitive:
#   user_at_host.tld      user at host.tld      user@host.tld
#   user @ host.tld       user dot name at host dot tld
#
# Printing and merging happen while holding @lock.
#
# @param string [String] text to scan
# @return [Array, nil] result of Array#uniq! on @emails (nil when no
#   duplicate had to be removed)
def search_emails(string)
  # Local-part atom. "#", "$" and "&" are deliberately absent: the previous
  # single-literal form spelled them `#$&`, which Ruby parses as interpolation
  # of the last-match variable (empty here), so they were never accepted.
  atom   = /[a-z0-9!'*+=?^_`{|}~-]+/i
  label  = /[a-z0-9](?:[a-z0-9-]*[a-z0-9])?/i
  tld    = /[a-z](?:[a-z-]*[a-z])?/i
  local  = /#{atom}(?:\.#{atom})*/i
  domain = /(?:#{label}\.)+#{tld}/i
  # Fully spelled-out form. The TLD group is greedy; a lazy `??` here would
  # cut every such match down to the first letter of the TLD.
  spelled = /#{atom}(?:\sdot\s#{atom})*\sat\s(?:#{label}\sdot\s)+#{tld}/i

  pattern = /#{local}_at_#{domain}|#{local}\sat\s#{domain}|#{local}@#{domain}|#{local}\s@\s#{domain}|#{spelled}/i

  list = string.scan(pattern)
  @lock.synchronize do
    # Print before normalizing so the log shows the address as it was found.
    print_emails(list)
    @emails.concat(fix(list)).uniq!
  end
end

#search_office_xml(urls) ⇒ Object



140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
# File 'lib/esearchy/searchy.rb', line 140

# Downloads each zipped office document in +urls+ (docx/xlsx/pptx and the
# OpenDocument odt/odp/ods/odb extensions) on its own thread, concatenates the
# contents of every *.xml entry in the archive and passes that text to
# #search_emails.
#
# +urls+ is emptied. The method returns only after every thread in @threads
# has been joined.
#
# NOTE: the connection always goes to port 80 without TLS, whatever scheme or
# port the URL carries.
#
# @param urls [Array<String>] document URLs (consumed)
# @return [Array<Thread>] @threads
def search_office_xml(urls)
  until urls.empty?
    # Pop in the spawning thread: popping inside the worker lets this loop
    # spin and start surplus threads that then receive nil.
    @threads << Thread.new(urls.pop) do |url|
      begin
        web = URI.parse(url.gsub(' ','+'))
        # URI objects have no #scan; match against the string form. The first
        # known extension found anywhere in the URL wins ("" when none).
        format = web.to_s.scan(/docx|xlsx|pptx|odt|odp|ods|odb/i)[0].to_s
        ESearchy::LOG.puts "Searching in #{format.upcase}: #{web.to_s}\n"
        Net::HTTP.new(web.host,80).start do |conn|
          # request_uri keeps the "?" between path and query string.
          response = conn.request(Net::HTTP::Get.new(web.request_uri))
          case response
          when Net::HTTPSuccess, Net::HTTPRedirection
            name = Searchy::TEMP + "#{hash_url(web.to_s)}." + format
            begin
              File.open(name, "wb") { |file| file.write(response.body) }
              begin
                Zip::ZipFile.open(name) do |zip|
                  xml_entries = zip.entries.select { |e| e.name =~ /\.xml\z/ }
                  text = xml_entries.map { |e| zip.read(e) }.join
                  search_emails(text)
                end
              rescue
                ESearchy::LOG.puts "Something went wrong parsing the .#{format.downcase}\n"
              end
            ensure
              # Portable cleanup (no shell `rm`), also performed on failure.
              File.delete(name) if File.exist?(name)
            end
          else
            # Raises the exception matching the response; handled below.
            response.error!
          end
        end
      rescue URI::InvalidURIError
        # A malformed URL is reported instead of killing the thread and
        # re-raising out of the join below.
        ESearchy::LOG.puts "Error: Invalid URL.\n"
      rescue Net::HTTPFatalError
        ESearchy::LOG.puts "Error: Something went wrong with the HTTP request.\n"
      rescue Net::HTTPServerException
        ESearchy::LOG.puts "Error: Not longer there. 404 Not Found.\n"
      rescue
        ESearchy::LOG.puts "Error: < .. SocketError .. >\n"
      end
    end
  end
  @threads.each {|t| t.join } if @threads != nil
end

#search_pdfs(urls) ⇒ Object



29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# File 'lib/esearchy/searchy.rb', line 29

# Downloads each PDF in +urls+ on its own thread, runs it through PDF::Reader
# with a PageTextReceiver and passes the inspected receiver content to
# #search_emails.
#
# +urls+ is emptied. The method returns only after every thread in @threads
# has been joined.
#
# NOTE: the connection always goes to port 80 without TLS, whatever scheme or
# port the URL carries.
#
# @param urls [Array<String>] PDF URLs (consumed)
# @return [Array<Thread>] @threads
def search_pdfs(urls)
  until urls.empty?
    # Pop in the spawning thread: popping inside the worker lets this loop
    # spin and start surplus threads that then receive nil.
    @threads << Thread.new(urls.pop) do |url|
      begin
        web = URI.parse(url.gsub(' ','+'))
        ESearchy::LOG.puts "Searching in PDF: #{web.to_s}\n"
        Net::HTTP.new(web.host,80).start do |conn|
          # request_uri keeps the "?" between path and query string.
          response = conn.request(Net::HTTP::Get.new(web.request_uri))
          case response
          when Net::HTTPSuccess, Net::HTTPRedirection
            name = ESearchy::TEMP + "#{hash_url(web.to_s)}.pdf"
            begin
              File.open(name, "wb") { |file| file.write(response.body) }
              receiver = PageTextReceiver.new
              PDF::Reader.file(name, receiver)
              search_emails(receiver.content.inspect)
            rescue PDF::Reader::UnsupportedFeatureError
              ESearchy::LOG.puts "Encrypted PDF: Unable to parse it.\n"
            rescue PDF::Reader::MalformedPDFError
              ESearchy::LOG.puts "Malformed PDF: Unable to parse it.\n"
            ensure
              # Portable cleanup (no shell `rm`), also performed on failure.
              File.delete(name) if File.exist?(name)
            end
          else
            # Raises the exception matching the response; handled below.
            response.error!
          end
        end
      rescue URI::InvalidURIError
        # A malformed URL is reported instead of killing the thread and
        # re-raising out of the join below.
        ESearchy::LOG.puts "Error: Invalid URL.\n"
      rescue Net::HTTPFatalError
        ESearchy::LOG.puts "Error: Something went wrong with the HTTP request.\n"
      rescue Net::HTTPServerException
        ESearchy::LOG.puts "Error: Not longer there. 404 Not Found.\n"
      rescue
        ESearchy::LOG.puts "Error: < .. SocketError .. >\n"
      end
    end
  end
  @threads.each {|t| t.join } if @threads != nil
end

#search_txts(urls) ⇒ Object



183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
# File 'lib/esearchy/searchy.rb', line 183

# Downloads each text document in +urls+ on its own thread and passes the
# response body straight to #search_emails.
#
# +urls+ is emptied. The method returns only after every thread in @threads
# has been joined.
#
# NOTE: the connection always goes to port 80 without TLS, whatever scheme or
# port the URL carries.
#
# @param urls [Array<String>] document URLs (consumed)
# @return [Array<Thread>] @threads
def search_txts(urls)
  until urls.empty?
    # Pop in the spawning thread: popping inside the worker lets this loop
    # spin and start surplus threads that then receive nil.
    @threads << Thread.new(urls.pop) do |url|
      begin
        web = URI.parse(url.gsub(' ','+'))
        # Label for the log line only; empty when the URL contains none of
        # the known extensions.
        kind = web.to_s.scan(/txt|rtf|ans/i)[0].to_s.upcase
        ESearchy::LOG.puts "Searching in #{kind}: #{web.to_s}\n"
        Net::HTTP.new(web.host,80).start do |conn|
          # request_uri keeps the "?" between path and query string.
          response = conn.request(Net::HTTP::Get.new(web.request_uri))
          case response
          when Net::HTTPSuccess, Net::HTTPRedirection
            search_emails(response.body)
          else
            # Raises the exception matching the response; handled below.
            response.error!
          end
        end
      rescue URI::InvalidURIError
        # A malformed URL is reported instead of killing the thread and
        # re-raising out of the join below.
        ESearchy::LOG.puts "Error: Invalid URL.\n"
      rescue Net::HTTPFatalError
        ESearchy::LOG.puts "Error: Something went wrong with the HTTP request\n"
      rescue Net::HTTPServerException
        ESearchy::LOG.puts "Error: Not longer there. 404 Not Found.\n"
      rescue
        ESearchy::LOG.puts "Error: < .... >"
      end
    end
  end
  @threads.each {|t| t.join } if @threads != nil
end