Class: CodeZauker::Util

Inherits:
Object
  • Object
show all
Defined in:
lib/code_zauker.rb

Overview

Basic utility class

Instance Method Summary collapse

Instance Method Details

#ensureUTF8(untrusted_string) ⇒ Object

Ensure Data are correctly imported

See blog.grayproductions.net/articles/ruby_19s_string. This code tries to “guess” the right encoding, switching to ISO-8859-1 if the string is not valid UTF-8. Typical use case: Italian source code wrongly interpreted as UTF-8 when it is actually ISO-8859-1 (Windows) encoded.



63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
# File 'lib/code_zauker.rb', line 63

# Return a UTF-8 version of +untrusted_string+.
#
# A string whose bytes are already valid for its current encoding is
# returned as-is (same object). Otherwise the bytes are interpreted as
# ISO-8859-1 and transcoded to UTF-8, replacing anything that cannot be
# converted. Typical use case: Italian source code saved as ISO-8859-1
# but read as UTF-8.
#
# The argument is never modified, so frozen strings are accepted.
#
# @param untrusted_string [String] text whose encoding tag may be wrong
# @return [String] the original string if valid, otherwise a new UTF-8 string
def ensureUTF8(untrusted_string)
  return untrusted_string if untrusted_string.valid_encoding?

  # Pass the source encoding explicitly instead of calling force_encoding,
  # which would retag the caller's string in place. Options are given as
  # keyword arguments: a positional Hash is not accepted as options on
  # Ruby 3 and later.
  untrusted_string.encode("UTF-8", "ISO-8859-1", invalid: :replace, undef: :replace)
end

#get_lines(filename) ⇒ Object

Obtain the lines of a file given its filename. It works even with PDF files.



88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
# File 'lib/code_zauker.rb', line 88

# Read +filename+ and return its content as an array of lines.
#
# Files recognised by is_pdf? are parsed with PDF::Reader: the text of
# every page is split on "\n" and each piece is stripped of surrounding
# whitespace. Any other file is read with IO#readlines, so its lines are
# returned untouched.
#
# @param filename [String] path of the file to read
# @return [Array<String>] the lines of the file
def get_lines(filename)
  # Plain files need no processing: hand back the raw lines.
  return File.open(filename, "r") { |plain| plain.readlines } unless is_pdf?(filename)

  collected = []
  File.open(filename, "rb") do |stream|
    PDF::Reader.new(stream).pages.each do |page|
      page.text.split("\n").each { |text_line| collected.push(text_line.strip) }
    end
  end
  collected
end

#is_pdf?(filename) ⇒ Boolean

Returns:

  • (Boolean)


82
83
84
# File 'lib/code_zauker.rb', line 82

# Tell whether +filename+ names a PDF, judging only by its suffix.
# The comparison is case-insensitive; the file itself is not opened.
#
# @param filename [String] file name or path
# @return [Boolean] true when the name ends with ".pdf"
def is_pdf?(filename)
  lowered = filename.downcase
  lowered.end_with?(".pdf")
end

#mixCase(trigram) ⇒ Object

Compute all the possible case-mixed variants of a trigram. It works for strings of any length. TODO: Very bad implementation, needs improvement.



26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
# File 'lib/code_zauker.rb', line 26

# Compute every upper/lower-case combination of +trigram+.
#
# The input is downcased first; each character then contributes its
# lower-case and upper-case form. Results are ordered as a binary
# counter over the characters (all lower-case first, all upper-case
# last, right-most character toggling fastest). Characters without a
# distinct upper-case form (digits, punctuation) yield repeated entries,
# so a string of n single-case-length characters always gives 2**n
# results.
#
# The empty string yields [""].
#
# @param trigram [String] text of any length
# @return [Array<String>] all case-mixed variants
def mixCase(trigram)
  case_pairs = trigram.downcase.each_char.map { |ch| [ch, ch.upcase] }

  # Extend every prefix built so far with both forms of the next
  # character. Starting from [""] makes the empty input fall out naturally.
  case_pairs.reduce([""]) do |prefixes, pair|
    prefixes.flat_map { |prefix| pair.map { |form| prefix + form } }
  end
end