Module: DateSearch

Defined in:
lib/paperless/date_search.rb

Constant Summary collapse

# Regex fragments interpolated into the date patterns built by #date_search.

# Separator characters allowed between the parts of an all-numeric date
# (dot, slash, dash, comma). Meant to be placed inside a character class.
SEP_NOSPACE =
'\.\/\-\,'
# Same separators plus a literal space; used around month names.
SEP =
'\. \/\-\,'
# Capture group for a one- or two-digit day (or numeric month).
DAY =
'(\d{1,2})'
# Capture group for a month word: 3 to 15 ASCII letters. It does not check
# that the word is an actual month name.
MONTH =
'([a-zA-Z]{3,15})'
# Capture group for a four-digit or two-digit year.
YEAR =
'(\d{4}|\d{2})'
# Whitespace or end of line. NOTE(review): not referenced by any of the
# methods shown alongside these constants - confirm whether it is still needed.
END_DATE =
'(\s|$)'

Instance Method Summary collapse

Instance Method Details

#date_search(text, date_locale) ⇒ Object



51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# File 'lib/paperless/date_search.rb', line 51

# Looks for the first recognizable date in +text+ and returns it.
#
# Patterns are tried in this order, and only the FIRST pattern that matches
# is considered - if its match fails validation the method returns nil
# without trying the remaining patterns:
#
#   (3) all numeric      12-29-2011 (US) / 29-12-2011 (anything else)
#   (1) month day year   December 29, 2011
#   (2) day month year   29 December 2011
#   (4) month year       December 2011
#
# Progress and warnings are written to stdout with +puts+.
#
# @param text [String] the text to scan
# @param date_locale [String] 'us' reads numeric dates as month-day-year;
#   any other value reads them as day-month-year
# @return [DateTime, nil] the date that was found, or nil when nothing
#   matched, the match did not validate, or the date could not be built
def date_search(text, date_locale)
  label = nil # number of the pattern that produced a validated match
  ymd = nil   # [year, month, day]; only set by the all-numeric pattern

  if (match = text.match(/#{DAY}[#{SEP_NOSPACE}]+#{DAY}[#{SEP_NOSPACE}]+#{YEAR}/))
    # US:   12-29-2011
    # Euro: 29-12-2011
    year = valid_year(match[3])
    day_group, month_group = date_locale == 'us' ? [2, 1] : [1, 2]
    day = valid_day(match[day_group])
    month = valid_month(match[month_group])

    if month && day && year
      label = 3
      ymd = [year, month, day]
    else
      puts "WARNING: The discovered date string does not validate: #{match[0]}"
    end
  elsif (match = text.match(/#{MONTH}[#{SEP}]{0,3}#{DAY}[#{SEP}]{1,3}#{YEAR}/i))
    # December 29, 2011
    label = 1 if valid_day(match[2]) && valid_year(match[3])
  elsif (match = text.match(/#{DAY}[#{SEP}]{0,3}#{MONTH}[#{SEP}]{0,3}#{YEAR}/i))
    # 29 December 2011
    label = 2 if valid_day(match[1]) && valid_year(match[3])
  elsif (match = text.match(/#{MONTH}[#{SEP}]{0,3}#{YEAR}/i))
    # December 2011
    label = 4 if valid_year(match[2])
  end

  return nil unless label

  puts "Basing the date off the discovered string (#{label}): #{match[0]}"
  begin
    # Numeric dates are built from the validated parts; dates containing a
    # month name are cleaned up and handed to the parser as text.
    ymd ? DateTime.new(*ymd) : DateTime.parse(repair_ocr_string(match[0]))
  rescue StandardError => e
    # Impossible dates (e.g. February 30) or unparseable words end up here.
    puts "WARNING: Unable to create date object. #{e}"
    nil
  end
end

#repair_ocr_string(string) ⇒ Object



37
38
39
40
41
42
43
44
45
46
47
48
49
# File 'lib/paperless/date_search.rb', line 37

# Cleans up an OCR'd date string before it is handed to the date parser.
#
# The text is lower-cased and every run of the same ASCII letter is collapsed
# to a single letter, because OCR tends to duplicate letters ("Deecember").
# Digits and punctuation are never collapsed, since repeated digits are
# legitimate ("2011"). Genuinely doubled letters are collapsed as well;
# English month names do not contain any.
#
# @param string [String] the raw text; it is left unmodified
# @return [String] a new lower-cased string with duplicated letters removed
def repair_ocr_string(string)
  # downcase (not downcase!) keeps the caller's string intact and also works
  # on frozen strings; squeeze('a-z') only touches runs of letters.
  string.downcase.squeeze('a-z')
end

#valid_day(num) ⇒ Object



12
13
14
15
# File 'lib/paperless/date_search.rb', line 12

# Converts a captured day string to an Integer and checks that it is a
# possible day of the month.
#
# @param num [#to_i] the captured text, e.g. "29"
# @return [Integer, nil] the day when it lies in 1..31, otherwise nil
def valid_day(num)
  day = num.to_i
  # to_i yields 0 for non-numeric text, so the lower bound rejects junk as
  # well as a literal day 0.
  day.between?(1, 31) ? day : nil
end

#valid_month(num) ⇒ Object



17
18
19
20
# File 'lib/paperless/date_search.rb', line 17

# Converts a captured month string to an Integer and checks that it is a
# real month number.
#
# @param num [#to_i] the captured text, e.g. "12"
# @return [Integer, nil] the month when it lies in 1..12, otherwise nil
def valid_month(num)
  month = num.to_i
  # to_i yields 0 for non-numeric text, so the lower bound rejects junk as
  # well as a literal month 0.
  month.between?(1, 12) ? month : nil
end

#valid_year(num) ⇒ Object



22
23
24
25
26
27
28
29
30
31
32
33
34
35
# File 'lib/paperless/date_search.rb', line 22

# Converts a captured year string to a four-digit Integer and checks that it
# is plausible for a document date.
#
# Two-digit years are expanded relative to the current year: values up to the
# current two-digit year are placed in the current century, larger values in
# the previous one (in 2024, "11" becomes 2011 and "99" becomes 1999).
#
# @param num [#to_i] the captured text, e.g. "2011" or "11"
# @return [Integer, nil] the four-digit year when it lies between 1970 and
#   the current year (both inclusive), otherwise nil
def valid_year(num)
  year = num.to_i
  this_year = Time.now.year

  if year < 100
    # Transform a 2 digit year into a 4 digit year.
    century = this_year - (this_year % 100)
    # Later than the current 2 digit year? Then it belongs to the previous century.
    year += year > this_year % 100 ? century - 100 : century
  end

  # No file can have a date prior to 1970 or in a future year.
  year.between?(1970, this_year) ? year : nil
end