Module: DateSearch
- Defined in:
- lib/paperless/date_search.rb
Constant Summary collapse
- SEP_NOSPACE = '\.\/\-\,'
- SEP = '\. \/\-\,'
- DAY = '(\d{1,2})'
- MONTH = '([a-zA-Z]{3,15})'
- YEAR = '(\d{4}|\d{2})'
- END_DATE = '(\s|$)'
Instance Method Summary collapse
- #date_search(text, date_locale) ⇒ Object
- #repair_ocr_string(string) ⇒ Object
- #valid_day(num) ⇒ Object
- #valid_month(num) ⇒ Object
- #valid_year(num) ⇒ Object
Instance Method Details
#date_search(text, date_locale) ⇒ Object
51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 |
# File 'lib/paperless/date_search.rb', line 51

# Searches +text+ for the first recognisable date and builds a DateTime
# from it. Four patterns are tried in order and only the first pattern
# that matches is considered:
#
# 1. numeric    - "12-29-2011" (US) / "29-12-2011" (otherwise)
# 2. month first - "December 29, 2011"
# 3. day first   - "29 December 2011"
# 4. month/year  - "December 2011"
#
# Progress and warnings are written to standard output with +puts+.
#
# @param text [String, nil] the text to scan; +nil+ yields +nil+
# @param date_locale [String] 'us' reads numeric dates as month-day-year,
#   any other value reads them as day-month-year
# @return [DateTime, nil] the discovered date, or +nil+ when nothing
#   matches, the match does not validate, or the date cannot be built
def date_search(text, date_locale)
  return nil if text.nil?

  if (match = text.match(/#{DAY}[#{SEP_NOSPACE}]+#{DAY}[#{SEP_NOSPACE}]+#{YEAR}/))
    # US: 12-29-2011
    # Euro: 29-12-2011
    us_order = date_locale == 'us'
    year = valid_year(match[3])
    day = valid_day(us_order ? match[2] : match[1])
    month = valid_month(us_order ? match[1] : match[2])

    unless month && day && year
      puts "WARNING: The discovered date string does not validate: #{match[0]}"
      return nil
    end

    puts "Basing the date off the discovered string (3): #{match[0]}"
    begin
      DateTime.new(year, month, day)
    rescue StandardError => e
      # e.g. 31 February: each part validates on its own but the date does not exist
      puts "WARNING: Unable to create date object. #{e}"
      nil
    end
  elsif (match = text.match(/#{MONTH}[#{SEP}]{0,3}#{DAY}[#{SEP}]{1,3}#{YEAR}/i))
    # December 29, 2011
    parse_discovered_date(match[0], 1) if valid_day(match[2]) && valid_year(match[3])
  elsif (match = text.match(/#{DAY}[#{SEP}]{0,3}#{MONTH}[#{SEP}]{0,3}#{YEAR}/i))
    # 29 December 2011
    parse_discovered_date(match[0], 2) if valid_day(match[1]) && valid_year(match[3])
  elsif (match = text.match(/#{MONTH}[#{SEP}]{0,3}#{YEAR}/i))
    # December 2011
    parse_discovered_date(match[0], 4) if valid_year(match[2])
  end
end

# Logs the matched string, cleans it with repair_ocr_string and hands it
# to DateTime.parse; returns nil (after logging a warning) when parsing fails.
#
# @param date_string [String] the matched portion of the scanned text
# @param pattern_id [Integer] number of the pattern that matched, used in the log line
# @return [DateTime, nil]
def parse_discovered_date(date_string, pattern_id)
  puts "Basing the date off the discovered string (#{pattern_id}): #{date_string}"
  DateTime.parse(repair_ocr_string(date_string))
rescue StandardError => e
  puts "WARNING: Unable to create date object. #{e}"
  nil
end
private :parse_discovered_date
#repair_ocr_string(string) ⇒ Object
37 38 39 40 41 42 43 44 45 46 47 48 49 |
# File 'lib/paperless/date_search.rb', line 37

# Returns a lower-cased copy of +string+ in which every run of the same
# letter is collapsed to a single letter.
#
# Letters tend to get duplicated during OCR; this tries to undo that.
# Only letters are collapsed, since repeated digits can be legitimate
# (e.g. the "11" in "2011").
#
# The argument itself is left unmodified, so frozen strings are accepted.
#
# @param string [String] text to clean up
# @return [String] the cleaned, lower-cased copy
def repair_ocr_string(string)
  # After downcasing only a-z can be letters in the ASCII range, so
  # squeezing that range is enough; digits, spaces and punctuation are kept.
  string.downcase.squeeze('a-z')
end
#valid_day(num) ⇒ Object
12 13 14 15 |
# File 'lib/paperless/date_search.rb', line 12

# Converts +num+ to an integer and checks that it can be a day of the month.
#
# @param num [#to_i] the candidate day, typically a regex capture
# @return [Integer, nil] the day when it lies in 1..31, otherwise +nil+
def valid_day(num)
  day = num.to_i
  # to_i yields 0 for non-numeric input, and 0 is truthy in Ruby, so the
  # lower bound must be checked explicitly.
  day.between?(1, 31) ? day : nil
end
#valid_month(num) ⇒ Object
17 18 19 20 |
# File 'lib/paperless/date_search.rb', line 17

# Converts +num+ to an integer and checks that it can be a month number.
#
# @param num [#to_i] the candidate month, typically a regex capture
# @return [Integer, nil] the month when it lies in 1..12, otherwise +nil+
def valid_month(num)
  month = num.to_i
  # to_i yields 0 for non-numeric input, and 0 is truthy in Ruby, so the
  # lower bound must be checked explicitly.
  month.between?(1, 12) ? month : nil
end
#valid_year(num) ⇒ Object
22 23 24 25 26 27 28 29 30 31 32 33 34 35 |
# File 'lib/paperless/date_search.rb', line 22

# Converts +num+ to an integer year and checks that it is plausible.
#
# Values below 100 are treated as two-digit years: a value greater than
# the current two-digit year is placed in the 1900s, anything else in the
# 2000s.
#
# @param num [#to_i] the candidate year, typically a regex capture
# @return [Integer, nil] the four-digit year when it lies between 1970 and
#   the current year (both inclusive), otherwise +nil+
def valid_year(num)
  year = num.to_i
  current_year = Time.now.year

  if year < 100
    # Transform a 2 digit year into a 4 digit year.
    # In the 1900s? Need to add 1900. Else add 2000.
    year += (year > current_year - 2000 ? 1900 : 2000)
  end

  # No file can have a date prior to 1970 (1970 itself is allowed) or in
  # a future year.
  year.between?(1970, current_year) ? year : nil
end