Class: TdStatementExtractor::Statement

Inherits:
Object
  • Object
show all
Defined in:
lib/td_statement_extractor/statement.rb

Defined Under Namespace

Classes: GhostscriptNotInstalledError, InvalidDayError, InvalidMonthError, InvalidStatementDateError, MissingAmountError, MissingDateError, MissingDescriptionError

Constant Summary collapse

# Matches a long-form date such as "January 20, 2024": a capitalized word,
# a day number, a comma and a four-digit year. Named captures:
# statement_date (the whole match), month, day and year.
STATEMENT_DATE =
/(?<statement_date>(?<month>[A-Z][a-z]+)\s(?<day>[0-9]+),\s(?<year>[0-9]{4}))/
# Matches an upper-case three-letter month abbreviation (capture: month).
MONTH =
/(?<month>JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)/
# Matches one or more digits (capture: day). No range check is done here;
# transform_date validates the value.
DAY =
/(?<day>[0-9]+)/
# Matches a short date such as "JAN 5": MONTH, one whitespace character, DAY
# (capture: date).
DATE =
/(?<date>#{MONTH}\s#{DAY})/
# Matches a dollar amount with an optional leading minus sign, e.g. "$4.50"
# or "-$1,234.56" (capture: amount).
AMOUNT =
/(?<amount>-?\$[,\d]+\.\d+)/
# Matches a whole transaction line: a DATE, whitespace, an optional second
# DATE, whitespace, free text (capture: description), whitespace and an AMOUNT.
# DATE is interpolated twice, so the date/month/day capture names occur more
# than once in this pattern; transaction_from_line reads only :description.
DESCRIPTION =
/#{DATE}\s+#{DATE}?\s+(?<description>.+)\s+#{AMOUNT}/

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(input_file_path, debug_mode = false) ⇒ Statement

Returns a new instance of Statement.



16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
# File 'lib/td_statement_extractor/statement.rb', line 16

# Loads a statement PDF and parses its transactions.
#
# Checks that Ghostscript is available, writes a pre-processed temporary copy
# of the PDF, reads its text, extracts the statement date and collects one
# hash per recognized transaction line into @transactions. The temporary file
# is removed afterwards whether or not parsing succeeded; any error raised by
# the steps above propagates to the caller after that cleanup.
#
# @param input_file_path [String] path to the statement PDF
# @param debug_mode [Boolean] when true, prints every line containing a word
#   character, prefixed with "T" (transaction) or "F" (not a transaction)
def initialize(input_file_path, debug_mode = false)
  @input_file_path = input_file_path
  @debug_mode = debug_mode

  self.class.check_for_ghostscript

  pre_process_pdf
  import_pdf
  extract_statement_date

  @transactions = text.each_line.map do |line|
    is_transaction = transaction_line?(line)
    puts "#{is_transaction ? "T" : "F"} - #{line.strip}" if @debug_mode && line.match?(/\w+/)
    next unless is_transaction

    data = transaction_from_line(line)
    data[:date] = transform_date(data[:date], statement_date)
    data
  end.compact
ensure
  # The temp file may never have been written (for example when Ghostscript
  # failed). Deleting it unconditionally would raise Errno::ENOENT from this
  # ensure clause and hide the error that actually caused the failure.
  File.delete(temp_file_path) if temp_file_path && File.exist?(temp_file_path)
end

Instance Attribute Details

#debug_mode ⇒ Object

Returns the value of attribute debug_mode.



14
15
16
# File 'lib/td_statement_extractor/statement.rb', line 14

# Reader for @debug_mode, which the constructor sets from its debug_mode
# argument.
def debug_mode
  @debug_mode
end

#input_file_path ⇒ Object

Returns the value of attribute input_file_path.



14
15
16
# File 'lib/td_statement_extractor/statement.rb', line 14

# Reader for @input_file_path, the path given to the constructor.
def input_file_path
  @input_file_path
end

#pdf ⇒ Object

Returns the value of attribute pdf.



14
15
16
# File 'lib/td_statement_extractor/statement.rb', line 14

# Reader for @pdf, the PDF::Reader instance created by import_pdf
# (nil until import_pdf has run).
def pdf
  @pdf
end

#statement_date ⇒ Object

Returns the value of attribute statement_date.



14
15
16
# File 'lib/td_statement_extractor/statement.rb', line 14

# Reader for @statement_date, the matched date string stored by
# extract_statement_date.
def statement_date
  @statement_date
end

#temp_file_path ⇒ Object

Returns the value of attribute temp_file_path.



14
15
16
# File 'lib/td_statement_extractor/statement.rb', line 14

# Reader for @temp_file_path, the path of the pre-processed PDF chosen by
# pre_process_pdf (nil until pre_process_pdf has run).
def temp_file_path
  @temp_file_path
end

#text ⇒ Object

Returns the value of attribute text.



14
15
16
# File 'lib/td_statement_extractor/statement.rb', line 14

# Reader for @text, the concatenated page text stored by import_pdf.
def text
  @text
end

#transactions ⇒ Object

Returns the value of attribute transactions.



14
15
16
# File 'lib/td_statement_extractor/statement.rb', line 14

# Reader for @transactions, the array of transaction hashes
# (:date, :description, :amount) built by the constructor.
def transactions
  @transactions
end

Class Method Details

.check_for_ghostscript ⇒ Object



105
106
107
# File 'lib/td_statement_extractor/statement.rb', line 105

# Verifies that a `gs` executable can be located with `which`.
#
# @return [nil] when `which gs` prints a location
# @raise [GhostscriptNotInstalledError] when `which gs` prints nothing
def self.check_for_ghostscript
  gs_location = `which gs`
  return unless gs_location.empty?

  raise GhostscriptNotInstalledError, "Please install Ghostscript. See docs for more info."
end

Instance Method Details

#extract_statement_date ⇒ Object



52
53
54
55
# File 'lib/td_statement_extractor/statement.rb', line 52

# Finds the first STATEMENT_DATE match in the imported text and stores the
# matched string in @statement_date.
#
# @raise [InvalidStatementDateError] when the text contains no such date
def extract_statement_date
  found = text.match(STATEMENT_DATE)
  @statement_date = found && found[:statement_date]
  return unless statement_date.nil? || statement_date.empty?

  raise InvalidStatementDateError, "Unable to extract statement date"
end

#import_pdf ⇒ Object



47
48
49
50
# File 'lib/td_statement_extractor/statement.rb', line 47

# Opens the pre-processed PDF at temp_file_path and stores the reader in @pdf
# and the text of all pages in @text.
#
# @return [String] the combined page text
def import_pdf
  @pdf = PDF::Reader.new(temp_file_path)
  # Pages are separated by a newline because a page's text is not guaranteed
  # to end with one. Joining with no separator can fuse the last line of one
  # page with the first line of the next, which corrupts line-by-line parsing.
  @text = pdf.pages.map(&:text).join("\n")
end

#output_csv(output_path) ⇒ Object



97
98
99
100
101
102
103
# File 'lib/td_statement_extractor/statement.rb', line 97

# Appends every transaction to the CSV file at output_path as a
# date (dd/mm/yyyy), description, amount row. The file is opened in append
# mode, so existing rows are kept.
#
# @param output_path [String] path of the CSV file to append to
def output_csv(output_path)
  CSV.open(output_path, "a") do |csv|
    transactions.each do |entry|
      formatted_date = entry[:date].strftime("%d/%m/%Y")
      csv << [formatted_date, *entry.values_at(:description, :amount)]
    end
  end
end

#pre_process_pdf ⇒ Object



38
39
40
41
42
43
44
45
# File 'lib/td_statement_extractor/statement.rb', line 38

# Writes a cleaned copy of the input PDF next to the original and records its
# path in @temp_file_path.
#
# @return [String] Ghostscript's combined stdout/stderr output
# @raise [GhostscriptNotInstalledError] when the `gs` executable cannot be
#   started
def pre_process_pdf
  @temp_file_path = File.join(File.dirname(@input_file_path), "td_statement_temp_#{Time.now.to_i}.pdf")

  # Use Ghostscript to decrypt and decompress the PDF. Also remove
  # all images and crop the margins to remove watermarking that interferes
  # with the scraping process
  command = [
    "gs", "-o", @temp_file_path,
    "-sDEVICE=pdfwrite", "-dFILTERVECTOR", "-dFILTERIMAGE", "-g5400x7200",
    "-c", "<</PageOffset [-36 -36]>> setpagedevice",
    "-f", @input_file_path
  ]

  # The argv array is handed to the process directly, with no shell involved,
  # so paths containing spaces or shell metacharacters are passed through
  # intact. stderr is merged into stdout, as the original `2>&1` did.
  IO.popen(command, err: [:child, :out], &:read)
rescue Errno::ENOENT
  raise GhostscriptNotInstalledError, "Please install Ghostscript. See docs for more info."
end

#total_activity ⇒ Object



93
94
95
# File 'lib/td_statement_extractor/statement.rb', line 93

# Adds up the :amount of every transaction, rounds the sum to two decimal
# places and returns it with the sign flipped.
#
# @return [Numeric] the negated, rounded total
def total_activity
  net = transactions.map { |entry| entry[:amount] }.inject(0, :+)
  -(net.round(2))
end

#transaction_from_line(line) ⇒ Object

Raises:

  • (MissingDateError)
  • (MissingAmountError)
  • (MissingDescriptionError)



61
62
63
64
65
66
67
68
69
70
71
# File 'lib/td_statement_extractor/statement.rb', line 61

# Parses a single statement line into a transaction hash.
#
# The amount is stored with its sign inverted relative to the printed text
# ("$4.50" becomes -4.5, "-$10.00" becomes 10.0). A missing amount and an
# amount of zero are both reported as MissingAmountError.
#
# @param line [String] one line of statement text
# @return [Hash] with keys :date (String), :description (String) and
#   :amount (Float)
# @raise [MissingDateError] when no DATE is found
# @raise [MissingAmountError] when no non-zero AMOUNT is found
# @raise [MissingDescriptionError] when DESCRIPTION does not match or the
#   description is blank
def transaction_from_line(line)
  date_match = line.match(DATE)
  date = date_match && date_match[:date]
  raise MissingDateError, "Error extracting DATE from line: #{line}" if date.nil? || date.empty?

  amount_match = line.match(AMOUNT)
  amount = amount_match ? -(amount_match[:amount].delete("$,").to_f) : 0
  raise MissingAmountError, "Error extracting AMOUNT from line: #{line}" if amount.zero?

  description_match = line.match(DESCRIPTION)
  description = description_match && description_match[:description].strip
  raise MissingDescriptionError, "Error extracting DESCRIPTION from line: #{line}" if description.nil? || description.empty?

  {date: date, description: description, amount: amount}
end

#transaction_line?(line) ⇒ Boolean

Returns:

  • (Boolean)


57
58
59
# File 'lib/td_statement_extractor/statement.rb', line 57

# Tells whether a line looks like a transaction, i.e. it contains both a
# DATE and an AMOUNT.
#
# @param line [String] one line of statement text
# @return [Boolean]
def transaction_line?(line)
  [DATE, AMOUNT].all? { |pattern| line.match?(pattern) }
end

#transform_date(date, statement_date) ⇒ Object

Raises:

  • (InvalidMonthError)
  • (InvalidDayError)
  • (InvalidStatementDateError)



73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
# File 'lib/td_statement_extractor/statement.rb', line 73

# Converts a short transaction date such as "JAN 5" into a Date, taking the
# year from the statement date.
#
# A "DEC" transaction on a statement dated in January is assigned to the
# previous year; every other transaction gets the statement's year.
#
# @param date [String] short date containing a MONTH and a DAY
# @param statement_date [String] long-form date matching STATEMENT_DATE
# @return [Date]
# @raise [InvalidMonthError] when no month abbreviation is found in date
# @raise [InvalidDayError] when the day is missing, zero or greater than 31
# @raise [InvalidStatementDateError] when the statement month or year cannot
#   be read, or the year is outside 1980..3000
def transform_date(date, statement_date)
  month = date.match(MONTH)&.[](:month)
  day = date.match(DAY)&.[](:day)&.to_i
  statement_match = statement_date.match(STATEMENT_DATE)
  statement_month = statement_match&.[](:month)
  statement_year = statement_match&.[](:year)&.to_i

  raise InvalidMonthError, "Error extracting MONTH from date: #{date}" if month.nil? || month.empty?
  # day is nil when the date has no digits; check that first so the caller
  # gets InvalidDayError instead of a NoMethodError from nil.zero?.
  raise InvalidDayError, "Error extracting DAY from date: #{date}" if day.nil? || day.zero? || day > 31
  raise InvalidStatementDateError, "Error extracting MONTH from statement date: #{statement_date}" if statement_month.nil? || statement_month.empty?
  raise InvalidStatementDateError, "Error extracting YEAR from statement date: #{statement_date}" if statement_year.nil? || statement_year.zero? || statement_year < 1980 || statement_year > 3000

  year = if statement_month == "January" && month == "DEC"
    statement_year - 1
  else
    statement_year
  end

  Date.parse("#{month} #{day} #{year}")
end