Class: TdStatementExtractor::Statement
- Inherits:
-
Object
- Object
- TdStatementExtractor::Statement
- Defined in:
- lib/td_statement_extractor/statement.rb
Defined Under Namespace
Classes: GhostscriptNotInstalledError, InvalidDayError, InvalidMonthError, InvalidStatementDateError, MissingAmountError, MissingDateError, MissingDescriptionError
Constant Summary collapse
- STATEMENT_DATE =
/(?<statement_date>(?<month>[A-Z][a-z]+)\s(?<day>[0-9]+),\s(?<year>[0-9]{4}))/
- MONTH =
/(?<month>JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)/
- DAY =
/(?<day>[0-9]+)/
- DATE =
/(?<date>#{MONTH}\s#{DAY})/
- AMOUNT =
/(?<amount>-?\$[,\d]+\.\d+)/
- DESCRIPTION =
/#{DATE}\s+#{DATE}?\s+(?<description>.+)\s+#{AMOUNT}/
Instance Attribute Summary collapse
-
#debug_mode ⇒ Object
Returns the value of attribute debug_mode.
-
#input_file_path ⇒ Object
Returns the value of attribute input_file_path.
-
#pdf ⇒ Object
Returns the value of attribute pdf.
-
#statement_date ⇒ Object
Returns the value of attribute statement_date.
-
#temp_file_path ⇒ Object
Returns the value of attribute temp_file_path.
-
#text ⇒ Object
Returns the value of attribute text.
-
#transactions ⇒ Object
Returns the value of attribute transactions.
Class Method Summary collapse
Instance Method Summary collapse
- #extract_statement_date ⇒ Object
- #import_pdf ⇒ Object
-
#initialize(input_file_path, debug_mode = false) ⇒ Statement
constructor
A new instance of Statement.
- #output_csv(output_path) ⇒ Object
- #pre_process_pdf ⇒ Object
- #total_activity ⇒ Object
- #transaction_from_line(line) ⇒ Object
- #transaction_line?(line) ⇒ Boolean
- #transform_date(date, statement_date) ⇒ Object
Constructor Details
#initialize(input_file_path, debug_mode = false) ⇒ Statement
Returns a new instance of Statement.
16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 |
# File 'lib/td_statement_extractor/statement.rb', line 16
#
# Builds a Statement from a PDF file: verifies Ghostscript is available,
# writes a pre-processed copy of the PDF, reads its text, extracts the
# statement date and parses every transaction line into a hash.
#
# @param input_file_path [String] path of the statement PDF to read
# @param debug_mode [Boolean] when truthy, prints every line containing a
#   word character, prefixed with "T" (treated as a transaction) or "F"
# @return [Statement]
# @raise whatever the helper methods called below raise (Ghostscript check,
#   statement-date extraction, per-line parsing)
def initialize(input_file_path, debug_mode = false)
  @input_file_path = input_file_path
  @debug_mode = debug_mode

  self.class.check_for_ghostscript
  pre_process_pdf
  import_pdf
  extract_statement_date

  @transactions = text.each_line.map do |line|
    # Classify once; the result drives both the debug trace and the parse.
    is_transaction = transaction_line?(line)
    puts "#{is_transaction ? "T" : "F"} - #{line.lstrip.strip}" if @debug_mode && line.match?(/\w+/)
    next unless is_transaction

    data = transaction_from_line(line)
    data[:date] = transform_date(data[:date], statement_date)
    data
  end.compact
ensure
  # The temp path is assigned before Ghostscript runs, so the file may not
  # exist if pre-processing failed. Deleting unconditionally would raise
  # Errno::ENOENT from here and hide the error that actually caused the failure.
  File.delete(temp_file_path) if temp_file_path && File.exist?(temp_file_path)
end
Instance Attribute Details
#debug_mode ⇒ Object
Returns the value of attribute debug_mode.
14 15 16 |
# Reader for @debug_mode; returns the instance variable unchanged. The constructor assigns it from its debug_mode argument.
# File 'lib/td_statement_extractor/statement.rb', line 14 def debug_mode @debug_mode end |
#input_file_path ⇒ Object
Returns the value of attribute input_file_path.
14 15 16 |
# Reader for @input_file_path; returns the instance variable unchanged. The constructor assigns it from its input_file_path argument.
# File 'lib/td_statement_extractor/statement.rb', line 14 def input_file_path @input_file_path end |
#pdf ⇒ Object
Returns the value of attribute pdf.
14 15 16 |
# Reader for @pdf; returns the instance variable unchanged. #import_pdf assigns it a PDF::Reader built from the temp file path.
# File 'lib/td_statement_extractor/statement.rb', line 14 def pdf @pdf end |
#statement_date ⇒ Object
Returns the value of attribute statement_date.
14 15 16 |
# Reader for @statement_date; returns the instance variable unchanged. #extract_statement_date assigns it the text matched by STATEMENT_DATE (a String, not a Date).
# File 'lib/td_statement_extractor/statement.rb', line 14 def statement_date @statement_date end |
#temp_file_path ⇒ Object
Returns the value of attribute temp_file_path.
14 15 16 |
# Reader for @temp_file_path; returns the instance variable unchanged. #pre_process_pdf assigns it; it is nil until that method has run.
# File 'lib/td_statement_extractor/statement.rb', line 14 def temp_file_path @temp_file_path end |
#text ⇒ Object
Returns the value of attribute text.
14 15 16 |
# Reader for @text; returns the instance variable unchanged. #import_pdf assigns it the joined text of all PDF pages.
# File 'lib/td_statement_extractor/statement.rb', line 14 def text @text end |
#transactions ⇒ Object
Returns the value of attribute transactions.
14 15 16 |
# Reader for @transactions; returns the instance variable unchanged. The constructor assigns it an Array of hashes with :date, :description and :amount keys.
# File 'lib/td_statement_extractor/statement.rb', line 14 def transactions @transactions end |
Class Method Details
.check_for_ghostscript ⇒ Object
105 106 107 |
# File 'lib/td_statement_extractor/statement.rb', line 105
#
# Guards against a missing Ghostscript binary by asking `which` for `gs`.
#
# @return [nil] when `which gs` printed something
# @raise [GhostscriptNotInstalledError] when `which gs` printed nothing
def self.check_for_ghostscript
  gs_location = `which gs`
  return unless gs_location.empty?

  raise GhostscriptNotInstalledError, "Please install Ghostscript. See docs for more info."
end
Instance Method Details
#extract_statement_date ⇒ Object
52 53 54 55 |
# File 'lib/td_statement_extractor/statement.rb', line 52
#
# Stores the first STATEMENT_DATE match found in the imported text as
# @statement_date (the matched String, e.g. month name, day and year).
#
# @return [nil]
# @raise [InvalidStatementDateError] when no statement date is found
def extract_statement_date
  found = text.match(STATEMENT_DATE)
  @statement_date = found && found[:statement_date]
  return unless statement_date.nil? || statement_date.empty?

  raise InvalidStatementDateError, "Unable to extract statement date"
end
#import_pdf ⇒ Object
47 48 49 50 |
# File 'lib/td_statement_extractor/statement.rb', line 47
#
# Opens the pre-processed PDF at temp_file_path and keeps both the reader
# (@pdf) and the concatenated text of every page (@text).
#
# @return [String] the joined page text
def import_pdf
  @pdf = PDF::Reader.new(temp_file_path)
  page_texts = pdf.pages.map(&:text)
  @text = page_texts.join
end
#output_csv(output_path) ⇒ Object
97 98 99 100 101 102 103 |
# File 'lib/td_statement_extractor/statement.rb', line 97
#
# Appends one CSV row per transaction to output_path. The file is opened in
# append mode ("a"), so existing rows are kept. Each row is the date
# formatted as dd/mm/yyyy, the description, then the amount.
#
# @param output_path [String] path of the CSV file to append to
def output_csv(output_path)
  CSV.open(output_path, "a") do |writer|
    transactions.each do |entry|
      formatted_date = entry[:date].strftime("%d/%m/%Y")
      writer << [formatted_date, entry[:description], entry[:amount]]
    end
  end
end
#pre_process_pdf ⇒ Object
38 39 40 41 42 43 44 45 |
# File 'lib/td_statement_extractor/statement.rb', line 38
#
# Writes a pre-processed copy of the input PDF next to the original and
# records its location in @temp_file_path.
#
# Ghostscript is used to decrypt and decompress the PDF. It also removes all
# images and crops the margins to remove watermarking that interferes with
# the scraping process.
#
# @return [String] Ghostscript's combined stdout/stderr output
# @raise [GhostscriptNotInstalledError] when the `gs` executable cannot be
#   started
def pre_process_pdf
  @temp_file_path = File.join(File.dirname(@input_file_path), "td_statement_temp_#{Time.now.to_i}.pdf")

  # Each argument is passed to gs directly, with no shell in between, so
  # spaces or shell metacharacters in either path cannot split the command
  # or inject another one.
  command = [
    "gs", "-o", @temp_file_path,
    "-sDEVICE=pdfwrite", "-dFILTERVECTOR", "-dFILTERIMAGE", "-g5400x7200",
    "-c", "<</PageOffset [-36 -36]>> setpagedevice",
    "-f", @input_file_path
  ]

  # err: [:child, :out] merges stderr into the captured output (the old 2>&1).
  IO.popen(command, err: %i[child out], &:read)
rescue Errno::ENOENT
  # Without a shell, a missing executable surfaces as ENOENT; report it the
  # same way check_for_ghostscript does.
  raise GhostscriptNotInstalledError, "Please install Ghostscript. See docs for more info."
end
#total_activity ⇒ Object
93 94 95 |
# File 'lib/td_statement_extractor/statement.rb', line 93
#
# Adds up the :amount of every transaction, rounds the total to two decimal
# places and returns it with the sign flipped.
#
# @return [Numeric] negated, rounded total (0 when there are no transactions)
def total_activity
  running_total = 0
  transactions.each { |entry| running_total += entry[:amount] }
  -(running_total.round(2))
end
#transaction_from_line(line) ⇒ Object
61 62 63 64 65 66 67 68 69 70 71 |
# File 'lib/td_statement_extractor/statement.rb', line 61
#
# Parses one statement line into a transaction hash.
#
# The date is the first DATE match on the line, kept as the raw String. The
# amount is the first AMOUNT match with "$" and "," removed, converted to a
# Float and sign-flipped. The description is the text captured by
# DESCRIPTION, stripped.
#
# @param line [String] a line of statement text
# @return [Hash] {date: String, description: String, amount: Float}
# @raise [MissingDateError] when no date is found
# @raise [MissingAmountError] when no amount is found
# @raise [MissingDescriptionError] when no description is found
def transaction_from_line(line)
  date = line.match(DATE)&.[](:date)
  raw_amount = line.match(AMOUNT)&.[](:amount)
  description = line.match(DESCRIPTION)&.[](:description)&.strip

  raise MissingDateError, "Error extracting DATE from line: #{line}" if date.nil? || date.empty?
  # Only an absent amount is an extraction failure; a real "$0.00" entry is
  # valid data and must not be reported as missing.
  raise MissingAmountError, "Error extracting AMOUNT from line: #{line}" if raw_amount.nil?
  raise MissingDescriptionError, "Error extracting DESCRIPTION from line: #{line}" if description.nil? || description.empty?

  value = raw_amount.delete("$,").to_f
  # Flip the sign, but keep zero as plain 0.0 rather than -0.0.
  amount = value.zero? ? 0.0 : -value

  {date: date, description: description, amount: amount}
end
#transaction_line?(line) ⇒ Boolean
57 58 59 |
# File 'lib/td_statement_extractor/statement.rb', line 57
#
# Tells whether a line looks like a transaction: it must contain both a
# DATE match and an AMOUNT match.
#
# @param line [String] a line of statement text
# @return [Boolean]
def transaction_line?(line)
  [DATE, AMOUNT].all? { |pattern| line.match?(pattern) }
end
#transform_date(date, statement_date) ⇒ Object
73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 |
# File 'lib/td_statement_extractor/statement.rb', line 73
#
# Turns a transaction date such as "DEC 28" into a Date, taking the year
# from the statement date (e.g. "January 15, 2024").
#
# A transaction cannot be dated after its statement, so when the transaction
# month is later in the calendar than the statement month, the transaction
# is placed in the previous year. This covers DEC on a January statement and
# also, for example, NOV on a January statement. If the statement month is
# not a recognised English month name, no roll-back is applied.
#
# @param date [String] text containing a MONTH abbreviation and a DAY
# @param statement_date [String] text matching STATEMENT_DATE
# @return [Date]
# @raise [InvalidMonthError] when no month is found in date
# @raise [InvalidDayError] when the day is missing, zero or above 31
# @raise [InvalidStatementDateError] when the statement month or year is
#   missing, or the year is outside 1980..3000
def transform_date(date, statement_date)
  month = date.match(MONTH)&.[](:month)
  day = date.match(DAY)&.[](:day)&.to_i
  statement_match = statement_date.match(STATEMENT_DATE)
  statement_month = statement_match&.[](:month)
  statement_year = statement_match&.[](:year)&.to_i

  raise InvalidMonthError, "Error extracting MONTH from date: #{date}" if month.nil? || month.empty?
  # day is nil when the text has no digits; check that first so the caller
  # gets InvalidDayError rather than a NoMethodError from nil.zero?.
  raise InvalidDayError, "Error extracting DAY from date: #{date}" if day.nil? || day.zero? || day > 31
  raise InvalidStatementDateError, "Error extracting MONTH from statement date: #{statement_date}" if statement_month.nil? || statement_month.empty?
  raise InvalidStatementDateError, "Error extracting YEAR from statement date: #{statement_date}" if statement_year.nil? || statement_year.zero? || statement_year < 1980 || statement_year > 3000

  # "DEC" -> "Dec" -> 12. Index 0 of ABBR_MONTHNAMES is nil, so indexes are month numbers.
  month_number = Date::ABBR_MONTHNAMES.index(month.capitalize)
  statement_month_number = Date::MONTHNAMES.index(statement_month) || Date::ABBR_MONTHNAMES.index(statement_month)

  previous_year = !statement_month_number.nil? && month_number > statement_month_number
  year = previous_year ? statement_year - 1 : statement_year

  Date.new(year, month_number, day)
end