Class: TableExtractor
Overview
Extracts Markdown-style tables from text lines and returns metadata about each table
This class analyzes an array of text lines to identify tables formatted in Markdown style. It supports both multi-line tables (delimited by | by default) and single-line tables (delimited by ! by default). For each table found, it returns a hash of metadata: the row count, the column count, the delimiter that was used, and the index of the line on which the table starts.
Class Method Summary collapse
-
.extract_tables(lines, multi_line_delimiter: '|', regexp:, single_line_delimiter: '!') ⇒ Array<Hash>
Extract tables from an array of text lines formatted in Markdown style.
Class Method Details
.extract_tables(lines, multi_line_delimiter: '|', regexp:, single_line_delimiter: '!') ⇒ Array<Hash>
Extract tables from an array of text lines formatted in Markdown style
30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 |
# File 'lib/table_extractor.rb', line 30
#
# Extract tables from an array of text lines formatted in Markdown style.
#
# Two table shapes are recognised:
# * single-line tables: a line whose first non-blank text is
#   +single_line_delimiter+ (e.g. <tt>!Name!Age!City!</tt>);
# * multi-line tables: a separator row (a line whose stripped text matches
#   +regexp+, e.g. <tt>|---|---|</tt>), the line before it (taken as the
#   header), and every following line that contains +multi_line_delimiter+.
#
# @param lines [Array<String>] the text lines to scan
# @param multi_line_delimiter [String] cell delimiter of multi-line tables
# @param regexp [Regexp] pattern identifying a separator row; it is matched
#   against the stripped line
# @param single_line_delimiter [String] delimiter of single-line tables; it
#   is matched literally, so regex metacharacters are safe to use
# @return [Array<Hash>] one hash per table, in order of appearance, with the
#   keys +:column_offset+ (always 1), +:columns+, +:delimiter+, +:rows+ and
#   +:start_index+ (index into +lines+ of the table's first line)
def self.extract_tables(
  lines,
  multi_line_delimiter: '|',
  regexp:,
  single_line_delimiter: '!'
)
  column_count = 0
  row_count = 0
  tables = []
  in_multi_line_table = false
  start_index = nil

  # The delimiter is user-supplied text, not a pattern: escape it so that
  # values such as '.', '|' or '+' are not interpreted as regex operators.
  escaped_delimiter = Regexp.escape(single_line_delimiter)
  single_line_start_pattern = /^\s*#{escaped_delimiter}/
  single_line_content_pattern =
    /(?:^|(?<=#{escaped_delimiter}))\s*([^#{escaped_delimiter}]*)\s*(?=#{escaped_delimiter}|$)/

  # Record the table described by the current state, then reset the state.
  flush_table = lambda do |delimiter|
    tables << {
      column_offset: 1,
      columns: column_count,
      delimiter: delimiter,
      rows: row_count,
      start_index: start_index
    }
    column_count = 0
    row_count = 0
    in_multi_line_table = false
    start_index = nil
  end

  # Open a multi-line table at the given separator row.
  begin_multi_line_table = lambda do |separator_line, separator_index|
    # A separator on the very first line has no header above it; without this
    # guard the start index would be -1, which addresses the LAST line.
    has_header = separator_index.positive?
    column_count = separator_line.split(multi_line_delimiter).count - 1
    row_count = has_header ? 2 : 1 # separator row, plus the header if any
    in_multi_line_table = true
    start_index = has_header ? separator_index - 1 : separator_index
  end

  lines.each_with_index do |line, line_index|
    separator_row = line.strip.match?(regexp)

    if in_multi_line_table
      if separator_row
        # A new separator closes the running table and opens another one.
        # NOTE(review): the line just before this separator (the new table's
        # header) has already been counted as a data row of the table being
        # closed, so the two tables overlap by one line -- confirm intent.
        flush_table.call(multi_line_delimiter)
        begin_multi_line_table.call(line, line_index)
        next
      elsif line.include?(multi_line_delimiter)
        row_count += 1
        next
      end

      # Any other line ends the table. Fall through instead of skipping the
      # line, so that a single-line table directly below is still detected.
      flush_table.call(multi_line_delimiter)
    end

    if line.match?(single_line_start_pattern)
      # Single-line table (e.g. !Name!Age!City!): recorded immediately.
      column_count = line.scan(single_line_content_pattern).flatten.count - 1
      row_count = 1
      start_index = line_index
      flush_table.call(single_line_delimiter)
    elsif separator_row
      begin_multi_line_table.call(line, line_index)
    end
  end

  # Handle a table that runs to the last line.
  flush_table.call(multi_line_delimiter) if in_multi_line_table

  tables
end