Method: Fech::MapGenerator.convert_header_file_to_row_files

Defined in:
lib/fech/map_generator.rb

.convert_header_file_to_row_files(source_dir) ⇒ Object

Goes through all version header summary files and generates row map files for each type of row inside them.



67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
# File 'lib/fech/map_generator.rb', line 67

# Goes through all version header summary files and generates row map files
# for each type of row inside them.
#
# Side effects visible in this method:
# * every version summary file is rewritten in place with unparseable
#   characters removed;
# * every row map file that already exists on disk is overwritten with a
#   freshly generated template (row types without an existing file are
#   skipped).
#
# @param source_dir [String] directory passed through to the path helpers
#   (ignored_fields_file, version_summary_file, write_row_map_file)
# @return [Hash] row type => { version label => column list }, where a
#   version label is either a single version or several versions joined
#   with "|" because they share an identical map
def self.convert_header_file_to_row_files(source_dir)
  data = {}

  # File.readlines opens, reads and closes the file in one call.
  ignored_fields = File.readlines(ignored_fields_file(source_dir)).map { |l| l.strip }

  # Create a hash of data with an entry for each row type found in the source
  # version summary files. Each row has an entry for each version map that
  # exists for it. If maps for two different versions are identical, they
  # are combined.
  FILING_VERSIONS.each do |version|
    filepath = version_summary_file(source_dir, version)

    # Clean the source files by removing unparseable characters.
    # A space is appended and sliced off again around the conversion;
    # NOTE(review): presumably a workaround for converters that mishandle a
    # trailing invalid byte - confirm before removing.
    raw = File.read(filepath) << ' '
    if RUBY_VERSION < "1.9.3"
      # Lazy-loaded on purpose: iconv is only needed on old interpreters.
      require 'iconv'
      ic = Iconv.new('UTF-8//IGNORE', 'UTF-8')
      valid_string = ic.iconv(raw)[0..-2]
    else
      # Round-trip through UTF-16 so invalid UTF-8 sequences are dropped.
      valid_string = raw[0..-2].encode!('UTF-16', 'UTF-8', :invalid => :replace, :replace => '')
      valid_string = valid_string.encode!('UTF-8', 'UTF-16')
    end
    # Block form guarantees the cleaned content is flushed and the handle is
    # closed before the file is parsed again below.
    File.open(filepath, 'w') { |f| f.write(valid_string) }

    Fech::Csv.foreach(filepath) do |row|
      # Each row of a version summary file contains the ordered list of
      # column names.
      row_type = row.first
      maps = (data[row_type] ||= {})
      row_version_data = remove_ignored_fields(row, ignored_fields)

      # Check the maps for this row type in already-processed versions.
      # If this map is identical to a previous map, tack this version on to
      # it instead of creating a new one. The matching keys are collected
      # first so the hash is never modified while it is being iterated.
      matching = maps.keys.select { |k| k != version && maps[k] == row_version_data }
      if matching.empty?
        maps[version] = row_version_data
      else
        # Drop any entry recorded for this version alone, then replace each
        # matching entry with its hybrid. The superseded key is removed for
        # good, so an older hybrid label cannot reappear next to the longer
        # one that replaced it.
        maps.delete(version)
        matching.each do |k|
          maps.delete(k)
          maps["#{k}|#{version}"] = row_version_data
        end
      end
    end
  end

  # Go through each row type and create a base map management file that
  # will serve as a template for organizing which fields are the same
  # between versions. This file will need to then be arranged by hand to
  # clean up the data. Each row will represent a column across versions,
  # each column a unique map for that row for one or more versions.
  data.each do |row_type, row_data|
    file_path = write_row_map_file(source_dir, row_type)
    # Only files that already exist are regenerated. File.exist? is used
    # because File.exists? is no longer available on current Ruby versions.
    next unless File.exist?(file_path)

    File.open(file_path, 'w') do |f|
      # Written without a newline: the first transposed line below starts
      # with an empty canonical cell, which completes this header cell.
      f.write('canonical')

      # Two lines per version label: "^label" followed by 1-based column
      # positions, then an empty cell followed by the column names.
      to_transpose = []
      row_data.sort.reverse.each do |version, version_data|
        to_transpose << ["^#{version}", (1..version_data.size).to_a].flatten
        to_transpose << [nil, version_data].flatten
      end

      # standardize row size (pad shorter lines with nil so transpose works)
      max_size = to_transpose.map { |r| r.size }.max || 0
      to_transpose.each { |r| r[max_size - 1] ||= nil }

      to_transpose.transpose.each do |columns|
        cells = columns.map { |x| x.to_s.gsub(/\r/, ' ') }
        canonical = cells[1] # first description
        if canonical
          # Build a snake_case name: drop "{...}" annotations, turn
          # punctuation into underscores, squeeze and trim underscores.
          # Note " -\." inside the character class is a range (space through
          # "."), so it also covers characters such as "," and "-".
          canonical = canonical.gsub(/\{.*\}/, "").gsub(/[ -\.\/\(\)]/, "_").gsub(/_+/, "_").gsub(/(_$)|(^_)/, "").downcase
          cells = [canonical] + cells
        end
        # NOTE(review): cells are joined without CSV quoting, as before.
        f.write(cells.join(','))
        f.write("\n")
      end
    end
  end

end