Class: Factbook::Builder

Inherits:
Object
  • Object
show all
Includes:
LogUtils::Logging
Defined in:
lib/factbook-readers/builder.rb

Overview

todo: change to HtmlBuilder or PageBuilder ???

Constant Summary collapse

# Matches one <h2>...</h2> heading; capture group 1 is the heading text
# without the surrounding whitespace.
# Flags: x = free-spacing (whitespace and the inline ## notes are ignored),
#        i = case-insensitive, m = the dot also matches newlines.
# NOTE(review): the inline note says tags are not allowed inside, but (.+?)
#   accepts any characters (tags included) - confirm which one is intended.
H2_RE =
/<h2>
  \s*
 (.+?)  ## note: use non-greedy; do NOT allow tags inside for now
  \s*
 <\/h2>
/xim
# Matches one <h3>...</h3> heading; capture group 1 is the heading text
# without the surrounding whitespace (any tags inside are captured as-is).
# Flags: x = free-spacing (whitespace and the inline ## notes are ignored),
#        i = case-insensitive, m = the dot also matches newlines.
H3_RE =
/<h3>
   \s*
  (.+?)                ## note: use non-greedy; allows tags inside - why? why not
   \s*
 <\/h3>
/xim

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(html_original) ⇒ Builder

Returns a new instance of Builder.



17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
# File 'lib/factbook-readers/builder.rb', line 17

# Builds the section / subsection structure for one page.
#
# Steps (all visible below):
#   1. run html_original through Sanitizer#sanitize and keep the three
#      values it returns as @html, @info and @errors
#   2. rewrite the headings into @SECTION{} / @SUBSECTION{} markers
#      (kept in @html_debug) and split the text on those markers
#   3. fill @sects with one Sect per section marker, each holding one
#      Subsect per subsection marker; the subsection data is whatever
#      Factbook::ItemBuilder#read returns for the subsection body
#
# Chunks whose heading does not match the expected marker are skipped.
# Progress is printed with pp / puts.
def initialize( html_original )
  @html_original = html_original
  @html, @info, @errors = Sanitizer.new.sanitize( @html_original )

  if @html.empty?
    ## note: "empty" pages (old format waiting for update) cannot be parsed
    ##   for now - continue with no sections
    @html_debug = ''
    html_sects  = []
  else
    @html_debug = map_sects( @html )
    @html_debug = map_subsects( @html_debug )
    html_sects  = split_sects( @html_debug )
  end

  pp html_sects

  ## debug
  ##   File.open( 'tmp/br.debug.html', 'w:utf-8') { |f| f.write( @html_debug ) }

  @sects = []
  html_sects.each do |sect_head, sect_chunks|
    puts sect_head
    puts sect_chunks.size

    ## get section title
    ##  @SECTION{Economy}  => Economy
    next unless sect_head =~ /@SECTION{(.+?)}/   ## warn/fix: no section title found

    sect_title = Regexp.last_match( 1 ).strip
    puts sect_title

    sect = Sect.new
    sect.title = sect_title

    ## get subsections
    sect.subsects = sect_chunks.each_with_object( [] ) do |(subsect_head, subsect_body), subsects|
      next unless subsect_head =~ /@SUBSECTION{(.+?)}/   ## warn/fix: no subsection title found

      ## strip, remove trailing : if present, strip again
      subsect_title = Regexp.last_match( 1 ).strip.sub( /:\z/, '' ).strip
      puts subsect_title

      subsect = Subsect.new
      subsect.title = subsect_title
      subsect.data  = Factbook::ItemBuilder.new( subsect_body, subsect_title ).read

      subsects << subsect
    end

    @sects << sect
  end
end

Instance Attribute Details

#errors ⇒ Object (readonly)

Third value returned by Sanitizer#sanitize for the original page (stored as @errors).



9
10
11
# File 'lib/factbook-readers/builder.rb', line 9

# Reader for @errors - the third value returned by Sanitizer#sanitize
# when the builder was constructed.
def errors; @errors; end

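#html ⇒ Object (readonly)

First value returned by Sanitizer#sanitize for the original page (stored as @html); this is the text the sections are parsed from.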



9
10
11
# File 'lib/factbook-readers/builder.rb', line 9

# Reader for @html - the first value returned by Sanitizer#sanitize
# when the builder was constructed.
def html; @html; end

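#html_debug ⇒ Object (readonly)

The sanitized html with headings rewritten to @SECTION{} / @SUBSECTION{} markers; an empty string if the sanitized html is empty.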



9
10
11
# File 'lib/factbook-readers/builder.rb', line 9

# Reader for @html_debug - the html after the headings were rewritten into
# @SECTION{} / @SUBSECTION{} markers ('' when the sanitized html was empty).
def html_debug; @html_debug; end

#html_original ⇒ Object (readonly)

full “original” 1:1 page



9
10
11
# File 'lib/factbook-readers/builder.rb', line 9

# Reader for @html_original - the page exactly as passed to the constructor.
def html_original; @html_original; end

#info ⇒ Object (readonly)

Second value returned by Sanitizer#sanitize for the original page (stored as @info).



9
10
11
# File 'lib/factbook-readers/builder.rb', line 9

# Reader for @info - the second value returned by Sanitizer#sanitize
# when the builder was constructed.
def info; @info; end

#sects ⇒ Object (readonly)

Array of the sections (Sect objects, each with its subsections) built from the page by the constructor.



9
10
11
# File 'lib/factbook-readers/builder.rb', line 9

# Reader for @sects - the array of Sect objects collected by the constructor.
def sects; @sects; end

Instance Method Details

#map_sects(html) ⇒ Object



94
95
96
97
98
99
100
101
102
103
104
105
106
# File 'lib/factbook-readers/builder.rb', line 94

# Converts section titles (headings matched by H2_RE) to the "unified"
# section marker, e.g.
#
#   <h2>Introduction</h2>   =>   @SECTION{Introduction}
#
# Every marker is emitted with two newlines before and after it, and each
# heading found is reported with puts.
# Returns the rewritten copy; the html argument itself is not modified.
def map_sects( html )
  html.gsub( H2_RE ) do |heading|
    title = Regexp.last_match( 1 )

    puts "** found section >#{title}<:"
    puts "   >|#{heading}|<"

    "\n\n@SECTION{#{title}}\n\n"
  end
end

#map_subsects(html) ⇒ Object



116
117
118
119
120
121
122
123
124
125
126
127
128
# File 'lib/factbook-readers/builder.rb', line 116

# Converts subsection titles (headings matched by H3_RE) to the "unified"
# subsection marker, e.g.
#
#   <h3>Disputes - international:</h3>   =>   @SUBSECTION{Disputes - international:}
#
# Every marker is emitted with one newline before and after it, and each
# heading found is reported with puts.
# Returns the rewritten copy; the html argument itself is not modified.
def map_subsects( html )
  html.gsub( H3_RE ) do |heading|
    title = Regexp.last_match( 1 )

    puts "** found subsection >#{title}<:"
    puts "   >|#{heading}|<"

    "\n@SUBSECTION{#{title}}\n"
  end
end

#split_sects(html) ⇒ Object



132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
# File 'lib/factbook-readers/builder.rb', line 132

####
#  Splits html into sections (divided by @SECTION{} headings).
#
#  Any text before the first heading (the optional prolog) is dropped.
#  The body of every section is split further with split_subsects.
#
#  Returns an array of pairs:
#    [[heading, subsects],
#     [heading, subsects],
#     [heading, subsects],...]
#  where heading is the complete marker, e.g. "@SECTION{Economy}".
#
#  A heading with nothing after it (at the very end of html) gets the
#  subsections of an empty body instead of raising on a missing (nil) body.
def split_sects( html )
  ## note: the regex is wrapped in (exactly one) capture group;
  ##   String#split includes all capture groups in the result array,
  ##   that is, the headings are kept next to the text between them
  ## note: use non-greedy -- check: need to escape {} ??
  chunks = html.split( /(@SECTION{.+?})/ )

  ## the first item is the text before the first heading (prolog);
  ##   remove it unless it mentions @SECTION itself
  chunks.shift  unless chunks.first =~ /@SECTION/

  ## todo: after cleanup prolog; remove @SECTION{} ?? - just keep title - why, why not??
  chunks.each_slice( 2 ).map do |heading, body|
    ## note: String#split drops trailing empty strings, so the last heading
    ##   may come without a body (nil) - use an empty body then
    [heading, split_subsects( body.to_s )]
  end
end

#split_subsects(html) ⇒ Object



162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
# File 'lib/factbook-readers/builder.rb', line 162

####
#  Splits html into subsections (divided by @SUBSECTION{} headings).
#
#  Any text before the first heading (the optional prolog) is dropped.
#
#  Returns an array of pairs:
#    [[heading, body],
#     [heading, body],
#     [heading, body],...]
#  where heading is the complete marker, e.g. "@SUBSECTION{Area}".
#
#  Every entry is a complete pair: a heading with nothing after it (at the
#  very end of html) gets an empty string as body instead of no body at all.
def split_subsects( html )
  ## note: the regex is wrapped in (exactly one) capture group;
  ##   String#split includes all capture groups in the result array,
  ##   that is, the headings are kept next to the text between them
  ## note: use non-greedy -- check: need to escape {} ??
  chunks = html.split( /(@SUBSECTION{.+?})/ )

  ## the first item is the text before the first heading (prolog);
  ##   remove it unless it mentions @SUBSECTION itself
  chunks.shift  unless chunks.first =~ /@SUBSECTION/

  ## note: String#split drops trailing empty strings, so the last heading
  ##   may come without a body (nil) - use an empty body then
  chunks.each_slice( 2 ).map { |heading, body| [heading, body.to_s] }
end