Class: Factbook::Builder
- Inherits:
-
Object
- Object
- Factbook::Builder
- Includes:
- LogUtils::Logging
- Defined in:
- lib/factbook-readers/builder.rb
Overview
todo: change to HtmlBuilder or PageBuilder ???
Constant Summary collapse
# Matches an <h2>...</h2> section heading; capture group 1 is the heading
# text with surrounding whitespace excluded.
# Flags: x (free-spacing, allows inline comments), i (case-insensitive),
#        m (dot also matches newlines).
# NOTE(review): the inline note says tags are not allowed inside, but `.+?`
#   itself does not exclude them - confirm the intended behavior.
H2_RE = /<h2>
          \s*
          (.+?)    ## note: use non-greedy; do NOT allow tags inside for now
          \s*
        <\/h2>
        /xim
# Matches an <h3>...</h3> subsection heading; capture group 1 is the heading
# text with surrounding whitespace excluded.
# Flags: x (free-spacing, allows inline comments), i (case-insensitive),
#        m (dot also matches newlines).
H3_RE = /<h3>
          \s*
          (.+?)    ## note: use non-greedy; allows tags inside - why? why not
          \s*
        <\/h3>
        /xim
Instance Attribute Summary collapse
-
#errors ⇒ Object
readonly
third value returned by Sanitizer#sanitize for the original page (presumably the sanitizer's errors).
-
#html ⇒ Object
readonly
page html as returned by Sanitizer#sanitize (first return value).
-
#html_debug ⇒ Object
readonly
sanitized html with headings replaced by @SECTION{…} / @SUBSECTION{…} markers (empty string for empty pages).
-
#html_original ⇒ Object
readonly
full “original” 1:1 page.
-
#info ⇒ Object
readonly
second value returned by Sanitizer#sanitize for the original page.
-
#sects ⇒ Object
readonly
sections (Sect objects) built from the page by the constructor.
Instance Method Summary collapse
-
#initialize(html_original) ⇒ Builder
constructor
A new instance of Builder.
- #map_sects(html) ⇒ Object
- #map_subsects(html) ⇒ Object
- #split_sects(html) ⇒ Object
- #split_subsects(html) ⇒ Object
Constructor Details
#initialize(html_original) ⇒ Builder
Returns a new instance of Builder.
17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 |
# File 'lib/factbook-readers/builder.rb', line 17

# Sanitizes the page, replaces section / subsection headings with
# @SECTION{...} / @SUBSECTION{...} markers and builds the list of sections.
#
# Sets @html_original, @html, @info, @errors, @html_debug and @sects.
# Note: still prints debug output (pp / puts) while building.
#
# @param html_original [String] page html as passed in by the caller
def initialize( html_original )
  @html_original = html_original

  @html, @info, @errors = Sanitizer.new.sanitize( @html_original )

  html_sects = if @html.empty?
                 ## note: support "empty" pages - old format waiting for update!!!
                 ##    cannot parse for now
                 @html_debug = ''
                 []   ## return empty (no) sections for now - sorry (its just one page with code cc anyway!!)
               else
                 @html_debug = map_sects( @html )
                 @html_debug = map_subsects( @html_debug )
                 split_sects( @html_debug )
               end

  pp html_sects

  ## debug
  ## File.open( 'tmp/br.debug.html', 'w:utf-8') { |f| f.write( @html_debug ) }

  @sects = []
  html_sects.each do |html_sect_head, html_subsects|
    puts html_sect_head
    puts html_subsects.size

    ## get section title
    ##  @SECTION{Economy}  => Economy
    sect_match = /@SECTION\{(.+?)\}/.match( html_sect_head )
    if sect_match.nil?
      ## warn/fix: no section title found
      next
    end

    sect_title = sect_match[1].strip
    puts sect_title

    sect = Sect.new
    sect.title = sect_title

    ## get subsections
    subsects = []
    html_subsects.each do |html_subsect_head, html_subsect_body|
      subsect_match = /@SUBSECTION\{(.+?)\}/.match( html_subsect_head )
      if subsect_match.nil?
        ## warn/fix: no subsection title found
        next
      end

      ## remove trailing : if present
      ##  (kept in its own variable so the section title is not overwritten)
      subsect_title = subsect_match[1].strip.sub( /:\z/, '' ).strip
      puts subsect_title

      subsect = Subsect.new
      subsect.title = subsect_title

      b = Factbook::ItemBuilder.new( html_subsect_body, subsect_title )
      subsect.data = b.read
      subsects << subsect
    end

    sect.subsects = subsects
    @sects << sect
  end
end
Instance Attribute Details
#errors ⇒ Object (readonly)
third value returned by Sanitizer#sanitize for the original page (presumably the sanitizer's errors)
9 10 11 |
# File 'lib/factbook-readers/builder.rb', line 9

# Returns @errors; the constructor assigns it from the third value
# returned by Sanitizer#sanitize.
def errors
  @errors
end
#html ⇒ Object (readonly)
page html as returned by Sanitizer#sanitize (first return value)
9 10 11 |
# File 'lib/factbook-readers/builder.rb', line 9

# Returns @html; the constructor assigns it from the first value
# returned by Sanitizer#sanitize.
def html
  @html
end
#html_debug ⇒ Object (readonly)
sanitized html with headings replaced by @SECTION{…} / @SUBSECTION{…} markers (empty string for empty pages)
9 10 11 |
# File 'lib/factbook-readers/builder.rb', line 9

# Returns @html_debug; the constructor sets it to the html after
# map_sects / map_subsects have run (or to '' when the page is empty).
def html_debug
  @html_debug
end
#html_original ⇒ Object (readonly)
full “original” 1:1 page
9 10 11 |
# File 'lib/factbook-readers/builder.rb', line 9

# Returns @html_original - the full "original" 1:1 page exactly as it was
# passed to the constructor.
def html_original
  @html_original
end
#info ⇒ Object (readonly)
second value returned by Sanitizer#sanitize for the original page
9 10 11 |
# File 'lib/factbook-readers/builder.rb', line 9

# Returns @info; the constructor assigns it from the second value
# returned by Sanitizer#sanitize.
def info
  @info
end
#sects ⇒ Object (readonly)
sections (Sect objects) built from the page by the constructor
9 10 11 |
# File 'lib/factbook-readers/builder.rb', line 9

# Returns @sects - the array of sections the constructor builds, one per
# @SECTION{...} marker whose title could be read.
def sects
  @sects
end
Instance Method Details
#map_sects(html) ⇒ Object
94 95 96 97 98 99 100 101 102 103 104 105 106 |
# File 'lib/factbook-readers/builder.rb', line 94

# Converts section titles to a "unified" marker,
#  e.g.
#    <h2>Introduction</h2>   =>   @SECTION{Introduction}
#
# Every match of H2_RE is replaced by the marker wrapped in blank lines;
# each match is also printed (debug output).
#
# @param html [String] page html
# @return [String] new string with the markers in place (input is not modified)
def map_sects( html )
  html.gsub( H2_RE ) do |m|
    title = Regexp.last_match( 1 )
    puts "** found section >#{title}<:"
    puts " >|#{m}|<"

    "\n\n@SECTION{#{title}}\n\n"
  end
end
#map_subsects(html) ⇒ Object
116 117 118 119 120 121 122 123 124 125 126 127 128 |
# File 'lib/factbook-readers/builder.rb', line 116

# Converts subsection titles to a "unified" marker,
#  e.g.
#    <h3>Disputes - international:</h3>   =>   @SUBSECTION{Disputes - international:}
#
# Every match of H3_RE is replaced by the marker wrapped in newlines;
# each match is also printed (debug output).
#
# @param html [String] page html
# @return [String] new string with the markers in place (input is not modified)
def map_subsects( html )
  html.gsub( H3_RE ) do |m|
    title = Regexp.last_match( 1 )
    puts "** found subsection >#{title}<:"
    puts " >|#{m}|<"

    "\n@SUBSECTION{#{title}}\n"
  end
end
#split_sects(html) ⇒ Object
132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 |
# File 'lib/factbook-readers/builder.rb', line 132

####
#  Splits html in sections (divided by @SECTION{...} headings); any text
#  before the first heading (prolog) is dropped. Returns
#    [[heading, subsects],
#     [heading, subsects],...]
#  where subsects is the result of split_subsects for the section body.
#
# @param html [String] html with @SECTION{...} / @SUBSECTION{...} markers
# @return [Array<Array>] one [heading, subsects] pair per section
def split_sects( html )
  ## note: "wrap" regex in a capture group (just one)
  ##   String#split will include all capture groups in the result array
  ## note: use non-greedy
  chunks = html.split( /(@SECTION\{.+?\})/ )

  ## remove the prolog: only a chunk that is a complete marker counts as a
  ##  heading - a prolog that merely mentions @SECTION would otherwise be
  ##  kept and shift all heading/body pairs by one
  chunks.shift unless chunks.first.to_s.match?( /\A@SECTION\{.+?\}\z/ )

  ## now split subsections
  ## todo: after cleanup prolog; remove @SECTION{} ?? - just keep title - why, why not??
  chunks.each_slice( 2 ).map do |heading, body|
    ## body is nil if the html ends right after a heading
    ##  (String#split drops trailing empty strings)
    [heading, split_subsects( body.to_s )]
  end
end
#split_subsects(html) ⇒ Object
162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 |
# File 'lib/factbook-readers/builder.rb', line 162

####
#  Splits html in subsections (divided by @SUBSECTION{...} headings); any
#  text before the first heading (prolog) is dropped. Returns
#    [[heading, body],
#     [heading, body],...]
#
# @param html [String] section html with @SUBSECTION{...} markers
# @return [Array<Array<String>>] one [heading, body] pair per subsection
def split_subsects( html )
  ## note: "wrap" regex in a capture group (just one)
  ##   String#split will include all capture groups in the result array
  ## note: use non-greedy
  chunks = html.split( /(@SUBSECTION\{.+?\})/ )

  ## remove the prolog: only a chunk that is a complete marker counts as a
  ##  heading - a prolog that merely mentions @SUBSECTION would otherwise be
  ##  kept and shift all heading/body pairs by one
  chunks.shift unless chunks.first.to_s.match?( /\A@SUBSECTION\{.+?\}\z/ )

  ## body is nil if the html ends right after a heading
  ##  (String#split drops trailing empty strings) - use an empty body instead
  chunks.each_slice( 2 ).map { |heading, body| [heading, body.to_s] }
end