Class: DocbookStatus::Status

Inherits:
Object
  • Object
show all
Defined in:
lib/docbook_status/status.rb

Overview

Analyzes DocBook 5 documents for document structure (sections) and text length.

Constant Summary collapse

DOCBOOK_NS =

The DocBook 5 namespace URL

'http://docbook.org/ns/docbook'
XINCLUDE_NS =

The XInclude namespace URL

'http://www.w3.org/2001/XInclude'
STD_REMARK =

Standard remark keyword, if there is none entered

'REMARK'
EMPTY_REMARK =

Standard remark text for remarks without content

'-- EMPTY REMARK --'
@@text_elements =

Elements whose content is counted as text. The formalpara elements are included implicitly because they contain para child elements.

['para','simpara']
@@section_elements =

Section elements, following the list given in docbook.org/tdg5/en/html/ch02.html#roots except for the refsect… elements.

%w[
  acknowledgements appendix article
  bibliography book
  chapter colophon
  dedication
  glossary
  index
  part preface
  section sect1 sect2 sect3 sect4 set simplesect
  toc
]

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(fname = nil) ⇒ Status

Returns a new instance of Status.



49
50
51
52
53
54
55
56
57
# File 'lib/docbook_status/status.rb', line 49

# Sets up an analyzer, optionally bound to a source file.
#
# @param fname [String, nil] path of the document to analyze; when nil,
#   the source directory and file name are left nil as well
def initialize(fname=nil)
  @sections = []
  @remarks = []
  @source = fname
  if fname.nil?
    @source_dir = nil
    @source_file = nil
  else
    # Directory and base name are kept separately from the full path.
    @source_dir = File.dirname(fname)
    @source_file = File.basename(fname)
  end
  @doc = nil
  # Global parser setting; find_remarks_in_doc reads line_num from the nodes.
  XML.default_line_numbers = true
end

Instance Attribute Details

#docObject (readonly)

Returns the value of attribute doc.



47
48
49
# File 'lib/docbook_status/status.rb', line 47

# Reader for the @doc instance variable.
#
# @return [Object, nil] the value currently stored in @doc; it is nil after
#   construction and is assigned a parsed document by analyze_file and by
#   find_remarks (when a source file is set)
def doc
  @doc
end

Instance Method Details

#analyze_document(doc) ⇒ Object

Searches the XML document for sections and word counts. Returns an array of sections (map) with title, word count, section level and DocBook tag.



252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
# File 'lib/docbook_status/status.rb', line 252

# Walks the document and computes a word count for every section.
#
# The flat list produced by check_node is folded into section entries: each
# :para map adds its words to the section currently open, every other map
# closes that section and opens a new one. Afterwards the words of deeper
# sections are added to their enclosing sections via sum_sections.
#
# @param doc [Object] parsed document; must respond to +root+
# @return [Array<Hash>] one hash per section with the keys :title, :words,
#   :level and :tag; also stored in @sections. Empty if check_node reports
#   nothing at all.
def analyze_document(doc)
  # Bind the default namespace to the 'db' prefix used by the XPath expressions
  doc.root.namespaces.default_prefix = 'db'
  # Analyze the document starting with the root node
  doc_maps = check_node(doc.root, 0, [])
  @sections = []
  # Nothing was found: there is no leading map to open a section from
  return @sections if doc_maps.empty?

  head = doc_maps.first
  if head[:type] == :section
    current = {:title => head[:title], :words => 0, :level => 0, :tag => head[:name]}
    remaining = doc_maps.drop(1)
  else
    # The root is not a section element, so the first map is real content.
    # Open an untitled level-0 section and keep that map in the list,
    # otherwise its words would be lost.
    current = {:title => nil, :words => 0, :level => 0, :tag => nil}
    remaining = doc_maps
  end

  max_section_level = 0
  # Compute word counts per section
  remaining.each do |m|
    if m[:type] == :para
      current[:words] += m[:words]
    else
      @sections << current
      current = {:title => m[:title], :words => 0, :level => m[:level], :tag => m[:name]}
      max_section_level = m[:level] if m[:level] > max_section_level
    end
  end
  @sections << current

  # sum_sections stores the words of all deeper sections under :swords;
  # fold that into :words and drop the helper key again.
  @sections = sum_sections(@sections, max_section_level).map do |s|
    s[:words] += s[:swords]
    s.delete(:swords)
    s
  end
  @sections
end

#analyze_fileObject

Opens the XML document, checks for the DocBook 5 namespace and finally applies XInclude treatment to it if it has an XInclude namespace. Returns a map with the file name, the file’s modification time, and the section structure.

Raises:

  • (ArgumentError)


293
294
295
296
297
298
299
300
301
# File 'lib/docbook_status/status.rb', line 293

# Parses the file named by @source and analyzes its section structure.
#
# The parsed document is kept in @doc. XIncludes are resolved before the
# analysis if has_xinclude? reports an XInclude namespace.
#
# @return [Hash] :file (expanded path), :modified (mtime of the file) and
#   :sections (result of analyze_document)
# @raise [ArgumentError] if is_docbook? rejects the parsed document
def analyze_file
  absolute_path = File.expand_path(@source)
  mtime = File.mtime(@source)
  @doc = XML::Document.file(@source)
  unless is_docbook?(@doc)
    raise ArgumentError, "Error: #{@source} is apparently not DocBook 5."
  end

  @doc.xinclude if has_xinclude?(@doc)
  {:file => absolute_path, :modified => mtime, :sections => analyze_document(@doc)}
end

#check_node(node, level, ctr) ⇒ Object

Check the document elements for content and type recursively, starting at the current node. Returns an array with paragraph and section maps.



114
115
116
117
118
119
120
121
122
123
124
125
126
# File 'lib/docbook_status/status.rb', line 114

# Recursively collects paragraph and section maps, depth first.
#
# Text elements are recorded as :para maps with their word count and are
# not descended into. Section elements are recorded as :section maps with
# title and tag name. Sections and all other nodes have their children
# visited one level deeper.
#
# @param node [Object] current node (responds to name, children?, children)
# @param level [Integer] depth of +node+; the root is passed as 0
# @param ctr [Array<Hash>] accumulator that the maps are appended to
# @return [Array<Hash>] the accumulator +ctr+
def check_node(node, level, ctr)
  tag = node.name
  if @@text_elements.include?(tag)
    ctr << {:type => :para, :level => level, :words => count_content_words(node)}
    return ctr
  end

  if @@section_elements.include?(tag)
    ctr << {:type => :section, :level => level, :title => find_section_title(node), :name => tag}
  end
  # Sections and plain container elements are both searched for nested content
  node.children.each { |child| check_node(child, level + 1, ctr) } if node.children?
  ctr
end

#count_content_words(node) ⇒ Object

Counts the words in the contents of the given node. It is assumed that the node is a kind of pure content (a paragraph) and therefore everything in it should be included in the word count. An exception to this are remark elements, which are considered comments that are not meant for final publication.



87
88
89
90
91
92
# File 'lib/docbook_status/status.rb', line 87

# Word count of a content node, excluding the text of its remark children.
#
# @param node [Object] content node; must respond to +content+ (via
#   count_words) and to +find+
# @return [Integer] words of the node minus the words found in the nodes
#   matched by the XPath 'db:remark'
def count_content_words(node)
  total = count_words(node)
  # The node's own content includes the remark text, so it is taken off again
  remark_total = node.find('db:remark').map { |remark| count_words(remark) }.inject(0, :+)
  total - remark_total
end

#count_words(node) ⇒ Object

Counts the words in the contents of the given node. A word in this context is something that is delimited by space characters and contains at least one word character (in the regexp sense).



77
78
79
80
# File 'lib/docbook_status/status.rb', line 77

# Counts the words in the text content of a node.
#
# The content is split on runs of whitespace; a token counts as a word if
# it contains at least one word character. The Unicode-aware class
# [[:word:]] is used instead of \w, because \w only matches ASCII letters,
# digits and the underscore, which made words written purely in non-ASCII
# letters (e.g. Cyrillic or Greek) invisible to the count.
#
# @param node [Object] any object responding to +content+ with a String
# @return [Integer] number of whitespace-delimited tokens containing a
#   word character
def count_words(node)
  tokens = node.content.strip.split(/[[:space:]]+/)
  # Tokens consisting only of punctuation (dashes, bullets, ...) are skipped
  tokens.count { |token| token =~ /[[:word:]]/ }
end

#find_remarks(filter = []) ⇒ Object

Finds the remarks by looking through all the Xincluded files

The remarks returned can be filtered by keyword if a keyword array is passed as an argument.



199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
# File 'lib/docbook_status/status.rb', line 199

# Collects the remarks of the source file and of every XIncluded file.
#
# With a source file set, that file is parsed into @doc and searched first,
# followed by the files reported by find_xincludes; relative names are
# resolved against @source_dir. Without a source file only the XIncludes of
# the current @doc are searched, resolved against '.'. The complete list is
# stored in @remarks.
#
# @param filter [Array<String>] keywords to select, compared
#   case-insensitively; an empty array selects everything
# @return [Array<Hash>] all remarks, or the matching remarks grouped in the
#   order of the filter keywords
def find_remarks(filter=[])
  if @source.nil?
    base_dir = '.'
    files = find_xincludes(@doc)
  else
    base_dir = @source_dir
    @doc = XML::Document.file(@source)
    files = [@source_file] + find_xincludes(@doc)
  end

  @remarks = files.flat_map do |file|
    included = XML::Document.file(File.expand_path(file, base_dir))
    # Bind the default namespace to 'db' for the XPath lookup of remarks
    included.root.namespaces.default_prefix = 'db'
    find_remarks_in_doc(included, file)
  end
  return @remarks if filter.empty?

  filter.flat_map do |wanted|
    @remarks.select { |remark| wanted.casecmp(remark[:keyword]) == 0 }
  end
end

#find_remarks_in_doc(doc, source) ⇒ Object

Find all remark elements in the document and return a map for every such element. The map contains:

  • keyword: if the first word of the content is uppercase that is the keyword, else REMARK

  • text: the content of the remark element, minus the keyword

  • file: the name of the source file

  • line: the line number in the source file

OPTIMIZE look for ‘role’ attributes as keywords?



175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
# File 'lib/docbook_status/status.rb', line 175

# Builds one map per remark element found in the document.
#
# A leading run of uppercase letters that is followed by whitespace,
# punctuation or the end of the text is taken as the keyword and removed
# from the text; otherwise the keyword is STD_REMARK. The pattern is
# anchored with \A so that only the very first word qualifies: with a line
# anchor (^) an uppercase word at the start of a later line of a multi-line
# remark was taken as keyword and everything before it was discarded.
#
# @param doc [Object] parsed document; must respond to +find+
# @param source [String] file name recorded in every map
# @return [Array<Hash>] maps with :keyword, :text, :file and :line;
#   :text is EMPTY_REMARK for remarks without any content
def find_remarks_in_doc(doc,source)
  doc.find('//db:remark').map do |rem|
    text = rem.content.strip
    keyword = STD_REMARK
    if text.empty?
      text = EMPTY_REMARK
    else
      lead = text.match(/\A([[:upper:]]+)(?:[[:space:][:punct:]]|\z)/)
      if lead
        keyword = lead[1]
        # Drop the keyword together with its separator from the text
        text = lead.post_match.lstrip
      end
    end
    # TODO integrate XPath? :path => rem.path, :parent => rem.parent.path,
    {:keyword => keyword, :text => text, :file => source, :line => rem.line_num}
  end
end

#find_section_title(node) ⇒ Object

Find the title of the current section. That element is either directly following or inside an info element. Return the empty string if no title can be found.



98
99
100
101
102
103
104
105
106
107
108
# File 'lib/docbook_status/status.rb', line 98

# Looks up the title of a section node.
#
# A title child element ('./db:title') is preferred; the title inside an
# info child ('./db:info/db:title') is only consulted if there is none.
#
# @param node [Object] section node; must respond to +find_first+
# @return [String] content of the title element, or "" if none is found
def find_section_title(node)
  title_node = node.find_first('./db:title') || node.find_first('./db:info/db:title')
  title_node.nil? ? "" : title_node.content
end

#find_xincludes(doc) ⇒ Object

Finds and returns all XInclude files/URLs in a document.

OPTIMIZE implement xpointer and fallback handling for xi:include? see www.w3.org/TR/xinclude/



151
152
153
154
155
156
157
158
159
160
161
162
163
# File 'lib/docbook_status/status.rb', line 151

# Lists the href values of all xi:include elements of a document, followed
# by those of the included documents (recursively).
#
# An href that does not name an existing file as given is resolved against
# the directory of the including document before it is parsed. Includes
# without an href attribute are skipped. File.exist? is used because
# File.exists? was removed in Ruby 3.2.
#
# OPTIMIZE implement xpointer and fallback handling for xi:include?
#
# @param doc [Object] parsed document
# @return [Array<String>] hrefs as written in the documents; empty if
#   has_xinclude? is false for +doc+
def find_xincludes(doc)
  return [] unless has_xinclude?(doc)

  includes = doc.find('//xi:include', "xi:"+XINCLUDE_NS)
  # xi:include elements without href yield nil here; drop them
  hrefs = includes.map { |inc| inc.attributes['href'] }.compact
  nested = hrefs.flat_map do |href|
    path = File.exist?(href) ? href : File.expand_path(href, File.dirname(doc.root.base_uri))
    find_xincludes(XML::Document.file(path))
  end
  hrefs + nested
end

#has_xinclude?(doc) ⇒ Boolean

Check whether the document has an XInclude namespace

Returns:

  • (Boolean)


135
136
137
138
139
140
141
142
143
144
# File 'lib/docbook_status/status.rb', line 135

# Tells whether one of the namespaces reported for the root element is the
# XInclude namespace (compared case-insensitively with XINCLUDE_NS).
#
# @param doc [Object] parsed document
# @return [Boolean] true as soon as a matching namespace is seen
def has_xinclude?(doc)
  doc.root.namespaces.each do |namespace|
    return true if namespace.href.casecmp(XINCLUDE_NS) == 0
  end
  false
end

#is_docbook?(doc) ⇒ Boolean

Check whether the document has a DocBook default namespace

Returns:

  • (Boolean)


129
130
131
132
# File 'lib/docbook_status/status.rb', line 129

# Tells whether the default namespace of the root element is the DocBook 5
# namespace (compared case-insensitively with DOCBOOK_NS).
#
# @param doc [Object] parsed document
# @return [Boolean] false if there is no default namespace or it differs
def is_docbook?(doc)
  default_ns = doc.root.namespaces.default
  return false if default_ns.nil?

  default_ns.href.casecmp(DOCBOOK_NS) == 0
end

#remarks(keyword = nil) ⇒ Object

Return the remark-elements found in the document. If keyword is nil then return all remarks, else only the ones with the right keyword.



64
65
66
67
68
69
70
71
# File 'lib/docbook_status/status.rb', line 64

# Returns the remarks collected so far (the contents of @remarks).
#
# @param keyword [String, nil] when given, it is upcased and only remarks
#   whose :keyword equals the upcased value are returned
# @return [Array<Hash>] @remarks itself if +keyword+ is nil, otherwise a
#   new array with the matching remarks
def remarks(keyword=nil)
  return @remarks if keyword.nil?

  wanted = keyword.upcase
  @remarks.select { |remark| remark[:keyword] == wanted }
end

#sum_lower_sections(secs, start, level) ⇒ Object

Helper for sum_sections



222
223
224
225
226
227
228
229
230
# File 'lib/docbook_status/status.rb', line 222

# Adds up the :words of the consecutive entries that start at index +start+
# and have a :level greater than +level+.
#
# @param secs [Array<Hash>] section maps with :level and :words
# @param start [Integer] index of the first entry to look at
# @param level [Integer] level of the enclosing section
# @return [Array(Integer, Integer)] the sum, and the index of the first
#   entry that was not included (+start+ itself if nothing was included)
def sum_lower_sections(secs,start,level)
  total = 0
  pos = start
  # Stop at the end of the list or at the first entry that is not deeper
  until pos >= secs.length || secs[pos][:level] <= level
    total += secs[pos][:words]
    pos += 1
  end
  [total, pos]
end

#sum_sections(secs, max_level) ⇒ Object

Sum the word counts of lower sections



233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
# File 'lib/docbook_status/status.rb', line 233

# Annotates every section map with :swords, the total of the :words of the
# deeper sections that directly follow it (as computed by
# sum_lower_sections). The maps are modified in place.
#
# @param secs [Array<Hash>] section maps with :level and :words
# @param max_level [Integer] deepest level to process; levels 0 up to and
#   including this value are handled
# @return [Array<Hash>] the same array +secs+
def sum_sections(secs, max_level)
  (0..max_level).each do |depth|
    secs.each_with_index do |sec, idx|
      # Entries of other levels are handled in the pass for their own level
      next unless sec[:level] == depth

      subtotal, _next_idx = sum_lower_sections(secs, idx + 1, depth)
      sec[:swords] = subtotal
    end
  end
  secs
end