Class: DocbookStatus::Status

Inherits:
Object
  • Object
show all
Defined in:
lib/docbook_status/status.rb

Overview

Analyzes DocBook 5 documents for document structure (sections) and text length.

Constant Summary collapse

DOCBOOK_NS =

The DocBook 5 namespace URL

'http://docbook.org/ns/docbook'
XINCLUDE_NS =

The XInclude namespace URL

'http://www.w3.org/2001/XInclude'
@@text_elements =

Elements whose content is counted as text. The formalpara elements are included implicitly because they contain para child elements.

['para','simpara']
@@section_elements =

Section elements, following the list given in docbook.org/tdg5/en/html/ch02.html#roots except for the refsect… elements.

%w[
  acknowledgements appendix article
  bibliography book
  chapter colophon
  dedication
  glossary
  index
  part preface
  section sect1 sect2 sect3 sect4 set simplesect
  toc
]

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(fname = nil) ⇒ Status

Returns a new instance of Status.



41
42
43
44
45
46
47
48
49
# File 'lib/docbook_status/status.rb', line 41

# Prepares an empty analysis state for a DocBook source file.
#
# fname - path of the source file, or nil. With nil, the source
#         directory and base name stay nil as well.
#
# Also switches on XML.default_line_numbers, a setting on the XML module.
def initialize(fname=nil)
  @source = fname
  @source_dir = nil
  @source_file = nil
  unless fname.nil?
    @source_dir = File.dirname(fname)
    @source_file = File.basename(fname)
  end
  @doc = nil
  @sections = []
  @remarks = []
  XML.default_line_numbers = true
end

Instance Attribute Details

#docObject (readonly)

Returns the value of attribute doc.



39
40
41
# File 'lib/docbook_status/status.rb', line 39

# Reader for the XML document stored in @doc.
def doc; @doc; end

Instance Method Details

#analyze_document(doc) ⇒ Object

Searches the XML document for sections and word counts. Returns an array of sections (map) with title, word count, section level and DocBook tag.



211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
# File 'lib/docbook_status/status.rb', line 211

# Walks the XML document and computes the word count of every section.
#
# doc - the parsed document; its root's default namespace gets the 'db'
#       prefix so that the XPath expressions used elsewhere can resolve.
#
# Returns (and stores in @sections) an array of maps with the keys
# :title, :words, :level and :tag. The first entry always represents the
# document as a whole and carries the total word count. Every further
# entry carries only the words found before the next section starts.
#
# If the flattened node list does not start with a section entry (it is
# empty, or paragraphs come first), the first entry gets nil title/tag and
# no paragraph is skipped. Previously this case either raised
# NoMethodError or lost the first paragraph's words.
def analyze_document(doc)
  # Add a namespace declaration for XPath expressions
  doc.root.namespaces.default_prefix = 'db'
  # Flatten the document into paragraph and section maps, in document order
  doc_maps = check_node(doc.root, 0, [])

  head = doc_maps.first
  if head && head[:type] == :section
    current = {:title => head[:title], :words => 0, :level => 0, :tag => head[:name]}
    rest = doc_maps.drop(1)
  else
    current = {:title => nil, :words => 0, :level => 0, :tag => nil}
    rest = doc_maps
  end

  @sections = [current]
  doc_ctr = 0
  rest.each do |m|
    if m[:type] == :para
      doc_ctr += m[:words]
      current[:words] += m[:words]
    else
      # A new section starts; following paragraphs are counted for it
      current = {:title => m[:title], :words => 0, :level => m[:level], :tag => m[:name]}
      @sections << current
    end
  end

  # Put the document word count near the document type
  @sections.first[:words] = doc_ctr
  @sections
end

#analyze_fileObject

Opens the XML document, checks for the DocBook 5 namespace and finally applies XInclude processing to it if it declares the XInclude namespace. Returns a map with the file name, the file’s modification time, and the section structure.

Raises:

  • (ArgumentError)


247
248
249
250
251
252
253
254
255
# File 'lib/docbook_status/status.rb', line 247

# Parses the file named by @source, verifies that it is DocBook 5,
# resolves XIncludes if the document declares the XInclude namespace,
# and analyzes the section structure.
#
# Returns a map with :file (absolute path), :modified (the file's
# modification time) and :sections (result of analyze_document).
#
# Raises ArgumentError if the document does not have the DocBook 5
# default namespace.
def analyze_file
  full_name = File.expand_path(@source)
  # mtime, not ctime: ctime also changes on chmod/rename and would not
  # reflect when the content was last modified.
  modified = File.mtime(@source)
  @doc = XML::Document.file(@source)
  raise ArgumentError, "Error: #{@source} is apparently not DocBook 5." unless is_docbook?(@doc)
  @doc.xinclude if has_xinclude?(@doc)
  {:file => full_name, :modified => modified, :sections => analyze_document(@doc)}
end

#check_node(node, level, ctr) ⇒ Object

Check the document elements for content and type recursively, starting at the current node. Returns an array with paragraph and section maps.



106
107
108
109
110
111
112
113
114
115
116
117
118
# File 'lib/docbook_status/status.rb', line 106

# Recursively collects paragraph and section maps, in document order.
#
# node  - the element to inspect
# level - nesting depth of node (children are visited with level + 1)
# ctr   - the array the maps are appended to
#
# A text element contributes a :para map with its word count and is not
# descended into. A section element contributes a :section map with its
# title and tag name. Sections and all other elements are searched
# recursively.
#
# Returns ctr.
def check_node(node, level, ctr)
  tag = node.name

  if @@text_elements.include?(tag)
    ctr << {:type => :para, :level => level, :words => count_content_words(node)}
    return ctr
  end

  if @@section_elements.include?(tag)
    heading = find_section_title(node)
    ctr << {:type => :section, :level => level, :title => heading, :name => tag}
  end

  if node.children?
    node.children.each { |child| check_node(child, level + 1, ctr) }
  end

  ctr
end

#count_content_words(node) ⇒ Object

Counts the words in the contents of the given node. It is assumed that the node is a kind of pure content (a paragraph) and therefore everything in it should be included in the word count. An exception to this are remark elements, which are considered comments not meant for final publication.



79
80
81
82
83
84
# File 'lib/docbook_status/status.rb', line 79

# Word count of a content node, excluding the words of the remark
# elements that the XPath 'db:remark' finds relative to the node.
#
# Returns the node's total word count minus the remark word count.
def count_content_words(node)
  total = count_words(node)
  remark_total = 0
  # Remarks are counted separately so they can be taken off the total
  node.find('db:remark').each { |remark| remark_total += count_words(remark) }
  total - remark_total
end

#count_words(node) ⇒ Object

Counts the words in the contents of the given node. A word in this context is anything delimited by space characters that contains at least one word character (in the regexp sense).



69
70
71
72
# File 'lib/docbook_status/status.rb', line 69

# Counts the words in the text content of the given node.
#
# The content is split on runs of whitespace; a token counts as a word
# if it contains at least one word character. The Unicode-aware
# [[:word:]] class is used instead of \w, because \w only matches ASCII
# characters in Ruby and would skip words written entirely in
# non-ASCII scripts.
#
# Returns the number of words as an Integer.
def count_words(node)
  tokens = node.content.strip.split(/[[:space:]]+/)
  tokens.count { |token| token =~ /[[:word:]]/ }
end

#find_remarks(filter = []) ⇒ Object

Finds the remarks by looking through all the Xincluded files



186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
# File 'lib/docbook_status/status.rb', line 186

# Collects the remarks of the source document and of every file it
# pulls in via XInclude, stores them in @remarks and returns them.
#
# When @source is set, the file is (re)parsed into @doc and searched as
# well; otherwise only the XIncludes of the current @doc are searched,
# resolved against the current directory.
#
# filter - optional list of keywords. If non-empty, only remarks whose
#          keyword matches one of them (case-insensitively) are
#          returned, grouped in the order of the filter entries.
def find_remarks(filter=[])
  @doc = XML::Document.file(@source) unless @source.nil?
  included = find_xincludes(@doc)
  files = @source.nil? ? included : [@source_file] + included
  base_dir = @source.nil? ? '.' : @source_dir

  collected = []
  files.each do |file|
    ind = XML::Document.file(File.expand_path(file, base_dir))
    ind.root.namespaces.default_prefix = 'db'
    collected << find_remarks_in_doc(ind, file)
  end
  @remarks = collected.flatten

  return @remarks if filter.empty?

  selected = []
  filter.each do |wanted|
    selected.concat(@remarks.find_all { |r| wanted.casecmp(r[:keyword]) == 0 })
  end
  selected
end

#find_remarks_in_doc(doc, source) ⇒ Object

Find all remark elements in the document and return a map for every such element. The map contains:

  • keyword: if the first word of the content is uppercase that is the keyword, else REMARK

  • text: the content of the remark element, minus the keyword

  • file: the name of the source file

  • line: the line number in the source file

OPTIMIZE look for ‘role’ attributes as keywords?



167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
# File 'lib/docbook_status/status.rb', line 167

# Finds all remark elements in the document and returns one map per
# element with the keys:
#
# :keyword - the leading all-uppercase word of the remark text, if the
#            remark starts with a text node; otherwise 'REMARK'
# :text    - the stripped remark content, without the keyword
# :file    - the given source name
# :line    - the line number reported by the remark node
#
# doc    - the document to search (must resolve the 'db' prefix)
# source - name of the file the document was read from
#
# The keyword is anchored to the very beginning of the text (\A), so an
# uppercase word at the start of a later line is not mistaken for the
# keyword. Empty remark elements (no first child) are reported with the
# default keyword instead of raising.
def find_remarks_in_doc(doc,source)
  doc.find('//db:remark').map do |rem|
    text = rem.content.strip
    keyword = 'REMARK'
    first_child = rem.first
    if first_child && first_child.text?
      found = text.match(/\A([[:upper:]]+)([[:space:][:punct:]]|\z)/)
      unless found.nil?
        keyword = found[1]
        text = found.post_match.lstrip
      end
    end
    # TODO integrate XPath? :path => rem.path, :parent => rem.parent.path,
    {:keyword => keyword, :text => text, :file => source, :line => rem.line_num}
  end
end

#find_section_title(node) ⇒ Object

Find the title of the current section. That element is either directly following or inside an info element. Return the empty string if no title can be found.



90
91
92
93
94
95
96
97
98
99
100
# File 'lib/docbook_status/status.rb', line 90

# Looks up the title of a section node.
#
# The title is taken from a direct title child or, failing that, from
# a title inside a direct info child.
#
# Returns the title's content, or the empty string if neither exists.
def find_section_title(node)
  heading = node.find_first('./db:title')
  heading = node.find_first('./db:info/db:title') if heading.nil?
  return "" if heading.nil?
  heading.content
end

#find_xincludes(doc) ⇒ Object

Finds and returns all XInclude files/URLs in a document.

OPTIMIZE implement xpointer and fallback handling for xi:include? see www.w3.org/TR/xinclude/



143
144
145
146
147
148
149
150
151
152
153
154
155
# File 'lib/docbook_status/status.rb', line 143

# Finds and returns all XInclude files/URLs referenced by a document,
# following includes recursively into the included documents.
#
# doc - the parsed document to inspect
#
# Returns an array with the href values of the document's own
# xi:include elements first, followed by the hrefs found in the included
# documents. Returns [] if the document does not declare the XInclude
# namespace. xi:include elements without an href attribute are skipped.
#
# OPTIMIZE implement xpointer and fallback handling for xi:include?
# see www.w3.org/TR/xinclude/
def find_xincludes(doc)
  return [] unless has_xinclude?(doc)

  includes = doc.find('//xi:include', "xi:#{XINCLUDE_NS}")
  # An include may lack href (e.g. xpointer only); there is no file to follow then
  hrefs = includes.map { |inc| inc.attributes['href'] }.compact

  nested = hrefs.map do |href|
    # File.exist? -- the File.exists? alias was removed in Ruby 3.2.
    # Hrefs that do not exist as given are resolved against the including document.
    path = File.exist?(href) ? href : File.expand_path(href, File.dirname(doc.root.base_uri))
    find_xincludes(XML::Document.file(path))
  end

  hrefs + nested.flatten
end

#has_xinclude?(doc) ⇒ Boolean

Check whether the document has a XInclude namespace

Returns:

  • (Boolean)


127
128
129
130
131
132
133
134
135
136
# File 'lib/docbook_status/status.rb', line 127

# Tells whether one of the namespaces of the document's root element is
# the XInclude namespace (compared case-insensitively).
#
# Returns true or false.
def has_xinclude?(doc)
  doc.root.namespaces.each do |namespace|
    return true if namespace.href.casecmp(XINCLUDE_NS) == 0
  end
  false
end

#is_docbook?(doc) ⇒ Boolean

Check whether the document has a DocBook default namespace

Returns:

  • (Boolean)


121
122
123
124
# File 'lib/docbook_status/status.rb', line 121

# Tells whether the default namespace of the document's root element is
# the DocBook 5 namespace (compared case-insensitively).
#
# Returns false when the root has no default namespace.
def is_docbook?(doc)
  default_ns = doc.root.namespaces.default
  return false if default_ns.nil?
  default_ns.href.casecmp(DOCBOOK_NS) == 0
end

#remarks(keyword = nil) ⇒ Object

Return the remark-elements found in the document. If keyword is nil then return all remarks, else only the ones with the right keyword.



56
57
58
59
60
61
62
63
# File 'lib/docbook_status/status.rb', line 56

# Returns the remarks stored in @remarks.
#
# keyword - nil to get every remark; otherwise only remarks whose
#           :keyword equals the upcased keyword are returned.
def remarks(keyword=nil)
  return @remarks if keyword.nil?

  wanted = keyword.upcase
  @remarks.select { |remark| remark[:keyword] == wanted }
end