Class: Docx::Document

Inherits:
Object
  • Object
show all
Defined in:
lib/docx/document.rb

Overview

The Document class wraps around a docx file and provides methods to interface with it.

# Open a docx file from the local directory as a Docx::Document
doc = Docx::Document.open("test.docx")

# Print the document's text (paragraphs joined by newlines)
puts doc.text

# Block form: the opened document is yielded to the block
Docx::Document.open("test.docx") do |d|
  puts d.text
end

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(path_or_io, options = {}) ⇒ Document

Returns a new instance of Document.



23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
# File 'lib/docx/document.rb', line 23

# Opens a docx archive, parses its main document part and loads its styles.
#
# The zip handle is closed before the constructor returns, whether or not a
# block was given and whether or not loading succeeded.
#
# @param path_or_io [String, Object] a filesystem path, or anything else,
#   which is handed to Zip::File.open_buffer
# @param options [Hash] accepted for interface compatibility; not read here
# @yield [self] the freshly loaded document, when a block is given
# @raise [Errno::ENOENT] when the archive has no word/document*.xml entry
def initialize(path_or_io, options = {})
  @replace = {}

  # A String without a null byte is treated as a path; everything else
  # (presumably an IO or a String of raw archive bytes) is opened as a buffer.
  if path_or_io.instance_of?(String) && !/\u0000/.match?(path_or_io)
    @zip = Zip::File.open(path_or_io)
  else
    @zip = Zip::File.open_buffer(path_or_io)
  end

  document = @zip.glob('word/document*.xml').first
  raise Errno::ENOENT if document.nil?

  @document_xml = document.get_input_stream.read
  @doc = Nokogiri::XML(@document_xml)
  load_styles
  yield(self) if block_given?
ensure
  # @zip is still nil when opening the archive itself failed; closing it
  # unconditionally would replace the real error with a NoMethodError.
  @zip&.close
end

Instance Attribute Details

#docObject (readonly)

Returns the value of attribute doc.



21
22
23
# File 'lib/docx/document.rb', line 21

# Reader for @doc, which the constructor sets to the Nokogiri::XML parse of
# the main document part.
def doc
  @doc
end

#stylesObject (readonly)

Returns the value of attribute styles.



21
22
23
# File 'lib/docx/document.rb', line 21

# Reader for @styles.
# NOTE(review): @styles is not assigned in the visible constructor body —
# presumably load_styles sets it; confirm.
def styles
  @styles
end

#xmlObject (readonly)

Returns the value of attribute xml.



21
22
23
# File 'lib/docx/document.rb', line 21

# Reader for @xml.
# NOTE(review): no visible code assigns @xml (the constructor stores the raw
# XML in @document_xml), so this may always return nil — confirm.
def xml
  @xml
end

#zipObject (readonly)

Returns the value of attribute zip.



21
22
23
# File 'lib/docx/document.rb', line 21

# Reader for @zip, the Zip::File handle opened by the constructor. The
# constructor closes this handle in its ensure clause before returning.
def zip
  @zip
end

Class Method Details

.open(path, &block) ⇒ Object

With no associated block, Docx::Document.open is a synonym for Docx::Document.new. If the optional code block is given, it will be passed the opened docx file as an argument, and the underlying zip file of the Docx::Document object will automatically be closed when the block terminates. Note that open simply delegates to Docx::Document.new, so the Docx::Document itself (not the value of the block) is returned. call-seq:

open(filepath) => file
open(filepath) {|file| block } => obj


56
57
58
# File 'lib/docx/document.rb', line 56

# Builds a Document by forwarding +path+ and the optional block to +new+.
# The return value is whatever +new+ returns.
def self.open(path, &block)
  new(path, &block)
end

Instance Method Details

#bookmarksObject



64
65
66
67
68
69
70
71
# File 'lib/docx/document.rb', line 64

# Collects the document's bookmarks, keyed by bookmark name.
#
# Every w:bookmarkStart node is parsed; the '_GoBack' bookmark that
# Office 2010 generates automatically is left out of the result.
#
# @return [Hash] bookmark name => parsed bookmark
def bookmarks
  @doc.xpath('//w:bookmarkStart').each_with_object({}) do |start_node, by_name|
    bookmark = parse_bookmark_from(start_node)
    # auto-generated by office 2010
    next if bookmark.name == '_GoBack'

    by_name[bookmark.name] = bookmark
  end
end

#document_propertiesObject

Returns the current global document properties; for now, these are the default font size and the hyperlink targets.



45
46
47
48
49
50
# File 'lib/docx/document.rb', line 45

# Gathers the document-wide properties currently exposed.
#
# @return [Hash] with :font_size (from #font_size) and :hyperlinks
#   (from #hyperlinks), in that order
def document_properties
  properties = {}
  properties[:font_size] = font_size
  properties[:hyperlinks] = hyperlinks
  properties
end

#each_paragraphObject

Deprecated

Iterates over the paragraphs within the document. call-seq:

each_paragraph => Enumerator


103
104
105
# File 'lib/docx/document.rb', line 103

# Deprecated: iterates over the document's paragraphs.
#
# Yields each element of +paragraphs+ in order. Without a block an
# Enumerator is returned, as the documented call-seq promises, instead of
# raising LocalJumpError on the first paragraph.
#
# @yield [paragraph] each paragraph in turn
# @return [Enumerator] when no block is given; otherwise the value of
#   +paragraphs.each+
def each_paragraph
  return enum_for(:each_paragraph) unless block_given?

  paragraphs.each { |para| yield(para) }
end

#font_sizeObject

Some documents have this set, others don’t. The value is stored in half-points, so it is divided by 2 to return the size in points.



79
80
81
82
83
84
# File 'lib/docx/document.rb', line 79

# Default font size taken from the styles' docDefaults, in points.
#
# The w:sz value is read as an integer number of half-points and halved
# with integer division.
#
# @return [Integer, nil] nil when no styles are loaded or no w:sz default
#   is present
def font_size
  return unless @styles

  sz_node = @styles.xpath('//w:docDefaults//w:rPrDefault//w:rPr//w:sz').first
  return unless sz_node

  half_points = sz_node.attributes['val'].value.to_i
  half_points / 2
end


93
94
95
# File 'lib/docx/document.rb', line 93

# Relationship nodes from @rels whose Type attribute contains 'hyperlink'.
#
# @return the result of the XPath query against @rels
def hyperlink_relationships
  hyperlink_query = "//xmlns:Relationship[contains(@Type,'hyperlink')]"
  @rels.xpath(hyperlink_query)
end

Hyperlink targets are extracted from the document.xml.rels file



87
88
89
90
91
# File 'lib/docx/document.rb', line 87

# Maps each hyperlink relationship's Id to its Target.
#
# @return [Hash] relationship Id value => Target value; when an Id repeats,
#   the last relationship wins
def hyperlinks
  id_target_pairs = hyperlink_relationships.map do |rel|
    [rel.attributes['Id'].value, rel.attributes['Target'].value]
  end
  id_target_pairs.to_h
end

#paragraphsObject



60
61
62
# File 'lib/docx/document.rb', line 60

# Parses every w:p node that sits directly under the document body.
#
# @return the parsed paragraphs, one per matching node, in document order
def paragraphs
  paragraph_nodes = @doc.xpath('//w:document//w:body/w:p')
  paragraph_nodes.map { |node| parse_paragraph_from(node) }
end

#replace_entry(entry_path, file_contents) ⇒ Object



162
163
164
# File 'lib/docx/document.rb', line 162

# Registers replacement contents for an archive entry. #save and #stream
# write these contents instead of the entry's archived bytes.
#
# @param entry_path [String] entry name inside the archive
# @param file_contents [String] contents to write for that entry
# @return the stored +file_contents+
def replace_entry(entry_path, file_contents)
  @replace.store(entry_path, file_contents)
end

#save(path) ⇒ Object

Saves the document to the provided path. call-seq:

save(filepath) => void


121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
# File 'lib/docx/document.rb', line 121

# Writes the document to +path+ as a new zip archive.
#
# Calls +update+ first, then copies every file entry of the source archive
# in order. An entry with a truthy value in @replace is written with that
# value instead of its archived bytes. The source zip is closed afterwards.
#
# @param path [String] destination passed to Zip::OutputStream.open
def save(path)
  update
  Zip::OutputStream.open(path) do |out|
    zip.each do |entry|
      if entry.file?
        entry_name = entry.name
        out.put_next_entry(entry_name)
        out.write(@replace[entry_name] || zip.read(entry_name))
      end
    end
  end
  zip.close
end

#streamObject

Output entire document as a StringIO object



140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
# File 'lib/docx/document.rb', line 140

# Builds the whole document as an in-memory zip archive.
#
# Calls +update+ first, then copies every file entry of the source archive
# in order. An entry with a truthy value in @replace is written with that
# value instead of its archived bytes.
#
# @return the buffer produced by Zip::OutputStream.write_buffer, rewound
#   to its start
def stream
  update
  buffer = Zip::OutputStream.write_buffer do |out|
    zip.each do |entry|
      if entry.file?
        entry_name = entry.name
        out.put_next_entry(entry_name)
        out.write(@replace[entry_name] || zip.read(entry_name))
      end
    end
  end

  buffer.tap(&:rewind)
end

#tablesObject



73
74
75
# File 'lib/docx/document.rb', line 73

# Parses every w:tbl node found anywhere inside the document body.
#
# @return the parsed tables, one per matching node, in document order
def tables
  table_nodes = @doc.xpath('//w:document//w:body//w:tbl')
  table_nodes.map { |node| parse_table_from(node) }
end

#to_htmlObject

Output entire document as a String HTML fragment



114
115
116
# File 'lib/docx/document.rb', line 114

# Renders the document as an HTML fragment.
#
# @return [String] each paragraph's +to_html+ output, joined by newlines
def to_html
  html_parts = paragraphs.map { |para| para.to_html }
  html_parts.join("\n")
end

#to_sObject Also known as: text

call-seq:

to_s -> string


109
110
111
# File 'lib/docx/document.rb', line 109

# Renders the document as plain text.
#
# @return [String] each paragraph's +to_s+ output, joined by newlines
def to_s
  text_lines = paragraphs.map { |para| para.to_s }
  text_lines.join("\n")
end