Class: Docx::Document

Inherits:
Object
  • Object
show all
Includes:
SimpleInspect
Defined in:
lib/docx/document.rb

Overview

The Document class wraps around a docx file and provides methods to interface with it.

# get a Docx::Document for a docx file in the local directory
doc = Docx::Document.open("test.docx")

# get the text from the document
puts doc.text

# do the same thing in a block
Docx::Document.open("test.docx") do |d|
  puts d.text
end

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Methods included from SimpleInspect

#inspect

Constructor Details

#initialize(path_or_io, options = {}) ⇒ Document

Returns a new instance of Document.



27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
# File 'lib/docx/document.rb', line 27

# Opens a docx archive and loads its main document part.
#
# @param path_or_io [String, Object] a String containing no NUL byte is opened
#   with Zip::File.open; any other value is handed to Zip::File.open_buffer
# @param options [Hash] accepted, but not read anywhere in this method
# @yield [self] the freshly loaded document, when a block is given
# @raise [Errno::ENOENT] when the archive has no entry matching
#   'word/document*.xml'
def initialize(path_or_io, options = {})
  @replace = {}

  opened_by_path = path_or_io.instance_of?(String) && !/\u0000/.match?(path_or_io)
  @zip = if opened_by_path
           Zip::File.open(path_or_io)
         else
           Zip::File.open_buffer(path_or_io)
         end

  main_part = @zip.glob('word/document*.xml').first
  raise Errno::ENOENT if main_part.nil?

  @document_xml = main_part.get_input_stream.read
  @doc = Nokogiri::XML(@document_xml)
  load_styles
  load_headers
  load_footers
  yield(self) if block_given?
ensure
  # Runs on success and on failure alike; @zip is still nil when opening raised.
  @zip&.close
end

Instance Attribute Details

#docObject (readonly)

Returns the value of attribute doc.



25
26
27
# File 'lib/docx/document.rb', line 25

# Reader for @doc, which the constructor assigns the result of
# Nokogiri::XML(@document_xml).
def doc
  @doc
end

#footersObject (readonly)

Returns the value of attribute footers.



25
26
27
# File 'lib/docx/document.rb', line 25

# Reader for @footers.
# NOTE(review): presumably populated by load_footers during construction;
# that method is not visible here, so confirm.
def footers
  @footers
end

#headersObject (readonly)

Returns the value of attribute headers.



25
26
27
# File 'lib/docx/document.rb', line 25

# Reader for @headers.
# NOTE(review): presumably populated by load_headers during construction;
# that method is not visible here, so confirm.
def headers
  @headers
end

#stylesObject (readonly)

Returns the value of attribute styles.



25
26
27
# File 'lib/docx/document.rb', line 25

# Reader for @styles. May be nil: font_size guards its use of @styles with
# safe navigation.
# NOTE(review): presumably populated by load_styles during construction;
# that method is not visible here, so confirm.
def styles
  @styles
end

#xmlObject (readonly)

Returns the value of attribute xml.



25
26
27
# File 'lib/docx/document.rb', line 25

# Reader for @xml.
# NOTE(review): the constructor shown in this file stores the raw document in
# @document_xml and never assigns @xml, so this may always return nil unless
# @xml is set elsewhere -- confirm.
def xml
  @xml
end

#zipObject (readonly)

Returns the value of attribute zip.



25
26
27
# File 'lib/docx/document.rb', line 25

# Reader for @zip, the archive handle the constructor obtains from
# Zip::File.open or Zip::File.open_buffer. The constructor's ensure clause
# calls close on it before returning.
def zip
  @zip
end

Class Method Details

.open(path, &block) ⇒ Object

With no associated block, Docx::Document.open is a synonym for Docx::Document.new. If the optional code block is given, it will be passed the opened docx file as an argument and the Docx::Document object will automatically be closed when the block terminates. The values of the block will be returned from Docx::Document.open. call-seq:

open(filepath) => file
open(filepath) {|file| block } => obj


62
63
64
# File 'lib/docx/document.rb', line 62

# Builds a Document for +path+ by delegating to +new+, forwarding any block
# unchanged.
#
# @param path [Object] passed straight through to +new+
# @return [Object] whatever +new+ returns
def self.open(path, &block)
  document = new(path, &block)
  document
end

Instance Method Details

#bookmarksObject



70
71
72
73
74
75
76
77
# File 'lib/docx/document.rb', line 70

# Collects the document's bookmarks into a Hash keyed by bookmark name.
# Every '//w:bookmarkStart' node is converted with parse_bookmark_from; the
# bookmark named '_GoBack' is left out. When a name occurs more than once the
# last occurrence wins.
#
# @return [Hash] bookmark name => parsed bookmark
def bookmarks
  @doc.xpath('//w:bookmarkStart').each_with_object({}) do |start_node, by_name|
    bookmark = parse_bookmark_from(start_node)
    # auto-generated by office 2010
    next if bookmark.name == '_GoBack'

    by_name[bookmark.name] = bookmark
  end
end

#default_paragraph_styleObject



179
180
181
# File 'lib/docx/document.rb', line 179

# Looks up the name of the style flagged as the default paragraph style
# (w:type='paragraph' and w:default='1') in @styles.
#
# Uses safe navigation, like font_size does, so a missing @styles or a
# stylesheet without a default paragraph style yields nil instead of raising
# NoMethodError.
#
# @return [String, nil] the w:val of the default style's w:name, or nil when
#   it cannot be found
def default_paragraph_style
  @styles&.at_xpath("w:styles/w:style[@w:type='paragraph' and @w:default='1']/w:name/@w:val")&.value
end

#document_propertiesObject

This stores the current global document properties, for now



51
52
53
54
55
56
# File 'lib/docx/document.rb', line 51

# Bundles the document-wide properties into one Hash.
#
# @return [Hash] with :font_size (result of font_size) and :hyperlinks
#   (result of hyperlinks), in that order
def document_properties
  properties = {}
  properties[:font_size] = font_size
  properties[:hyperlinks] = hyperlinks
  properties
end

#each_paragraphObject

Deprecated

Iterates over paragraphs within document call-seq:

each_paragraph => Enumerator


114
115
116
# File 'lib/docx/document.rb', line 114

# Yields each element of paragraphs in order.
#
# Without a block an Enumerator over the same elements is returned, matching
# the documented call-seq; previously a block-less call raised LocalJumpError
# as soon as there was a paragraph to yield.
#
# @yield [paragraph] each paragraph in turn
# @return [Enumerator] when no block is given
# @return [Object] the result of paragraphs.each when a block is given
def each_paragraph
  return to_enum(:each_paragraph) unless block_given?

  paragraphs.each { |paragraph| yield(paragraph) }
end

#font_sizeObject

Some documents have this set, others don’t. The value is stored in half-points, so it is divided by 2 to return points.



89
90
91
92
93
94
95
# File 'lib/docx/document.rb', line 89

# Reads the default run font size from @styles.
#
# The w:sz attribute holds half-points, so the value is halved. The division
# is Integer division, so an odd half-point value is rounded down.
#
# @return [Integer, nil] the size in points, or nil when @styles is nil or
#   the attribute is not present
def font_size
  half_points = @styles&.at_xpath('//w:docDefaults//w:rPrDefault//w:rPr//w:sz/@w:val')&.value

  half_points ? half_points.to_i / 2 : nil
end


104
105
106
# File 'lib/docx/document.rb', line 104

# Queries @rels for every Relationship element whose Type attribute contains
# the text 'hyperlink'.
#
# @return [Object] whatever @rels.xpath returns for that query
def hyperlink_relationships
  hyperlink_query = "//xmlns:Relationship[contains(@Type,'hyperlink')]"
  @rels.xpath(hyperlink_query)
end

Hyperlink targets are extracted from the document.xml.rels file



98
99
100
101
102
# File 'lib/docx/document.rb', line 98

# Maps each hyperlink relationship's Id attribute value to its Target
# attribute value. When an Id occurs more than once the last one wins.
#
# @return [Hash] relationship Id => Target
def hyperlinks
  targets_by_id = {}
  hyperlink_relationships.each do |relationship|
    attrs = relationship.attributes
    targets_by_id[attrs['Id'].value] = attrs['Target'].value
  end
  targets_by_id
end

#paragraphsObject



66
67
68
# File 'lib/docx/document.rb', line 66

# Converts every w:p node matched by '//w:document//w:body/w:p' with
# parse_paragraph_from.
#
# @return [Array] the parsed paragraphs, in the order xpath returned them
def paragraphs
  paragraph_nodes = @doc.xpath('//w:document//w:body/w:p')
  paragraph_nodes.map { |node| parse_paragraph_from(node) }
end

#replace_entry(entry_path, file_contents) ⇒ Object



175
176
177
# File 'lib/docx/document.rb', line 175

# Registers replacement contents for an archive entry. save and stream write
# the registered value instead of the entry's original contents.
#
# @param entry_path [String] the entry name used as the lookup key
# @param file_contents [Object] the contents to write for that entry
# @return [Object] file_contents
def replace_entry(entry_path, file_contents)
  @replace.store(entry_path, file_contents)
end

#save(path) ⇒ Object

Save document to provided path call-seq:

save(filepath) => void


132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
# File 'lib/docx/document.rb', line 132

# Writes the document to +path+ as a new archive.
#
# Runs inside with_zip64_disabled: calls update, then copies every file entry
# of zip to a Zip::OutputStream opened on +path+, substituting any contents
# registered in @replace, and finally closes zip.
#
# @param path [String] destination passed to Zip::OutputStream.open
# @return [Object] whatever with_zip64_disabled returns
def save(path)
  with_zip64_disabled do
    update
    Zip::OutputStream.open(path) do |out|
      zip.each do |entry|
        if entry.file?
          out.put_next_entry(entry.name)
          # A registered replacement takes precedence over the stored bytes.
          replacement = @replace[entry.name]
          out.write(replacement || zip.read(entry.name))
        end
      end
    end
    zip.close
  end
end

#streamObject

Output entire document as a StringIO object



151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
# File 'lib/docx/document.rb', line 151

# Builds the whole document archive in memory.
#
# Runs inside with_zip64_disabled: calls update, then copies every file entry
# of zip into the buffer created by Zip::OutputStream.write_buffer,
# substituting any contents registered in @replace. The buffer is rewound
# before it is handed back.
#
# @return [Object] whatever with_zip64_disabled returns for a block whose
#   value is the rewound buffer
def stream
  with_zip64_disabled do
    update
    buffer = Zip::OutputStream.write_buffer do |out|
      zip.each do |entry|
        next unless entry.file?

        out.put_next_entry(entry.name)
        # A registered replacement takes precedence over the stored bytes.
        out.write(@replace[entry.name] || zip.read(entry.name))
      end
    end

    buffer.rewind
    buffer
  end
end

#style_name_of(style_id) ⇒ Object



183
184
185
# File 'lib/docx/document.rb', line 183

# Resolves a style id to that style's name through styles_configuration.
#
# @param style_id [Object] passed to styles_configuration.style_of
# @return [Object] the +name+ of the style returned by style_of
def style_name_of(style_id)
  style = styles_configuration.style_of(style_id)
  style.name
end

#styles_configurationObject



187
188
189
# File 'lib/docx/document.rb', line 187

# Lazily builds and caches the StylesConfiguration for this document. The
# configuration is constructed from a dup of @styles and the same instance is
# returned on later calls.
#
# @return [Elements::Containers::StylesConfiguration]
def styles_configuration
  return @styles_configuration if @styles_configuration

  @styles_configuration = Elements::Containers::StylesConfiguration.new(@styles.dup)
end

#tablesObject



83
84
85
# File 'lib/docx/document.rb', line 83

# Converts every w:tbl node matched by '//w:document//w:body//w:tbl' with
# parse_table_from.
#
# @return [Array] the parsed tables, in the order xpath returned them
def tables
  table_nodes = @doc.xpath('//w:document//w:body//w:tbl')
  table_nodes.map { |node| parse_table_from(node) }
end

#to_htmlObject

Output entire document as a String HTML fragment



125
126
127
# File 'lib/docx/document.rb', line 125

# Renders the document as HTML by calling to_html on each paragraph and
# joining the results with newlines.
#
# @return [String] empty when there are no paragraphs
def to_html
  fragments = paragraphs.map { |paragraph| paragraph.to_html }
  fragments.join("\n")
end

#to_sObject Also known as: text

call-seq:

to_s -> string


120
121
122
# File 'lib/docx/document.rb', line 120

# Renders the document as plain text by calling to_s on each paragraph and
# joining the results with newlines.
#
# @return [String] empty when there are no paragraphs
def to_s
  lines = paragraphs.map { |paragraph| paragraph.to_s }
  lines.join("\n")
end

#to_xmlObject



79
80
81
# File 'lib/docx/document.rb', line 79

# Parses the raw XML captured in @document_xml with Nokogiri::XML. A new
# parse is performed on every call.
#
# @return [Object] the result of Nokogiri::XML(@document_xml)
def to_xml
  raw_xml = @document_xml
  Nokogiri::XML(raw_xml)
end