Class: Uniword::Docx::Package

Inherits:
Lutaml::Model::Serializable
  • Object
show all
Includes:
PackageDefaults, PackageSerialization
Defined in:
lib/uniword/docx/package.rb

Overview

DOCX Package - Complete DOCX file format model

Represents the entire .docx file structure as a lutaml-model object. Each XML file within the ZIP is a separate lutaml-model class.

A DOCX package CONTAINS OOXML markup wrapped in an OPC ZIP container. This class lives in Uniword::Docx, not Uniword::Ooxml, because DOCX is a file format that uses OOXML, not the other way around.

Examples:

Load DOCX

package = Package.from_file('document.docx')
package.core_properties.title = 'New Title'
package.to_file('output.docx')

Access document content

package = Package.from_file('document.docx')
package.document.body.paragraphs.each { |p| puts p.text }

Constant Summary

Constants included from PackageDefaults

Uniword::Docx::PackageDefaults::DOCUMENT_TO_PACKAGE_MAPPINGS

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Methods included from PackageSerialization

#inject_part_relationships, #serialize_infrastructure, #serialize_package_parts, #serialize_part

Methods included from PackageDefaults

included

Instance Attribute Details

#bibliography_sources ⇒ Object

Non-serialized attributes (DOCX packaging helpers)



89
90
91
# File 'lib/uniword/docx/package.rb', line 89

# Reader for the non-serialized +bibliography_sources+ packaging helper.
#
# @return [Object, nil] whatever is currently held in @bibliography_sources
def bibliography_sources = @bibliography_sources

#chart_parts ⇒ Object

Non-serialized attributes (DOCX packaging helpers)



89
90
91
# File 'lib/uniword/docx/package.rb', line 89

# Reader for the non-serialized +chart_parts+ packaging helper.
#
# @return [Object, nil] whatever is currently held in @chart_parts
def chart_parts = @chart_parts

#custom_xml_items ⇒ Object

Custom XML data items (customXml/item*.xml)



50
51
52
# File 'lib/uniword/docx/package.rb', line 50

# Reader for the custom XML data items (customXml/item*.xml).
#
# @return [Object, nil] whatever is currently held in @custom_xml_items
def custom_xml_items = @custom_xml_items

#embeddings ⇒ Object

Returns the value of attribute embeddings.



90
91
92
# File 'lib/uniword/docx/package.rb', line 90

# Reader for the +embeddings+ attribute.
#
# @return [Object, nil] whatever is currently held in @embeddings
def embeddings = @embeddings

#profile ⇒ Object

Non-serialized attributes (DOCX packaging helpers)



89
90
91
# File 'lib/uniword/docx/package.rb', line 89

# Reader for the non-serialized +profile+ packaging helper.
#
# @return [Object, nil] whatever is currently held in @profile
def profile = @profile

#settings_rels ⇒ Object

Returns the value of attribute settings_rels.



90
91
92
# File 'lib/uniword/docx/package.rb', line 90

# Reader for the +settings_rels+ attribute.
#
# @return [Object, nil] whatever is currently held in @settings_rels
def settings_rels = @settings_rels

Class Method Details



296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
# File 'lib/uniword/docx/package.rb', line 296

# Collects word/headerN.xml and word/footerN.xml entries that are referenced
# from the document relationships and appends one descriptor hash per part to
# +package.document.header_footer_parts+.
#
# Does nothing when the package has no document or no document relationships,
# or when the archive contains no header/footer entries. Entries without a
# matching relationship (same target and a header/footer relationship type)
# are skipped.
#
# @param zip_content [Hash] extracted ZIP entries keyed by entry path
# @param package [Package] package whose document receives the parts
def self.extract_header_footer_parts(zip_content, package)
  return if !package.document || !package.document_rels

  entry_names = zip_content.keys
  header_files = entry_names.grep(%r{^word/header\d+\.xml$})
  footer_files = entry_names.grep(%r{^word/footer\d+\.xml$})
  return if [header_files, footer_files].all?(&:empty?)

  package.document.header_footer_parts ||= []

  # Shared walk for both kinds. The parser is passed as a lambda so the
  # Header/Footer constant is only touched once a relationship has matched.
  register = lambda do |paths, rel_type_marker, part_content_type, parse|
    paths.each do |path|
      target = path.sub("word/", "")
      rel = package.document_rels.relationships.find do |candidate|
        candidate.target == target && candidate.type.to_s.include?(rel_type_marker)
      end
      next unless rel

      package.document.header_footer_parts << {
        r_id: rel.id,
        target: target,
        rel_type: rel.type,
        content_type: part_content_type,
        content: parse.call(zip_content[path]),
      }
    end
  end

  register.call(
    header_files.sort,
    "officeDocument/2006/relationships/header",
    "application/vnd.openxmlformats-officedocument.wordprocessingml.header+xml",
    ->(xml) { Uniword::Wordprocessingml::Header.from_xml(xml) }
  )
  register.call(
    footer_files.sort,
    "officeDocument/2006/relationships/footer",
    "application/vnd.openxmlformats-officedocument.wordprocessingml.footer+xml",
    ->(xml) { Uniword::Wordprocessingml::Footer.from_xml(xml) }
  )
end

.extract_image_parts(zip_content, package, zip_path = nil) ⇒ Object

Extract image files from word/media/ directory in DOCX

Parameters:

  • zip_content (Hash)

    Extracted ZIP content (may have corrupted binary)

  • package (Package)

    Package to populate

  • zip_path (String, nil) (defaults to: nil)

    Original ZIP path for binary re-extraction



346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
# File 'lib/uniword/docx/package.rb', line 346

# Registers every entry under word/media/ on +package.document.image_parts+.
#
# Each entry is stored under a generated key "rIdImg<N>" (N is the current
# size of image_parts plus one) with its data, its target relative to word/,
# and a content type derived from the lower-cased file extension.
#
# @param zip_content [Hash] extracted ZIP content (may have corrupted binary)
# @param package [Package] package to populate
# @param zip_path [String, nil] original ZIP path; when given, bytes are
#   re-read through read_binary_from_zip instead of taken from +zip_content+
def self.extract_image_parts(zip_content, package, zip_path = nil)
  document = package.document
  return unless document

  media_paths = zip_content.keys.grep(%r{^word/media/.+$})
  return if media_paths.empty?

  document.image_parts ||= {}

  # Extensions with a fixed MIME type; anything else becomes "image/<ext>".
  known_types = {
    "jpg" => "image/jpeg",
    "jpeg" => "image/jpeg",
    "png" => "image/png",
    "gif" => "image/gif",
    "bmp" => "image/bmp",
    "tiff" => "image/tiff",
    "tif" => "image/tiff",
    "svg" => "image/svg+xml"
  }

  media_paths.each do |entry_path|
    filename = File.basename(entry_path)
    ext = File.extname(filename).delete(".").downcase
    mime = known_types.fetch(ext) { "image/#{ext}" }

    r_id = "rIdImg#{document.image_parts.size + 1}"
    bytes = zip_path ? read_binary_from_zip(zip_path, entry_path) : zip_content[entry_path]

    document.image_parts[r_id] = {
      data: bytes,
      target: "media/#{filename}",
      content_type: mime
    }
  end
end

.extract_theme_media(zip_content) ⇒ Object

Extract media files from word/theme/media/ directory



410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
# File 'lib/uniword/docx/package.rb', line 410

# Builds a hash of theme media files found under word/theme/media/.
#
# Keys are the entry paths relative to word/theme/media/; values are
# Uniword::Themes::MediaFile objects built from the entry's content and
# original path.
#
# @param zip_content [Hash] extracted ZIP entries keyed by entry path
# @return [Hash] filename => MediaFile (empty when nothing matches)
def self.extract_theme_media(zip_content)
  zip_content.keys.each_with_object({}) do |entry_path, collected|
    match = entry_path.match(%r{^word/theme/media/(.+)$})
    next unless match

    name = match[1]
    collected[name] = Uniword::Themes::MediaFile.new(
      filename: name,
      content: zip_content[entry_path],
      source_path: entry_path
    )
  end
end

.find_document_rels_path(doc_path) ⇒ Object

Find the document relationships path from the main document path



501
502
503
504
505
506
507
# File 'lib/uniword/docx/package.rb', line 501

# Derives the relationships-part path that belongs to the main document:
# "<dir>/_rels/<basename>.rels".
#
# When the document sits at the package root, File.dirname reports "."; the
# result is then "_rels/<basename>.rels" rather than "./_rels/...", because
# the value is used as a lookup key into the extracted ZIP content and a
# "./" prefix would never match an entry name.
#
# @param doc_path [String, nil] main document path inside the package
# @return [String, nil] relationships path, or nil when +doc_path+ is nil
def self.find_document_rels_path(doc_path)
  return nil unless doc_path

  dir, basename = File.split(doc_path)
  rels_name = "#{basename}.rels"
  return File.join("_rels", rels_name) if dir == "."

  File.join(dir, "_rels", rels_name)
end

.find_main_document_path(package_rels) ⇒ Object

Find the main document path from package relationships



487
488
489
490
491
492
493
494
495
496
497
498
# File 'lib/uniword/docx/package.rb', line 487

# Looks up the officeDocument relationship in the package relationships and
# returns its target with a single leading "/" removed.
#
# @param package_rels [#relationships, nil] package-level relationships
# @return [String, nil] main document path, or nil when there are no
#   relationships, no officeDocument relationship, or the match has no target
def self.find_main_document_path(package_rels)
  relationships = package_rels&.relationships
  return nil unless relationships

  office_doc = relationships.find do |candidate|
    candidate.type.to_s.include?("officeDocument/2006/relationships/officeDocument")
  end
  target = office_doc&.target
  return nil unless target

  # Non-destructive: the relationship's own target string is left untouched.
  target.sub(%r{^/}, "")
end

.from_file(path) ⇒ Package

Load DOCX package from file

Parameters:

  • path (String)

    Path to .docx file

Returns:

  • (Package)

    Package with all parts loaded



96
97
98
99
100
# File 'lib/uniword/docx/package.rb', line 96

# Loads a DOCX package from a file: extracts the archive with
# Infrastructure::ZipExtractor and hands the result, together with the
# original path, to from_zip_content.
#
# @param path [String] path to the .docx file
# @return [Package] whatever from_zip_content builds
def self.from_file(path)
  entries = Infrastructure::ZipExtractor.new.extract(path)
  from_zip_content(entries, path)
end

.from_zip_content(zip_content, zip_path = nil) ⇒ Package

Create package from extracted ZIP content

Parameters:

  • zip_content (Hash)

    Extracted ZIP files

  • zip_path (String, nil) (defaults to: nil)

    Original ZIP path for binary re-extraction

Returns:

  • (Package)



107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
# File 'lib/uniword/docx/package.rb', line 107

# Create a package from extracted ZIP content.
#
# Every part is optional: a part is parsed only when its entry is present in
# +zip_content+. The main document and its relationships are located through
# the officeDocument package relationship first, falling back to the
# conventional word/document.xml and word/_rels/document.xml.rels paths.
#
# @param zip_content [Hash] extracted ZIP files, keyed by entry path
# @param zip_path [String, nil] original ZIP path for binary re-extraction
# @return [Package] the populated package
def self.from_zip_content(zip_content, zip_path = nil)
  package = new

  # Parse Content Types
  if zip_content["[Content_Types].xml"]
    package.content_types = Uniword::ContentTypes::Types.from_xml(
      zip_content["[Content_Types].xml"]
    )
  end

  # Parse Package Relationships
  if zip_content["_rels/.rels"]
    package.package_rels = Ooxml::Relationships::PackageRelationships.from_xml(
      zip_content["_rels/.rels"]
    )
  end

  # Find the main document path from officeDocument relationship
  main_doc_path = find_main_document_path(package.package_rels)
  main_doc_rels_path = find_document_rels_path(main_doc_path)

  # Parse Document Properties
  if zip_content["docProps/core.xml"]
    package.core_properties = Ooxml::CoreProperties.from_xml(
      zip_content["docProps/core.xml"]
    )
  end

  if zip_content["docProps/app.xml"]
    package.app_properties = Ooxml::AppProperties.from_xml(
      zip_content["docProps/app.xml"]
    )
  end

  # Parse Custom Properties
  if zip_content["docProps/custom.xml"]
    package.custom_properties = Ooxml::CustomProperties.from_xml(
      zip_content["docProps/custom.xml"]
    )
  end

  # Parse Custom XML Data items (customXml/item*.xml), ordered by their
  # numeric index; the matching itemProps and rels entries are attached
  # when present.
  custom_xml_files = zip_content.keys.grep(%r{^customXml/item(\d+)\.xml$})
  if custom_xml_files.any?
    package.custom_xml_items = []
    custom_xml_files.sort_by { |f| f[/item(\d+)/, 1].to_i }.each do |item_path|
      index = item_path[/item(\d+)/, 1].to_i
      item = {
        index: index,
        xml_content: zip_content[item_path]
      }

      props_path = "customXml/itemProps#{index}.xml"
      item[:props_xml] = zip_content[props_path] if zip_content[props_path]

      rels_path = "customXml/_rels/item#{index}.xml.rels"
      item[:rels_xml] = zip_content[rels_path] if zip_content[rels_path]

      package.custom_xml_items << item
    end
  end

  # Parse Document Parts - use dynamic path from package relationships
  if main_doc_path && zip_content[main_doc_path]
    package.document = Uniword::Wordprocessingml::DocumentRoot.from_xml(
      zip_content[main_doc_path]
    )
  elsif zip_content["word/document.xml"]
    package.document = Uniword::Wordprocessingml::DocumentRoot.from_xml(
      zip_content["word/document.xml"]
    )
  end

  if zip_content["word/styles.xml"]
    package.styles = Uniword::Wordprocessingml::StylesConfiguration.from_xml(
      zip_content["word/styles.xml"]
    )
  end

  if zip_content["word/numbering.xml"]
    package.numbering = Uniword::Wordprocessingml::NumberingConfiguration.from_xml(
      zip_content["word/numbering.xml"]
    )
  end

  if zip_content["word/settings.xml"]
    package.settings = Uniword::Wordprocessingml::Settings.from_xml(
      zip_content["word/settings.xml"]
    )
  end

  if zip_content["word/_rels/settings.xml.rels"]
    package.settings_rels =
      Ooxml::Relationships::PackageRelationships.from_xml(
        zip_content["word/_rels/settings.xml.rels"]
      )
  end

  if zip_content["word/fontTable.xml"]
    package.font_table = Uniword::Wordprocessingml::FontTable.from_xml(
      zip_content["word/fontTable.xml"]
    )
  end

  if zip_content["word/webSettings.xml"]
    package.web_settings = Uniword::Wordprocessingml::WebSettings.from_xml(
      zip_content["word/webSettings.xml"]
    )
  end

  # Parse document relationships - use dynamic path based on main document
  if main_doc_rels_path && zip_content[main_doc_rels_path]
    package.document_rels = Ooxml::Relationships::PackageRelationships.from_xml(
      zip_content[main_doc_rels_path]
    )
  elsif zip_content["word/_rels/document.xml.rels"]
    package.document_rels = Ooxml::Relationships::PackageRelationships.from_xml(
      zip_content["word/_rels/document.xml.rels"]
    )
  end

  # Parse Theme
  if zip_content["word/theme/theme1.xml"]
    package.theme = Drawingml::Theme.from_xml(
      zip_content["word/theme/theme1.xml"]
    )

    theme_media = extract_theme_media(zip_content)
    package.theme.media_files = theme_media if theme_media.any?
  end

  if zip_content["word/theme/_rels/theme1.xml.rels"]
    package.theme_rels = Ooxml::Relationships::PackageRelationships.from_xml(
      zip_content["word/theme/_rels/theme1.xml.rels"]
    )
  end

  # Parse Footnotes
  if zip_content["word/footnotes.xml"]
    package.footnotes = Uniword::Wordprocessingml::Footnotes.from_xml(
      zip_content["word/footnotes.xml"]
    )
  end

  # Parse Endnotes
  if zip_content["word/endnotes.xml"]
    package.endnotes = Uniword::Wordprocessingml::Endnotes.from_xml(
      zip_content["word/endnotes.xml"]
    )
  end

  # Parse Header and Footer parts
  extract_header_footer_parts(zip_content, package)

  # Parse Chart parts. Charts are stored on the document, so this needs both
  # a parsed document and its relationships; without the document guard an
  # archive that has chart entries but no main document would raise
  # NoMethodError on nil.
  chart_files = zip_content.keys.grep(%r{^word/charts/chart\d+\.xml$})
  if chart_files.any? && package.document && package.document_rels
    package.document.chart_parts ||= {}
    chart_files.each do |chart_path|
      chart_target = chart_path.sub("word/", "")
      rel = package.document_rels.relationships.find do |r|
        r.target == chart_target &&
          r.type.to_s.include?("officeDocument/2006/relationships/chart")
      end
      next unless rel

      package.document.chart_parts[rel.id] = {
        xml: zip_content[chart_path],
        target: chart_target
      }
    end
  end

  # Extract image parts from word/media/ directory
  extract_image_parts(zip_content, package, zip_path)

  # Extract OLE/embedded object binaries from word/embeddings/
  embedding_files = zip_content.keys.grep(%r{^word/embeddings/.+$})
  if embedding_files.any?
    package.embeddings = {}
    embedding_files.each do |emb_path|
      target = emb_path.sub("word/", "")
      package.embeddings[target] = zip_content[emb_path]
    end
  end

  package
end

.read_binary_from_zip(zip_path, entry_path) ⇒ Object

Read binary data directly from ZIP file without UTF-8 encoding



384
385
386
387
388
389
390
391
392
# File 'lib/uniword/docx/package.rb', line 384

# Read the raw bytes of one entry directly from the ZIP file, bypassing the
# already-extracted (and possibly re-encoded) content.
#
# @param zip_path [String] path to the ZIP/DOCX file on disk
# @param entry_path [String] entry name inside the archive
# @return [String, nil] entry bytes, or nil when the entry does not exist
def self.read_binary_from_zip(zip_path, entry_path)
  require "zip" # loaded lazily, only when binary re-extraction is requested
  Zip::File.open(zip_path) do |zip_file|
    entry = zip_file.find_entry(entry_path)
    return nil unless entry

    # Without a block, get_input_stream hands back a stream that nothing
    # closes; release it explicitly so repeated calls (one per media file)
    # do not pile up open handles until GC.
    stream = entry.get_input_stream
    begin
      stream.read
    ensure
      stream.close if stream.respond_to?(:close)
    end
  end
end

.to_file(document, path, profile: nil) ⇒ Object

Save document to file (class method for DocumentWriter compatibility)



395
396
397
398
399
400
401
402
403
404
405
406
407
# File 'lib/uniword/docx/package.rb', line 395

# Save a document to a file (class-level entry point kept for DocumentWriter
# compatibility).
#
# Wraps +document+ in a new package, copies its parts across, fills every
# part that is still unset with its minimal/default value, then delegates to
# the instance-level #to_file.
#
# @param document [Object] document to wrap and save
# @param path [String] destination file path
# @param profile [Object, nil] profile to use; Profile.defaults when nil
# @return [Object] result of the instance-level #to_file
def self.to_file(document, path, profile: nil)
  new.tap do |package|
    package.document = document
    package.profile = profile || Profile.defaults
    copy_document_parts_to_package(document, package)

    package.content_types = minimal_content_types unless package.content_types
    package.package_rels = minimal_package_rels unless package.package_rels
    package.document_rels = minimal_document_rels unless package.document_rels

    package.settings = Uniword::Wordprocessingml::Settings.new unless package.settings
    package.font_table = Uniword::Wordprocessingml::FontTable.new unless package.font_table
    package.web_settings = Uniword::Wordprocessingml::WebSettings.new unless package.web_settings
  end.to_file(path)
end

Instance Method Details

#body ⇒ Object



464
465
466
# File 'lib/uniword/docx/package.rb', line 464

# Delegates to the document's body.
#
# @return [Object, nil] +document.body+, or nil when there is no document
def body
  doc = document
  doc.nil? ? nil : doc.body
end

#charts ⇒ Object



478
479
480
# File 'lib/uniword/docx/package.rb', line 478

# Delegates to the document's charts.
#
# @return [Object] +document.charts+, or an empty array when there is no
#   document or the document reports no charts
def charts
  doc = document
  return [] if doc.nil?

  doc.charts || []
end

#each_paragraph ⇒ Object



472
473
474
# File 'lib/uniword/docx/package.rb', line 472

# Iterates over #paragraphs, forwarding the given block to +each+.
#
# @yield [paragraph] each element of #paragraphs
# @return [Object] whatever +paragraphs.each+ returns for the given block
def each_paragraph(&block)
  paragraphs.each(&block)
end

#paragraphs ⇒ Object

Delegate common DocumentRoot methods for API compatibility



456
457
458
# File 'lib/uniword/docx/package.rb', line 456

# Delegates to the document's paragraphs (kept for DocumentRoot API
# compatibility).
#
# @return [Object] +document.paragraphs+, or an empty array when there is no
#   document or the document reports no paragraphs
def paragraphs
  found = document&.paragraphs
  return found if found

  []
end

#styles_configuration ⇒ Object



482
483
484
# File 'lib/uniword/docx/package.rb', line 482

# Delegates to the document's styles configuration.
#
# @return [Object, nil] +document.styles_configuration+, or nil when there is
#   no document
def styles_configuration
  doc = document
  return nil if doc.nil?

  doc.styles_configuration
end

#tables ⇒ Object



460
461
462
# File 'lib/uniword/docx/package.rb', line 460

# Delegates to the document's tables.
#
# @return [Object] +document.tables+, or an empty array when there is no
#   document or the document reports no tables
def tables
  doc = document
  found = doc.nil? ? nil : doc.tables
  found || []
end

#text ⇒ Object



468
469
470
# File 'lib/uniword/docx/package.rb', line 468

# Delegates to the document's text.
#
# @return [Object] +document.text+, or an empty string when there is no
#   document or the document reports no text
def text
  doc = document
  return "" if doc.nil?

  doc.text || ""
end

#to_file(path) ⇒ Object Also known as: save

Save package to file



428
429
430
431
432
# File 'lib/uniword/docx/package.rb', line 428

# Save the package to a file: builds the ZIP content hash first, then hands
# it to a new Infrastructure::ZipPackager together with the destination.
#
# @param path [String] destination file path
# @return [Object] whatever the packager's +package+ call returns
def to_file(path)
  entries = to_zip_content
  Infrastructure::ZipPackager.new.package(entries, path)
end

#to_zip_content ⇒ Object

Generate ZIP content hash



435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
# File 'lib/uniword/docx/package.rb', line 435

# Generate the ZIP content hash for this package.
#
# Any of content types, package/document relationships, settings, font table
# and web settings that is still unset is filled with its minimal/default
# value first. The package is then reconciled against the active profile
# (Profile.defaults when none is set) before relationships are injected and
# the parts are serialized into the hash.
#
# @return [Hash] the hash populated by inject_part_relationships and
#   serialize_package_parts
def to_zip_content
  self.content_types = self.class.minimal_content_types unless content_types
  self.package_rels = self.class.minimal_package_rels unless package_rels
  self.document_rels = self.class.minimal_document_rels unless document_rels

  self.settings = Uniword::Wordprocessingml::Settings.new unless settings
  self.font_table = Uniword::Wordprocessingml::FontTable.new unless font_table
  self.web_settings = Uniword::Wordprocessingml::WebSettings.new unless web_settings

  Reconciler.new(self, profile: profile || Profile.defaults).reconcile

  {}.tap do |entries|
    inject_part_relationships(entries, content_types, package_rels, document_rels)
    serialize_package_parts(entries, content_types, package_rels, document_rels)
  end
end