Class: Uniword::Docx::Package

Inherits:
Lutaml::Model::Serializable
  • Object
show all
Includes:
PackageDefaults, PackageSerialization
Defined in:
lib/uniword/docx/package.rb

Overview

DOCX Package - Complete DOCX file format model

Represents the entire .docx file structure as a lutaml-model object. Each XML file within the ZIP is a separate lutaml-model class.

A DOCX package CONTAINS OOXML markup wrapped in an OPC ZIP container. This class lives in Uniword::Docx, not Uniword::Ooxml, because DOCX is a file format that uses OOXML, not the other way around.

Examples:

Load DOCX

package = Package.from_file('document.docx')
package.core_properties.title = 'New Title'
package.to_file('output.docx')

Access document content

package = Package.from_file('document.docx')
package.document.body.paragraphs.each { |p| puts p.text }

Constant Summary

Constants included from PackageDefaults

Uniword::Docx::PackageDefaults::DOCUMENT_TO_PACKAGE_MAPPINGS

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Methods included from PackageSerialization

#inject_part_relationships, #serialize_package_parts

Methods included from PackageDefaults

included

Instance Attribute Details

#bibliography_sources ⇒ Object

Non-serialized attributes (DOCX packaging helpers)



89
90
91
# File 'lib/uniword/docx/package.rb', line 89

# Reader for the non-serialized +@bibliography_sources+ packaging helper.
#
# @return [Object, nil] the current value of +@bibliography_sources+
def bibliography_sources = @bibliography_sources

#chart_parts ⇒ Object

Non-serialized attributes (DOCX packaging helpers)



89
90
91
# File 'lib/uniword/docx/package.rb', line 89

# Reader for the non-serialized +@chart_parts+ packaging helper.
#
# @return [Object, nil] the current value of +@chart_parts+
def chart_parts = @chart_parts

#custom_xml_items ⇒ Object

Custom XML data items (customXml/item*.xml)



50
51
52
# File 'lib/uniword/docx/package.rb', line 50

# Reader for the custom XML data items (customXml/item*.xml) held in
# +@custom_xml_items+.
#
# @return [Object, nil] the current value of +@custom_xml_items+
def custom_xml_items = @custom_xml_items

#profile ⇒ Object

Non-serialized attributes (DOCX packaging helpers)



89
90
91
# File 'lib/uniword/docx/package.rb', line 89

# Reader for the non-serialized +@profile+ packaging helper.
#
# @return [Object, nil] the current value of +@profile+
def profile = @profile

Class Method Details

.extract_header_footer_parts(zip_content, package) ⇒ Object



278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
# File 'lib/uniword/docx/package.rb', line 278

# Attach header and footer parts found in the archive to the document.
#
# Entries named word/headerN.xml and word/footerN.xml are matched against
# the document relationships by target and relationship type. Each match
# is appended to +package.document.header_footer_parts+ as a Hash with
# :r_id, :target, :rel_type, :content_type and the parsed :content.
# Entries without a matching relationship are skipped. Headers are
# processed before footers, each group in sorted path order.
#
# @param zip_content [Hash] entry path => entry content
# @param package [Package] package whose document receives the parts
# @return [Object] nil when nothing was processed
def self.extract_header_footer_parts(zip_content, package)
  return if !package.document || !package.document_rels

  entry_names = zip_content.keys
  header_entries = entry_names.grep(%r{^word/header\d+\.xml$})
  footer_entries = entry_names.grep(%r{^word/footer\d+\.xml$})
  return if header_entries.empty? && footer_entries.empty?

  package.document.header_footer_parts ||= []

  # Shared collector: the header and footer passes differ only in the
  # relationship marker, the content type and the parser.
  append_parts = lambda do |entry_paths, rel_marker, mime, parser|
    entry_paths.sort.each do |entry_path|
      part_target = entry_path.sub("word/", "")
      relationship = package.document_rels.relationships.find do |candidate|
        candidate.target == part_target &&
          candidate.type.to_s.include?(rel_marker)
      end
      next unless relationship

      package.document.header_footer_parts << {
        r_id: relationship.id,
        target: part_target,
        rel_type: relationship.type,
        content_type: mime,
        content: parser.call(zip_content[entry_path]),
      }
    end
  end

  append_parts.call(
    header_entries,
    "officeDocument/2006/relationships/header",
    "application/vnd.openxmlformats-officedocument.wordprocessingml.header+xml",
    ->(xml) { Uniword::Wordprocessingml::Header.from_xml(xml) }
  )
  append_parts.call(
    footer_entries,
    "officeDocument/2006/relationships/footer",
    "application/vnd.openxmlformats-officedocument.wordprocessingml.footer+xml",
    ->(xml) { Uniword::Wordprocessingml::Footer.from_xml(xml) }
  )
end

.extract_image_parts(zip_content, package, zip_path = nil) ⇒ Object

Extract image files from word/media/ directory in DOCX

Parameters:

  • zip_content (Hash)

    Extracted ZIP content (may have corrupted binary)

  • package (Package)

    Package to populate

  • zip_path (String, nil) (defaults to: nil)

    Original ZIP path for binary re-extraction



328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
# File 'lib/uniword/docx/package.rb', line 328

# Register every entry under word/media/ as an image part of the document.
#
# Each image is stored in +package.document.image_parts+ under a key of
# the form "rIdImg<N>", where N is the current number of image parts plus
# one. The content type is derived from the lower-cased file extension.
#
# @param zip_content [Hash] entry path => entry content
# @param package [Package] package whose document receives the images
# @param zip_path [String, nil] when given, image bytes are re-read from
#   this archive via read_binary_from_zip instead of taken from zip_content
# @return [Object] nil when there is no document or no media entry
def self.extract_image_parts(zip_content, package, zip_path = nil)
  return unless package.document

  media_entries = zip_content.keys.grep(%r{^word/media/.+$})
  return if media_entries.empty?

  package.document.image_parts ||= {}

  # Extensions whose content type is not simply "image/<ext>".
  known_content_types = {
    "jpg" => "image/jpeg",
    "jpeg" => "image/jpeg",
    "png" => "image/png",
    "gif" => "image/gif",
    "bmp" => "image/bmp",
    "tiff" => "image/tiff",
    "tif" => "image/tiff",
    "svg" => "image/svg+xml"
  }

  media_entries.each do |entry_path|
    file_name = File.basename(entry_path)
    extension = File.extname(file_name).delete(".").downcase
    mime = known_content_types.fetch(extension) { "image/#{extension}" }

    relationship_id = "rIdImg#{package.document.image_parts.size + 1}"

    payload = zip_path ? read_binary_from_zip(zip_path, entry_path) : zip_content[entry_path]

    package.document.image_parts[relationship_id] = {
      data: payload,
      target: "media/#{file_name}",
      content_type: mime
    }
  end
end

.extract_theme_media(zip_content) ⇒ Object

Extract media files from word/theme/media/ directory



392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
# File 'lib/uniword/docx/package.rb', line 392

# Build MediaFile objects for every entry under word/theme/media/.
#
# @param zip_content [Hash] entry path => entry content
# @return [Hash] file name (path below word/theme/media/) =>
#   Uniword::Themes::MediaFile; empty when no such entry exists
def self.extract_theme_media(zip_content)
  theme_media_pattern = %r{^word/theme/media/(.+)$}

  zip_content.each_key.each_with_object({}) do |entry_path, collected|
    matched = entry_path.match(theme_media_pattern)
    next unless matched

    media_name = matched[1]
    collected[media_name] = Uniword::Themes::MediaFile.new(
      filename: media_name,
      content: zip_content[entry_path],
      source_path: entry_path
    )
  end
end

.find_document_rels_path(doc_path) ⇒ Object

Find the document relationships path from the main document path



483
484
485
486
487
488
489
# File 'lib/uniword/docx/package.rb', line 483

# Derive the relationships part path for a main document path.
#
# The result is "<dir>/_rels/<basename>.rels", e.g.
# "word/document.xml" => "word/_rels/document.xml.rels".
#
# @param doc_path [String, nil] path of the main document inside the ZIP
# @return [String, nil] relationships path, or nil when doc_path is nil
def self.find_document_rels_path(doc_path)
  return nil unless doc_path

  directory = File.dirname(doc_path)
  rels_name = "#{File.basename(doc_path)}.rels"

  # File.dirname returns "." for a document at the package root. Joining
  # that would produce "./_rels/...", which can never equal a ZIP entry
  # key, so the relationships lookup would silently miss.
  return File.join("_rels", rels_name) if directory == "."

  File.join(directory, "_rels", rels_name)
end

.find_main_document_path(package_rels) ⇒ Object

Find the main document path from package relationships



469
470
471
472
473
474
475
476
477
478
479
480
# File 'lib/uniword/docx/package.rb', line 469

# Locate the main document part via the package relationships.
#
# Picks the first relationship whose type contains
# "officeDocument/2006/relationships/officeDocument" and returns its
# target with one leading "/" removed, so it can be used as a ZIP entry
# key. The relationship's own target string is not modified.
#
# @param package_rels [#relationships, nil] parsed package relationships
# @return [String, nil] main document path, or nil when there are no
#   relationships, no officeDocument relationship, or no target
def self.find_main_document_path(package_rels)
  relationships = package_rels&.relationships
  return nil unless relationships

  main_rel = relationships.find do |candidate|
    candidate.type.to_s.include?("officeDocument/2006/relationships/officeDocument")
  end

  target = main_rel&.target
  return nil unless target

  # delete_prefix only touches the very start of the string; the former
  # %r{^/} pattern was line-anchored and could strip a "/" that follows an
  # embedded newline instead.
  target.delete_prefix("/")
end

.from_file(path) ⇒ Package

Load DOCX package from file

Parameters:

  • path (String)

    Path to .docx file

Returns:

  • (Package)

    Package with all parts loaded



95
96
97
98
99
# File 'lib/uniword/docx/package.rb', line 95

# Load a DOCX package from a file on disk.
#
# Extracts the archive with Infrastructure::ZipExtractor and hands the
# extracted entries, together with the original path, to from_zip_content.
#
# @param path [String] path to the .docx file
# @return [Package] whatever from_zip_content builds from the entries
def self.from_file(path)
  Infrastructure::ZipExtractor.new.extract(path).then do |entries|
    from_zip_content(entries, path)
  end
end

.from_zip_content(zip_content, zip_path = nil) ⇒ Package

Create package from extracted ZIP content

Parameters:

  • zip_content (Hash)

    Extracted ZIP files

  • zip_path (String, nil) (defaults to: nil)

    Original ZIP path for binary re-extraction

Returns:

  • (Package)

    Package populated from the extracted ZIP content



106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
# File 'lib/uniword/docx/package.rb', line 106

# Build a package from the already-extracted entries of a DOCX archive.
#
# Every known part present in +zip_content+ is parsed with its model class
# and assigned to the matching package attribute; parts that are absent
# are left unset. The main document and its relationships are located via
# the officeDocument relationship in _rels/.rels first, falling back to
# word/document.xml and word/_rels/document.xml.rels.
#
# @param zip_content [Hash] entry path => entry content
# @param zip_path [String, nil] original archive path, forwarded to
#   extract_image_parts for binary re-extraction
# @return [Package] the populated package
def self.from_zip_content(zip_content, zip_path = nil)
  package = new

  # Parse Content Types
  if zip_content["[Content_Types].xml"]
    package.content_types = Uniword::ContentTypes::Types.from_xml(
      zip_content["[Content_Types].xml"]
    )
  end

  # Parse Package Relationships
  if zip_content["_rels/.rels"]
    package.package_rels = Ooxml::Relationships::PackageRelationships.from_xml(
      zip_content["_rels/.rels"]
    )
  end

  # Find the main document path from officeDocument relationship
  main_doc_path = find_main_document_path(package.package_rels)
  main_doc_rels_path = find_document_rels_path(main_doc_path)

  # Parse Document Properties
  if zip_content["docProps/core.xml"]
    package.core_properties = Ooxml::CoreProperties.from_xml(
      zip_content["docProps/core.xml"]
    )
  end

  if zip_content["docProps/app.xml"]
    package.app_properties = Ooxml::AppProperties.from_xml(
      zip_content["docProps/app.xml"]
    )
  end

  # Parse Custom Properties
  if zip_content["docProps/custom.xml"]
    package.custom_properties = Ooxml::CustomProperties.from_xml(
      zip_content["docProps/custom.xml"]
    )
  end

  # Parse Custom XML Data items (customXml/item*.xml). Items are ordered
  # by their numeric index so item10 sorts after item2; the optional
  # itemProps and rels entries are attached as raw XML when present.
  custom_xml_files = zip_content.keys.grep(%r{^customXml/item(\d+)\.xml$})
  if custom_xml_files.any?
    package.custom_xml_items = []
    custom_xml_files.sort_by { |f| f[/item(\d+)/, 1].to_i }.each do |item_path|
      index = item_path[/item(\d+)/, 1].to_i
      item = {
        index: index,
        xml_content: zip_content[item_path]
      }

      props_path = "customXml/itemProps#{index}.xml"
      item[:props_xml] = zip_content[props_path] if zip_content[props_path]

      rels_path = "customXml/_rels/item#{index}.xml.rels"
      item[:rels_xml] = zip_content[rels_path] if zip_content[rels_path]

      package.custom_xml_items << item
    end
  end

  # Parse Document Parts - use dynamic path from package relationships
  if main_doc_path && zip_content[main_doc_path]
    package.document = Uniword::Wordprocessingml::DocumentRoot.from_xml(
      zip_content[main_doc_path]
    )
  elsif zip_content["word/document.xml"]
    package.document = Uniword::Wordprocessingml::DocumentRoot.from_xml(
      zip_content["word/document.xml"]
    )
  end

  if zip_content["word/styles.xml"]
    package.styles = Uniword::Wordprocessingml::StylesConfiguration.from_xml(
      zip_content["word/styles.xml"]
    )
  end

  if zip_content["word/numbering.xml"]
    package.numbering = Uniword::Wordprocessingml::NumberingConfiguration.from_xml(
      zip_content["word/numbering.xml"]
    )
  end

  if zip_content["word/settings.xml"]
    package.settings = Uniword::Wordprocessingml::Settings.from_xml(
      zip_content["word/settings.xml"]
    )
  end

  if zip_content["word/fontTable.xml"]
    package.font_table = Uniword::Wordprocessingml::FontTable.from_xml(
      zip_content["word/fontTable.xml"]
    )
  end

  if zip_content["word/webSettings.xml"]
    package.web_settings = Uniword::Wordprocessingml::WebSettings.from_xml(
      zip_content["word/webSettings.xml"]
    )
  end

  # Parse document relationships - use dynamic path based on main document
  if main_doc_rels_path && zip_content[main_doc_rels_path]
    package.document_rels = Ooxml::Relationships::PackageRelationships.from_xml(
      zip_content[main_doc_rels_path]
    )
  elsif zip_content["word/_rels/document.xml.rels"]
    package.document_rels = Ooxml::Relationships::PackageRelationships.from_xml(
      zip_content["word/_rels/document.xml.rels"]
    )
  end

  # Parse Theme (and any media stored next to it)
  if zip_content["word/theme/theme1.xml"]
    package.theme = Drawingml::Theme.from_xml(
      zip_content["word/theme/theme1.xml"]
    )

    theme_media = extract_theme_media(zip_content)
    package.theme.media_files = theme_media if theme_media.any?
  end

  if zip_content["word/theme/_rels/theme1.xml.rels"]
    package.theme_rels = Ooxml::Relationships::PackageRelationships.from_xml(
      zip_content["word/theme/_rels/theme1.xml.rels"]
    )
  end

  # Parse Footnotes
  if zip_content["word/footnotes.xml"]
    package.footnotes = Uniword::Wordprocessingml::Footnotes.from_xml(
      zip_content["word/footnotes.xml"]
    )
  end

  # Parse Endnotes
  if zip_content["word/endnotes.xml"]
    package.endnotes = Uniword::Wordprocessingml::Endnotes.from_xml(
      zip_content["word/endnotes.xml"]
    )
  end

  # Parse Header and Footer parts
  extract_header_footer_parts(zip_content, package)

  # Parse Chart parts. Charts are stored on the document, so both the
  # document and its relationships must be loaded. Without the document
  # check, an archive that has chart entries and document relationships
  # but no main document raised NoMethodError on nil.
  chart_files = zip_content.keys.grep(%r{^word/charts/chart\d+\.xml$})
  if chart_files.any? && package.document && package.document_rels
    package.document.chart_parts ||= {}
    chart_files.each do |chart_path|
      chart_target = chart_path.sub("word/", "")
      rel = package.document_rels.relationships.find do |r|
        r.target == chart_target &&
          r.type.to_s.include?("officeDocument/2006/relationships/chart")
      end
      next unless rel

      package.document.chart_parts[rel.id] = {
        xml: zip_content[chart_path],
        target: chart_target
      }
    end
  end

  # Extract image parts from word/media/ directory
  extract_image_parts(zip_content, package, zip_path)

  package
end

.read_binary_from_zip(zip_path, entry_path) ⇒ Object

Read binary data directly from ZIP file without UTF-8 encoding



366
367
368
369
370
371
372
373
374
# File 'lib/uniword/docx/package.rb', line 366

# Read one entry's raw bytes straight from the ZIP archive.
#
# @param zip_path [String] path to the archive on disk
# @param entry_path [String] entry name inside the archive
# @return [String, nil] the entry's content, or nil when the archive has
#   no such entry
def self.read_binary_from_zip(zip_path, entry_path)
  # Loaded on demand so the zip library is only required when binary
  # re-extraction is actually used.
  require "zip"
  Zip::File.open(zip_path) do |zip_file|
    # The block form of get_input_stream closes the entry stream after
    # reading; the previous non-block call left it open.
    zip_file.find_entry(entry_path)&.get_input_stream(&:read)
  end
end

.to_file(document, path, profile: nil) ⇒ Object

Save document to file (class method for DocumentWriter compatibility)



377
378
379
380
381
382
383
384
385
386
387
388
389
# File 'lib/uniword/docx/package.rb', line 377

# Wrap a document in a fresh package and write it to +path+.
#
# The package gets the given profile (or Profile.defaults), the parts
# copied over by copy_document_parts_to_package, and defaults for any of
# content types, package/document relationships, settings, font table and
# web settings that are still unset afterwards.
#
# @param document [Object] document to store in the package
# @param path [String] destination file path
# @param profile [Object, nil] profile to use; Profile.defaults when nil
# @return [Object] the result of the instance-level #to_file
def self.to_file(document, path, profile: nil)
  prepared = new.tap do |pkg|
    pkg.document = document
    pkg.profile = profile || Profile.defaults
    copy_document_parts_to_package(document, pkg)

    pkg.content_types ||= minimal_content_types
    pkg.package_rels ||= minimal_package_rels
    pkg.document_rels ||= minimal_document_rels

    pkg.settings ||= Uniword::Wordprocessingml::Settings.new
    pkg.font_table ||= Uniword::Wordprocessingml::FontTable.new
    pkg.web_settings ||= Uniword::Wordprocessingml::WebSettings.new
  end

  prepared.to_file(path)
end

Instance Method Details

#body ⇒ Object



446
447
448
# File 'lib/uniword/docx/package.rb', line 446

# Delegates to the document's body.
#
# @return [Object, nil] +document.body+, or nil when no document is set
def body = document&.body

#charts ⇒ Object



460
461
462
# File 'lib/uniword/docx/package.rb', line 460

# Delegates to the document's charts.
#
# @return [Object] +document.charts+, or an empty Array when there is no
#   document or the document reports no charts
def charts
  from_document = document&.charts
  from_document || []
end

#each_paragraph ⇒ Object



454
455
456
# File 'lib/uniword/docx/package.rb', line 454

# Iterates over #paragraphs, forwarding the given block to +each+.
#
# @yield [paragraph] each element of #paragraphs
# @return [Object] whatever +paragraphs.each+ returns
def each_paragraph(&block) = paragraphs.each(&block)

#paragraphs ⇒ Object

Delegate common DocumentRoot methods for API compatibility



438
439
440
# File 'lib/uniword/docx/package.rb', line 438

# Delegates to the document's paragraphs.
#
# @return [Object] +document.paragraphs+, or an empty Array when there is
#   no document or the document reports no paragraphs
def paragraphs
  from_document = document&.paragraphs
  from_document || []
end

#styles_configuration ⇒ Object



464
465
466
# File 'lib/uniword/docx/package.rb', line 464

# Delegates to the document's styles configuration.
#
# @return [Object, nil] +document.styles_configuration+, or nil when no
#   document is set
def styles_configuration = document&.styles_configuration

#tables ⇒ Object



442
443
444
# File 'lib/uniword/docx/package.rb', line 442

# Delegates to the document's tables.
#
# @return [Object] +document.tables+, or an empty Array when there is no
#   document or the document reports no tables
def tables
  from_document = document&.tables
  from_document || []
end

#text ⇒ Object



450
451
452
# File 'lib/uniword/docx/package.rb', line 450

# Delegates to the document's text.
#
# @return [Object] +document.text+, or an empty String when there is no
#   document or the document reports no text
def text
  from_document = document&.text
  from_document || ""
end

#to_file(path) ⇒ Object Also known as: save

Save package to file



410
411
412
413
414
# File 'lib/uniword/docx/package.rb', line 410

# Write this package to +path+.
#
# Builds the ZIP content hash with #to_zip_content, then hands it to an
# Infrastructure::ZipPackager together with the destination path.
#
# @param path [String] destination file path
# @return [Object] whatever ZipPackager#package returns
def to_file(path)
  to_zip_content.then do |entries|
    Infrastructure::ZipPackager.new.package(entries, path)
  end
end

#to_zip_content ⇒ Object

Generate ZIP content hash



417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
# File 'lib/uniword/docx/package.rb', line 417

# Build the content Hash that #to_file passes to the ZIP packager.
#
# Side effect: any of content_types, package_rels, document_rels,
# settings, font_table and web_settings that are still unset are assigned
# defaults on this package before serialization.
#
# @return [Hash] the hash handed to inject_part_relationships and
#   serialize_package_parts (presumably entry path => content; confirm
#   against PackageSerialization)
def to_zip_content
  content = {}

  # Structural parts: fall back to the class-level minimal versions.
  self.content_types ||= self.class.minimal_content_types
  self.package_rels ||= self.class.minimal_package_rels
  self.document_rels ||= self.class.minimal_document_rels

  # Word parts: fall back to freshly constructed, empty models.
  self.settings ||= Uniword::Wordprocessingml::Settings.new
  self.font_table ||= Uniword::Wordprocessingml::FontTable.new
  self.web_settings ||= Uniword::Wordprocessingml::WebSettings.new

  # Runs after the defaults above exist and before anything is serialized.
  Reconciler.new(self, profile: profile || Profile.defaults).reconcile

  # Both helpers come from PackageSerialization and receive the same hash.
  inject_part_relationships(content, content_types, package_rels, document_rels)
  serialize_package_parts(content, content_types, package_rels, document_rels)

  content
end