Class: Uniword::Docx::Package

Inherits:

Lutaml::Model::Serializable

Object
Lutaml::Model::Serializable
Uniword::Docx::Package

show all

Includes: PackageDefaults, PackageSerialization

Defined in: lib/uniword/docx/package.rb

Overview

DOCX Package - Complete DOCX file format model

Represents the entire .docx file structure as a lutaml-model object. Each XML file within the ZIP is a separate lutaml-model class.

A DOCX package CONTAINS OOXML markup wrapped in an OPC ZIP container. This class lives in Uniword::Docx, not Uniword::Ooxml, because DOCX is a file format that uses OOXML, not the other way around.

Examples:

Load DOCX

package = Package.from_file('document.docx')
package.core_properties.title = 'New Title'
package.to_file('output.docx')

Access document content

package = Package.from_file('document.docx')
package.document.body.paragraphs.each { |p| puts p.text }

Constant Summary

Constants included from PackageDefaults

Uniword::Docx::PackageDefaults::DOCUMENT_TO_PACKAGE_MAPPINGS

Instance Attribute Summary collapse

#bibliography_sources ⇒ Object

Non-serialized attributes (DOCX packaging helpers).
#chart_parts ⇒ Object

Non-serialized attributes (DOCX packaging helpers).
#custom_xml_items ⇒ Object

Custom XML data items (customXml/item*.xml).
#profile ⇒ Object

Non-serialized attributes (DOCX packaging helpers).

Class Method Summary collapse

.extract_header_footer_parts(zip_content, package) ⇒ Object
.extract_image_parts(zip_content, package, zip_path = nil) ⇒ Object

Extract image files from word/media/ directory in DOCX.
.extract_theme_media(zip_content) ⇒ Object

Extract media files from word/theme/media/ directory.
.find_document_rels_path(doc_path) ⇒ Object

Find the document relationships path from the main document path.
.find_main_document_path(package_rels) ⇒ Object

Find the main document path from package relationships.
.from_file(path) ⇒ Package

Load DOCX package from file.
.from_zip_content(zip_content, zip_path = nil) ⇒ Package

Create package from extracted ZIP content.
.read_binary_from_zip(zip_path, entry_path) ⇒ Object

Read binary data directly from ZIP file without UTF-8 encoding.
.to_file(document, path, profile: nil) ⇒ Object

Save document to file (class method for DocumentWriter compatibility).

Instance Method Summary collapse

#body ⇒ Object
#charts ⇒ Object
#each_paragraph ⇒ Object
#paragraphs ⇒ Object

Delegate common DocumentRoot methods for API compatibility.
#styles_configuration ⇒ Object
#tables ⇒ Object
#text ⇒ Object
#to_file(path) ⇒ Object (also: #save)

Save package to file.
#to_zip_content ⇒ Object

Generate ZIP content hash.

Methods included from PackageSerialization

#inject_part_relationships, #serialize_package_parts

Methods included from PackageDefaults

included

Instance Attribute Details

#bibliography_sources ⇒ `Object`

Non-serialized attributes (DOCX packaging helpers)



89
90
91

# File 'lib/uniword/docx/package.rb', line 89

# Reader for the bibliography sources attached to this package.
#
# @return [Object] the current value of @bibliography_sources
#   (nil until it has been assigned)
def bibliography_sources = @bibliography_sources

#chart_parts ⇒ `Object`

Non-serialized attributes (DOCX packaging helpers)



89
90
91

# File 'lib/uniword/docx/package.rb', line 89

# Reader for the chart parts attached to this package.
#
# @return [Object] the current value of @chart_parts
#   (nil until it has been assigned)
def chart_parts = @chart_parts

#custom_xml_items ⇒ `Object`

Custom XML data items (customXml/item*.xml)



50
51
52

# File 'lib/uniword/docx/package.rb', line 50

# Reader for the custom XML data items (customXml/item*.xml).
#
# @return [Object] the current value of @custom_xml_items
#   (nil until it has been assigned)
def custom_xml_items = @custom_xml_items

#profile ⇒ `Object`

Non-serialized attributes (DOCX packaging helpers)



89
90
91

# File 'lib/uniword/docx/package.rb', line 89

# Reader for the profile attached to this package.
#
# @return [Object] the current value of @profile
#   (nil until it has been assigned)
def profile = @profile

Class Method Details

.extract_header_footer_parts(zip_content, package) ⇒ `Object`

# File 'lib/uniword/docx/package.rb', line 278

# Collect header and footer parts from the archive onto the document.
#
# For every word/headerN.xml and word/footerN.xml entry that has a matching
# document relationship (same target, relationship type containing the
# header/footer marker), a hash describing the part is appended to
# package.document.header_footer_parts. Headers are appended first, then
# footers, each group in sorted path order. Entries without a matching
# relationship are skipped.
#
# Does nothing when the package has no document or no document
# relationships, or when the archive holds no header/footer entries.
#
# @param zip_content [Hash] extracted ZIP entries keyed by entry path
# @param package [Package] package whose document receives the parts
# @return [Object] unspecified; callers use the side effect only
def self.extract_header_footer_parts(zip_content, package)
  return unless package.document && package.document_rels

  header_files = zip_content.keys.grep(%r{^word/header\d+\.xml$})
  footer_files = zip_content.keys.grep(%r{^word/footer\d+\.xml$})

  return if header_files.empty? && footer_files.empty?

  package.document.header_footer_parts ||= []

  # Headers and footers differ only in the kind name and the model class,
  # so one routine handles both. The parser is passed as a lambda so the
  # model constant is resolved only when a part is actually parsed.
  collect = lambda do |paths, kind, parse|
    paths.sort.each do |path|
      target = path.sub("word/", "")
      rel = package.document_rels.relationships.find do |r|
        r.target == target &&
          r.type.to_s.include?("officeDocument/2006/relationships/#{kind}")
      end
      next unless rel

      package.document.header_footer_parts << {
        r_id: rel.id,
        target: target,
        rel_type: rel.type,
        content_type: "application/vnd.openxmlformats-officedocument.wordprocessingml.#{kind}+xml",
        content: parse.call(zip_content[path]),
      }
    end
  end

  collect.call(header_files, "header",
               ->(xml) { Uniword::Wordprocessingml::Header.from_xml(xml) })
  collect.call(footer_files, "footer",
               ->(xml) { Uniword::Wordprocessingml::Footer.from_xml(xml) })
end

.extract_image_parts(zip_content, package, zip_path = nil) ⇒ `Object`

Extract image files from word/media/ directory in DOCX

Parameters:

zip_content (Hash) —

Extracted ZIP content (may have corrupted binary)
package (Package) —

Package to populate
zip_path (String, nil) (defaults to: nil) —

Original ZIP path for binary re-extraction

# File 'lib/uniword/docx/package.rb', line 328

# Register every file under word/media/ as an image part of the document.
#
# Each media entry is stored in package.document.image_parts under a
# generated key "rIdImg<N>", where N is the current number of image parts
# plus one. The stored hash carries the data, the target path relative to
# word/ ("media/<filename>") and a content type derived from the
# lower-cased file extension.
#
# @param zip_content [Hash] extracted ZIP entries keyed by entry path
# @param package [Package] package whose document receives the parts
# @param zip_path [String, nil] when given, data is re-read from this ZIP
#   through read_binary_from_zip instead of being taken from zip_content
# @return [Object] unspecified; callers use the side effect only
def self.extract_image_parts(zip_content, package, zip_path = nil)
  doc = package.document
  return unless doc

  media_entries = zip_content.keys.grep(%r{^word/media/.+$})
  return if media_entries.empty?

  # Extensions whose content type is not simply "image/<extension>".
  known_types = {
    "jpg" => "image/jpeg",
    "jpeg" => "image/jpeg",
    "png" => "image/png",
    "gif" => "image/gif",
    "bmp" => "image/bmp",
    "tiff" => "image/tiff",
    "tif" => "image/tiff",
    "svg" => "image/svg+xml"
  }

  doc.image_parts ||= {}

  media_entries.each do |entry|
    name = File.basename(entry)
    suffix = File.extname(name).delete(".").downcase
    mime = known_types.fetch(suffix) { "image/#{suffix}" }

    part_id = "rIdImg#{doc.image_parts.size + 1}"
    payload = zip_path ? read_binary_from_zip(zip_path, entry) : zip_content[entry]

    doc.image_parts[part_id] = {
      data: payload,
      target: "media/#{name}",
      content_type: mime
    }
  end
end

.extract_theme_media(zip_content) ⇒ `Object`

Extract media files from word/theme/media/ directory

# File 'lib/uniword/docx/package.rb', line 392

# Gather the media files stored under word/theme/media/.
#
# @param zip_content [Hash] extracted ZIP entries keyed by entry path
# @return [Hash] Uniword::Themes::MediaFile objects keyed by the path
#   remainder after "word/theme/media/"; empty when there are none
def self.extract_theme_media(zip_content)
  pattern = %r{^word/theme/media/(.+)$}

  zip_content.each_key.with_object({}) do |entry, found|
    hit = pattern.match(entry)
    next unless hit

    name = hit[1]
    found[name] = Uniword::Themes::MediaFile.new(
      filename: name,
      content: zip_content[entry],
      source_path: entry
    )
  end
end

.find_document_rels_path(doc_path) ⇒ `Object`

Find the document relationships path from the main document path

# File 'lib/uniword/docx/package.rb', line 483

# Derive the relationships-part path that belongs to a document path.
#
# The result is "<dir>/_rels/<basename>.rels", for example
# "word/document.xml" gives "word/_rels/document.xml.rels".
#
# A document at the archive root has no directory component.
# File.split reports that as ".", which must not leak into the result:
# "./_rels/..." would never equal an entry key such as
# "_rels/document.xml.rels".
#
# @param doc_path [String, nil] path of the main document inside the ZIP
# @return [String, nil] path of its relationships part, or nil when
#   doc_path is nil
def self.find_document_rels_path(doc_path)
  return nil unless doc_path

  dir, basename = File.split(doc_path)
  rels_name = "#{basename}.rels"
  return File.join("_rels", rels_name) if dir == "."

  File.join(dir, "_rels", rels_name)
end

.find_main_document_path(package_rels) ⇒ `Object`

Find the main document path from package relationships

# File 'lib/uniword/docx/package.rb', line 469

# Resolve the main document path from the package relationships.
#
# Picks the first relationship whose type contains
# "officeDocument/2006/relationships/officeDocument" and returns its target
# with one leading "/" removed, so it can be used as a ZIP entry key.
#
# @param package_rels [#relationships, nil] package relationships
# @return [String, nil] path of the main document, or nil when there are no
#   relationships, no matching relationship, or the match has no target
def self.find_main_document_path(package_rels)
  candidates = package_rels&.relationships
  return nil unless candidates

  main_rel = candidates.find do |candidate|
    candidate.type.to_s.include?("officeDocument/2006/relationships/officeDocument")
  end

  target = main_rel&.target
  return nil unless target

  # sub returns a new string, so the relationship's own target is untouched.
  target.sub(%r{^/}, "")
end

.from_file(path) ⇒ `Package`

Load DOCX package from file

Parameters:

path (String) —

Path to .docx file

Returns:

(Package) —

Package with all parts loaded

# File 'lib/uniword/docx/package.rb', line 95

# Load a DOCX package from a file on disk.
#
# Extracts the archive with Infrastructure::ZipExtractor and hands the
# entries, together with the original path, to from_zip_content.
#
# @param path [String] path to the .docx file
# @return [Package] whatever from_zip_content builds from the entries
def self.from_file(path)
  entries = Infrastructure::ZipExtractor.new.extract(path)
  from_zip_content(entries, path)
end

.from_zip_content(zip_content, zip_path = nil) ⇒ `Package`

Create package from extracted ZIP content

Parameters:

zip_content (Hash) —

Extracted ZIP files
zip_path (String, nil) (defaults to: nil) —

Original ZIP path for binary re-extraction

Returns:

(Package) —

Package object

# File 'lib/uniword/docx/package.rb', line 106

# Build a package from the entries of an extracted DOCX archive.
#
# Every recognised entry that is present in +zip_content+ is parsed through
# the +from_xml+ of its model class and stored on the new package. Entries
# that are absent leave the corresponding attribute unset.
#
# The main document and its relationships part are located through the
# package relationships first, falling back to the conventional
# word/document.xml and word/_rels/document.xml.rels paths.
#
# @param zip_content [Hash] extracted ZIP entries keyed by entry path
# @param zip_path [String, nil] original ZIP path, passed on to
#   extract_image_parts for binary re-extraction
# @return [Package] the populated package
def self.from_zip_content(zip_content, zip_path = nil)
  package = new

  # Parse Content Types
  if (xml = zip_content["[Content_Types].xml"])
    package.content_types = Uniword::ContentTypes::Types.from_xml(xml)
  end

  # Parse Package Relationships
  if (xml = zip_content["_rels/.rels"])
    package.package_rels =
      Ooxml::Relationships::PackageRelationships.from_xml(xml)
  end

  # Find the main document path from officeDocument relationship
  main_doc_path = find_main_document_path(package.package_rels)
  main_doc_rels_path = find_document_rels_path(main_doc_path)

  # Parse Document Properties
  if (xml = zip_content["docProps/core.xml"])
    package.core_properties = Ooxml::CoreProperties.from_xml(xml)
  end

  if (xml = zip_content["docProps/app.xml"])
    package.app_properties = Ooxml::AppProperties.from_xml(xml)
  end

  # Parse Custom Properties
  if (xml = zip_content["docProps/custom.xml"])
    package.custom_properties = Ooxml::CustomProperties.from_xml(xml)
  end

  # Parse Custom XML Data items (customXml/item*.xml), ordered by their
  # numeric index so that item10 sorts after item2.
  custom_xml_files = zip_content.keys.grep(%r{^customXml/item(\d+)\.xml$})
  if custom_xml_files.any?
    package.custom_xml_items = []
    custom_xml_files.sort_by { |f| f[/item(\d+)/, 1].to_i }.each do |item_path|
      index = item_path[/item(\d+)/, 1].to_i
      item = {
        index: index,
        xml_content: zip_content[item_path]
      }

      # The properties and relationships companions are optional.
      props_path = "customXml/itemProps#{index}.xml"
      item[:props_xml] = zip_content[props_path] if zip_content[props_path]

      rels_path = "customXml/_rels/item#{index}.xml.rels"
      item[:rels_xml] = zip_content[rels_path] if zip_content[rels_path]

      package.custom_xml_items << item
    end
  end

  # Parse Document Parts - use dynamic path from package relationships,
  # falling back to the conventional location.
  document_xml = (main_doc_path && zip_content[main_doc_path]) ||
                 zip_content["word/document.xml"]
  if document_xml
    package.document =
      Uniword::Wordprocessingml::DocumentRoot.from_xml(document_xml)
  end

  if (xml = zip_content["word/styles.xml"])
    package.styles =
      Uniword::Wordprocessingml::StylesConfiguration.from_xml(xml)
  end

  if (xml = zip_content["word/numbering.xml"])
    package.numbering =
      Uniword::Wordprocessingml::NumberingConfiguration.from_xml(xml)
  end

  if (xml = zip_content["word/settings.xml"])
    package.settings = Uniword::Wordprocessingml::Settings.from_xml(xml)
  end

  if (xml = zip_content["word/fontTable.xml"])
    package.font_table = Uniword::Wordprocessingml::FontTable.from_xml(xml)
  end

  if (xml = zip_content["word/webSettings.xml"])
    package.web_settings = Uniword::Wordprocessingml::WebSettings.from_xml(xml)
  end

  # Parse document relationships - use dynamic path based on main document,
  # falling back to the conventional location.
  document_rels_xml = (main_doc_rels_path && zip_content[main_doc_rels_path]) ||
                      zip_content["word/_rels/document.xml.rels"]
  if document_rels_xml
    package.document_rels =
      Ooxml::Relationships::PackageRelationships.from_xml(document_rels_xml)
  end

  # Parse Theme
  if (xml = zip_content["word/theme/theme1.xml"])
    package.theme = Drawingml::Theme.from_xml(xml)

    theme_media = extract_theme_media(zip_content)
    package.theme.media_files = theme_media if theme_media.any?
  end

  if (xml = zip_content["word/theme/_rels/theme1.xml.rels"])
    package.theme_rels =
      Ooxml::Relationships::PackageRelationships.from_xml(xml)
  end

  # Parse Footnotes
  if (xml = zip_content["word/footnotes.xml"])
    package.footnotes = Uniword::Wordprocessingml::Footnotes.from_xml(xml)
  end

  # Parse Endnotes
  if (xml = zip_content["word/endnotes.xml"])
    package.endnotes = Uniword::Wordprocessingml::Endnotes.from_xml(xml)
  end

  # Parse Header and Footer parts
  extract_header_footer_parts(zip_content, package)

  # Parse Chart parts. Chart parts are stored on the document, so a missing
  # document must skip this step instead of raising NoMethodError on nil.
  chart_files = zip_content.keys.grep(%r{^word/charts/chart\d+\.xml$})
  if chart_files.any? && package.document && package.document_rels
    package.document.chart_parts ||= {}
    chart_files.each do |chart_path|
      chart_target = chart_path.sub("word/", "")
      rel = package.document_rels.relationships.find do |r|
        r.target == chart_target &&
          r.type.to_s.include?("officeDocument/2006/relationships/chart")
      end
      next unless rel

      package.document.chart_parts[rel.id] = {
        xml: zip_content[chart_path],
        target: chart_target
      }
    end
  end

  # Extract image parts from word/media/ directory
  extract_image_parts(zip_content, package, zip_path)

  package
end

.read_binary_from_zip(zip_path, entry_path) ⇒ `Object`

Read binary data directly from ZIP file without UTF-8 encoding

# File 'lib/uniword/docx/package.rb', line 366

# Read one entry's raw bytes straight from the ZIP file.
#
# The archive is opened for the duration of the call only. The entry's
# input stream is consumed through the block form of get_input_stream so
# that the stream is closed once it has been read, instead of being left
# open for the garbage collector.
#
# @param zip_path [String] path to the ZIP file on disk
# @param entry_path [String] name of the entry inside the archive
# @return [String, nil] the entry's content, or nil when the archive has no
#   such entry
def self.read_binary_from_zip(zip_path, entry_path)
  require "zip" # loaded lazily: only needed when binary data is re-read
  Zip::File.open(zip_path) do |zip_file|
    entry = zip_file.find_entry(entry_path)
    return nil unless entry

    # Capture the data in a local rather than relying on the return value
    # of get_input_stream, which is not the block's value for every entry
    # type.
    data = nil
    entry.get_input_stream { |stream| data = stream.read }
    data
  end
end

.to_file(document, path, profile: nil) ⇒ `Object`

Save document to file (class method for DocumentWriter compatibility)

# File 'lib/uniword/docx/package.rb', line 377

# Save a document to a DOCX file (class-level entry point kept for
# DocumentWriter compatibility).
#
# Wraps the document in a fresh package, copies the document's parts onto
# it, fills in any part that is still missing with its minimal/default
# value, and writes the package to +path+.
#
# @param document [Object] document to store in the package
# @param path [String] destination file path
# @param profile [Object, nil] profile to use; Profile.defaults when nil
# @return [Object] whatever the package's #to_file returns
def self.to_file(document, path, profile: nil)
  assembled = new.tap do |pkg|
    pkg.document = document
    pkg.profile = profile || Profile.defaults
    copy_document_parts_to_package(document, pkg)

    # Only parts the document did not supply are defaulted.
    pkg.content_types ||= minimal_content_types
    pkg.package_rels ||= minimal_package_rels
    pkg.document_rels ||= minimal_document_rels
    pkg.settings ||= Uniword::Wordprocessingml::Settings.new
    pkg.font_table ||= Uniword::Wordprocessingml::FontTable.new
    pkg.web_settings ||= Uniword::Wordprocessingml::WebSettings.new
  end

  assembled.to_file(path)
end

Instance Method Details

#body ⇒ `Object`



446
447
448

# File 'lib/uniword/docx/package.rb', line 446

# Body of the wrapped document.
#
# @return [Object, nil] document.body, or nil when there is no document
def body = document&.body

#charts ⇒ `Object`



460
461
462

# File 'lib/uniword/docx/package.rb', line 460

# Charts of the wrapped document.
#
# @return [Object] document.charts, or an empty array when there is no
#   document or it reports no charts
def charts = document&.charts || []

#each_paragraph ⇒ `Object`



454
455
456

# File 'lib/uniword/docx/package.rb', line 454

# Iterate over the paragraphs returned by #paragraphs.
#
# @yield [paragraph] each paragraph in turn
# @return [Object] whatever paragraphs.each returns for the given block
#   (for an Array without a block, an Enumerator)
def each_paragraph(&block)
  paragraphs.each(&block)
end

#paragraphs ⇒ `Object`

Delegate common DocumentRoot methods for API compatibility



438
439
440

# File 'lib/uniword/docx/package.rb', line 438

# Paragraphs of the wrapped document (delegated for API compatibility
# with DocumentRoot).
#
# @return [Object] document.paragraphs, or an empty array when there is no
#   document or it reports no paragraphs
def paragraphs = document&.paragraphs || []

#styles_configuration ⇒ `Object`



464
465
466

# File 'lib/uniword/docx/package.rb', line 464

# Styles configuration of the wrapped document.
#
# @return [Object, nil] document.styles_configuration, or nil when there is
#   no document
def styles_configuration = document&.styles_configuration

#tables ⇒ `Object`



442
443
444

# File 'lib/uniword/docx/package.rb', line 442

# Tables of the wrapped document.
#
# @return [Object] document.tables, or an empty array when there is no
#   document or it reports no tables
def tables = document&.tables || []

#text ⇒ `Object`



450
451
452

# File 'lib/uniword/docx/package.rb', line 450

# Text of the wrapped document.
#
# @return [Object] document.text, or an empty string when there is no
#   document or it reports no text
def text = document&.text || ""

#to_file(path) ⇒ `Object` Also known as: save

Save package to file

# File 'lib/uniword/docx/package.rb', line 410

# Save this package to a file.
#
# Serializes the package with #to_zip_content and hands the resulting
# entries to Infrastructure::ZipPackager for writing.
#
# @param path [String] destination file path
# @return [Object] whatever the packager's #package returns
def to_file(path)
  entries = to_zip_content
  Infrastructure::ZipPackager.new.package(entries, path)
end

#to_zip_content ⇒ `Object`

Generate ZIP content hash

# File 'lib/uniword/docx/package.rb', line 417

# Generate the ZIP content hash for this package.
#
# Missing required parts are first filled in with their minimal/default
# values, then the package is reconciled against the active profile, and
# finally the relationships and parts are written into a fresh hash by
# inject_part_relationships and serialize_package_parts.
#
# @return [Hash] the hash populated by the two serialization helpers
def to_zip_content
  defaults = self.class

  # Only parts that are still unset are defaulted.
  self.content_types ||= defaults.minimal_content_types
  self.package_rels ||= defaults.minimal_package_rels
  self.document_rels ||= defaults.minimal_document_rels

  self.settings ||= Uniword::Wordprocessingml::Settings.new
  self.font_table ||= Uniword::Wordprocessingml::FontTable.new
  self.web_settings ||= Uniword::Wordprocessingml::WebSettings.new

  active_profile = profile || Profile.defaults
  Reconciler.new(self, profile: active_profile).reconcile

  {}.tap do |parts|
    inject_part_relationships(parts, content_types, package_rels, document_rels)
    serialize_package_parts(parts, content_types, package_rels, document_rels)
  end
end

Class: Uniword::Docx::Package

Overview

Examples:

Load DOCX

Access document content

Constant Summary

Constants included from PackageDefaults

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Methods included from PackageSerialization

Methods included from PackageDefaults

Instance Attribute Details

#bibliography_sources ⇒ Object

#chart_parts ⇒ Object

#custom_xml_items ⇒ Object

#profile ⇒ Object

Class Method Details

.extract_header_footer_parts(zip_content, package) ⇒ Object

.extract_image_parts(zip_content, package, zip_path = nil) ⇒ Object

.extract_theme_media(zip_content) ⇒ Object

.find_document_rels_path(doc_path) ⇒ Object

.find_main_document_path(package_rels) ⇒ Object

.from_file(path) ⇒ Package

.from_zip_content(zip_content, zip_path = nil) ⇒ Package

.read_binary_from_zip(zip_path, entry_path) ⇒ Object

.to_file(document, path, profile: nil) ⇒ Object

Instance Method Details

#body ⇒ Object

#charts ⇒ Object

#each_paragraph ⇒ Object

#paragraphs ⇒ Object

#styles_configuration ⇒ Object

#tables ⇒ Object

#text ⇒ Object

#to_file(path) ⇒ Object Also known as: save

#to_zip_content ⇒ Object

#bibliography_sources ⇒ `Object`

#chart_parts ⇒ `Object`

#custom_xml_items ⇒ `Object`

#profile ⇒ `Object`

.extract_header_footer_parts(zip_content, package) ⇒ `Object`

.extract_image_parts(zip_content, package, zip_path = nil) ⇒ `Object`

.extract_theme_media(zip_content) ⇒ `Object`

.find_document_rels_path(doc_path) ⇒ `Object`

.find_main_document_path(package_rels) ⇒ `Object`

.from_file(path) ⇒ `Package`

.from_zip_content(zip_content, zip_path = nil) ⇒ `Package`

.read_binary_from_zip(zip_path, entry_path) ⇒ `Object`

.to_file(document, path, profile: nil) ⇒ `Object`

#body ⇒ `Object`

#charts ⇒ `Object`

#each_paragraph ⇒ `Object`

#paragraphs ⇒ `Object`

#styles_configuration ⇒ `Object`

#tables ⇒ `Object`

#text ⇒ `Object`

#to_file(path) ⇒ `Object` Also known as: save

#to_zip_content ⇒ `Object`