Class: Uniword::Mhtml::Document

Inherits:

Lutaml::Model::Serializable

Object
Lutaml::Model::Serializable
Uniword::Mhtml::Document

show all

Includes: DocumentInput

Defined in: lib/uniword/mhtml/document.rb

Overview

MHTML Document — top-level model for .mht/.mhtml/.doc files.

This is COMPLETELY SEPARATE from OOXML Wordprocessingml::DocumentRoot. MHTML uses MIME multipart format with HTML content, not ZIP + OOXML XML.

Structure:

Mhtml::Document
  ├── html_part (HtmlPart) — main document HTML
  ├── parts[] (MimePart) — all MIME parts (images, XML, theme, etc.)
  ├── document_properties (Metadata::DocumentProperties)
  ├── word_document_settings (Metadata::WordDocumentSettings)
  └── filelist_xml (String)

Instance Method Summary collapse

#add_part(part) ⇒ Object

Add a MIME part.
#body_html ⇒ Object

Body inner HTML.
#color_scheme_mapping_part ⇒ XmlPart?

Color scheme mapping part.
#color_scheme_mapping_xml ⇒ String?

Color scheme mapping XML.
#css_styles ⇒ Object

CSS styles from HTML head.
#document_stats ⇒ Hash

Document statistics (paragraphs, tables, images).
#filelist_part ⇒ XmlPart?

Filelist XML part.
#filelist_xml ⇒ String?

Filelist XML content.
#footer_html ⇒ String?

Footer HTML (placeholder).
#header_footer_parts ⇒ Array<HeaderFooterPart>

Header/footer HTML parts.
#header_html ⇒ String?

Header HTML.
#html ⇒ HtmlPart

The main HTML part.
#image_parts ⇒ Array<ImagePart>

All image parts.
#images ⇒ Hash

Images as filename => decoded data.
#inspect ⇒ Object

Build a summary of the document structure.
#placeholder_html ⇒ String?

Placeholder header HTML.
#raw_html ⇒ Object

Raw HTML string of the main HTML part.
#raw_html=(value) ⇒ Object
#text ⇒ Object

Text content (stripped of HTML tags).
#theme_part ⇒ ThemePart?

Theme data part.
#xml_parts ⇒ Array<XmlPart>

All XML parts.

Instance Method Details

#add_part(part) ⇒ `Object`

Add a MIME part

# File 'lib/uniword/mhtml/document.rb', line 158

# Append a MIME part to this document's +parts+ list.
#
# @param part [Object] the part pushed onto +parts+
# @return [self] the receiver, so calls can be chained
def add_part(part)
  tap { parts << part }
end

#body_html ⇒ `Object`

Body inner HTML



60
61
62

# File 'lib/uniword/mhtml/document.rb', line 60

# Inner HTML of the body, delegated to the main HTML part.
#
# @return [Object, nil] the value of +html_part.body_html+, or nil when there
#   is no HTML part
def body_html
  main_part = html_part
  main_part.body_html unless main_part.nil?
end

#color_scheme_mapping_part ⇒ `XmlPart`?

Returns the color scheme mapping part.

Returns:

(XmlPart, nil) —

Color scheme mapping part

# File 'lib/uniword/mhtml/document.rb', line 113

# Locate the color scheme mapping part.
#
# Picks the first entry of +parts+ that is an XmlPart and whose filename
# contains "colorschememapping". Parts with a nil filename never match.
#
# @return [XmlPart, nil] the matching part, or nil when none matches
def color_scheme_mapping_part
  parts.find do |candidate|
    next false unless candidate.is_a?(XmlPart)

    candidate.filename&.include?("colorschememapping")
  end
end

#color_scheme_mapping_xml ⇒ `String`?

Returns the color scheme mapping XML.

Returns:

(String, nil) —

Color scheme mapping XML



120
121
122

# File 'lib/uniword/mhtml/document.rb', line 120

# Decoded content of the color scheme mapping part.
#
# @return [String, nil] the part's +decoded_content+, or nil when the document
#   has no color scheme mapping part
def color_scheme_mapping_xml
  mapping = color_scheme_mapping_part
  return if mapping.nil?

  mapping.decoded_content
end

#css_styles ⇒ `Object`

CSS styles from HTML head



65
66
67

# File 'lib/uniword/mhtml/document.rb', line 65

# CSS styles, delegated to the main HTML part.
#
# @return [Object, nil] the value of +html_part.css_styles+, or nil when there
#   is no HTML part
def css_styles
  main_part = html_part
  main_part.nil? ? nil : main_part.css_styles
end

#document_stats ⇒ `Hash`

Returns Document statistics (paragraphs, tables, images).

Returns:

(Hash) —

Document statistics (paragraphs, tables, images)

# File 'lib/uniword/mhtml/document.rb', line 170

# Count paragraph, table and image tags in the document's HTML.
#
# Reads +raw_html+ and falls back to +html_content+ when that is nil
# (+html_content+ is defined outside this listing). Matching is
# case-insensitive, and every tag name must be followed by whitespace or ">"
# (or "/" for the void <img> element), so look-alike names such as <tablet>
# or <imgfoo> are not counted.
#
# @return [Hash{Symbol => Integer}] counts under :paragraphs, :tables and
#   :images; all zero when no HTML is available
def document_stats
  html = raw_html || html_content
  return { paragraphs: 0, tables: 0, images: 0 } unless html

  {
    paragraphs: html.scan(/<p[\s>]/i).size,
    tables: html.scan(/<table[\s>]/i).size,
    images: html.scan(/<img[\s\/>]/i).size,
  }
end

#filelist_part ⇒ `XmlPart`?

Returns the filelist XML part.

Returns:

(XmlPart, nil) —

Filelist XML part



103
104
105

# File 'lib/uniword/mhtml/document.rb', line 103

# Locate the filelist part.
#
# @return [XmlPart, nil] the first XmlPart in +parts+ whose filename is
#   exactly "filelist.xml", or nil when there is none
def filelist_part
  parts.find do |candidate|
    next false unless candidate.is_a?(XmlPart)

    candidate.filename == "filelist.xml"
  end
end

#filelist_xml ⇒ `String`?

Returns the filelist XML content.

Returns:

(String, nil) —

Filelist XML content



108
109
110

# File 'lib/uniword/mhtml/document.rb', line 108

# Decoded content of the filelist part.
#
# @return [String, nil] the part's +decoded_content+, or nil when the document
#   has no filelist part
def filelist_xml
  listing = filelist_part
  listing.decoded_content unless listing.nil?
end

#footer_html ⇒ `String`?

Returns the footer HTML (placeholder).

Returns:

(String, nil) —

Footer HTML (placeholder)

# File 'lib/uniword/mhtml/document.rb', line 137

# Decoded content of the footer part.
#
# Looks through +header_footer_parts+ for the first part whose filename
# contains "footer"; parts with a nil filename are skipped.
#
# @return [String, nil] that part's +decoded_content+, or nil when no part
#   matches
def footer_html
  footer = header_footer_parts.find { |part| part.filename&.include?("footer") }
  footer&.decoded_content
end

#header_footer_parts ⇒ `Array<HeaderFooterPart>`

Returns Header/footer HTML parts.

Returns:

(Array<HeaderFooterPart>) —

Header/footer HTML parts



125
126
127

# File 'lib/uniword/mhtml/document.rb', line 125

# All header/footer parts of the document.
#
# @return [Array<HeaderFooterPart>] the entries of +parts+ that are
#   HeaderFooterPart instances, in their original order
def header_footer_parts
  parts.select { |part| part.is_a?(HeaderFooterPart) }
end

#header_html ⇒ `String`?

Returns the header HTML.

Returns:

(String, nil) —

Header HTML

# File 'lib/uniword/mhtml/document.rb', line 130

# Decoded content of the header part.
#
# Looks through +header_footer_parts+ for the first part whose filename
# contains "header"; parts with a nil filename are skipped.
#
# @return [String, nil] that part's +decoded_content+, or nil when no part
#   matches
def header_html
  header = header_footer_parts.find { |part| part.filename&.include?("header") }
  header&.decoded_content
end

#html ⇒ `HtmlPart`

Returns The main HTML part.

Returns:

(HtmlPart) —

The main HTML part



43
44
45

# File 'lib/uniword/mhtml/document.rb', line 43

# The main HTML part, read straight from the @html_part instance variable.
#
# NOTE(review): sibling methods (body_html, css_styles, raw_html) go through
# the html_part reader instead of the instance variable — presumably
# equivalent; confirm before unifying.
#
# @return [HtmlPart, nil] the stored part; nil if @html_part was never set
def html
  @html_part
end

#image_parts ⇒ `Array<ImagePart>`

Returns All image parts.

Returns:

(Array<ImagePart>) —

All image parts



93
94
95

# File 'lib/uniword/mhtml/document.rb', line 93

# All image parts of the document.
#
# @return [Array<ImagePart>] the entries of +parts+ that are ImagePart
#   instances, in their original order
def image_parts
  parts.select { |part| part.is_a?(ImagePart) }
end

#images ⇒ `Hash`

Returns Images as filename => decoded data.

Returns:

(Hash) —

Images as filename => decoded data

# File 'lib/uniword/mhtml/document.rb', line 151

# Images keyed by filename.
#
# Image parts without a filename are skipped. When several parts share a
# filename, the last one's content wins.
#
# @return [Hash] filename => the part's +decoded_content+
def images
  named = image_parts.select(&:filename)
  named.to_h { |image| [image.filename, image.decoded_content] }
end

#inspect ⇒ `Object`

Build a summary of the document structure

# File 'lib/uniword/mhtml/document.rb', line 164

# One-line summary of the document structure.
#
# Reports the class name, the total number of parts, the number of image and
# XML parts, and whether a theme part is present.
#
# @return [String] e.g. "#<Klass parts=3 images=1 xml=2 theme=no>"
def inspect
  klass = self.class
  part_count = parts.length
  image_count = image_parts.length
  xml_count = xml_parts.length
  theme_flag = theme_part ? "yes" : "no"

  "#<#{klass} parts=#{part_count} images=#{image_count} xml=#{xml_count} theme=#{theme_flag}>"
end

#placeholder_html ⇒ `String`?

Returns the placeholder header HTML.

Returns:

(String, nil) —

Placeholder header HTML

# File 'lib/uniword/mhtml/document.rb', line 144

# Decoded content of the placeholder header part.
#
# Looks through +header_footer_parts+ for the first part whose filename
# contains "plchdr"; parts with a nil filename are skipped.
#
# @return [String, nil] that part's +decoded_content+, or nil when no part
#   matches
def placeholder_html
  placeholder = header_footer_parts.find { |part| part.filename&.include?("plchdr") }
  placeholder&.decoded_content
end

#raw_html ⇒ `Object`

Raw HTML string of the main HTML part



48
49
50

# File 'lib/uniword/mhtml/document.rb', line 48

# Raw HTML string of the main HTML part.
#
# @return [Object, nil] the value of +html_part.decoded_content+, or nil when
#   there is no HTML part
def raw_html
  main_part = html_part
  return if main_part.nil?

  main_part.decoded_content
end

#raw_html=(value) ⇒ `Object`

# File 'lib/uniword/mhtml/document.rb', line 52

# Store +value+ as the raw content of the main HTML part.
#
# Creates the HTML part when the document has none yet. The part's content
# type is always (re)set to "text/html" and its transfer encoding to
# "quoted-printable".
# NOTE(review): whether raw_content expects already-encoded or plain HTML is
# not visible here — confirm against HtmlPart.
#
# @param value [Object] the content assigned to +html_part.raw_content+
def raw_html=(value)
  self.html_part = HtmlPart.new unless html_part

  main_part = html_part
  main_part.content_type = "text/html"
  main_part.content_transfer_encoding = "quoted-printable"
  main_part.raw_content = value
end

#text ⇒ `Object`

Text content (stripped of HTML tags)

# File 'lib/uniword/mhtml/document.rb', line 70

# Plain-text rendering of the main HTML.
#
# Every tag is replaced by a space, the entities &lt; &gt; &quot; &#39;
# &nbsp; and &amp; are decoded, and runs of whitespace collapse to a single
# space. Other entities are left untouched.
#
# @return [String] the stripped text, or "" when +raw_html+ is nil
def text
  html = raw_html
  return "" unless html

  # &amp; must be decoded last: decoding it earlier turns an escaped entity
  # such as "&amp;quot;" into "&quot;", which a later step would then decode
  # a second time into a bare quote.
  html
    .gsub(/<[^>]+>/, " ")
    .gsub("&lt;", "<")
    .gsub("&gt;", ">")
    .gsub("&quot;", '"')
    .gsub("&#39;", "'")
    .gsub("&nbsp;", " ")
    .gsub("&amp;", "&")
    .gsub(/\s+/, " ")
    .strip
end

#theme_part ⇒ `ThemePart`?

Returns the theme data part.

Returns:

(ThemePart, nil) —

Theme data part



98
99
100

# File 'lib/uniword/mhtml/document.rb', line 98

# Locate the theme part.
#
# @return [ThemePart, nil] the first entry of +parts+ that is a ThemePart, or
#   nil when there is none
def theme_part
  parts.find do |candidate|
    candidate.is_a?(ThemePart)
  end
end

#xml_parts ⇒ `Array<XmlPart>`

Returns All XML parts.

Returns:

(Array<XmlPart>) —

All XML parts



88
89
90

# File 'lib/uniword/mhtml/document.rb', line 88

# All XML parts of the document.
#
# @return [Array<XmlPart>] the entries of +parts+ that are XmlPart instances,
#   in their original order
def xml_parts
  parts.select { |part| part.is_a?(XmlPart) }
end

Class: Uniword::Mhtml::Document

Overview

Instance Method Summary collapse

Instance Method Details

#add_part(part) ⇒ Object

#body_html ⇒ Object

#color_scheme_mapping_part ⇒ XmlPart?

#color_scheme_mapping_xml ⇒ String?

#css_styles ⇒ Object

#document_stats ⇒ Hash

#filelist_part ⇒ XmlPart?

#filelist_xml ⇒ String?

#footer_html ⇒ String?

#header_footer_parts ⇒ Array<HeaderFooterPart>

#header_html ⇒ String?

#html ⇒ HtmlPart

#image_parts ⇒ Array<ImagePart>

#images ⇒ Hash

#inspect ⇒ Object

#placeholder_html ⇒ String?

#raw_html ⇒ Object

#raw_html=(value) ⇒ Object

#text ⇒ Object

#theme_part ⇒ ThemePart?

#xml_parts ⇒ Array<XmlPart>

#add_part(part) ⇒ `Object`

#body_html ⇒ `Object`

#color_scheme_mapping_part ⇒ `XmlPart`^?

#color_scheme_mapping_xml ⇒ `String`^?

#css_styles ⇒ `Object`

#document_stats ⇒ `Hash`

#filelist_part ⇒ `XmlPart`^?

#filelist_xml ⇒ `String`^?

#footer_html ⇒ `String`^?

#header_footer_parts ⇒ `Array<HeaderFooterPart>`

#header_html ⇒ `String`^?

#html ⇒ `HtmlPart`

#image_parts ⇒ `Array<ImagePart>`

#images ⇒ `Hash`

#inspect ⇒ `Object`

#placeholder_html ⇒ `String`^?

#raw_html ⇒ `Object`

#raw_html=(value) ⇒ `Object`

#text ⇒ `Object`

#theme_part ⇒ `ThemePart`^?

#xml_parts ⇒ `Array<XmlPart>`