Class: Uniword::Mhtml::Document

Inherits:
Lutaml::Model::Serializable
  • Object
show all
Defined in:
lib/uniword/mhtml/document.rb

Overview

MHTML Document — top-level model for .mht/.mhtml/.doc files.

This is COMPLETELY SEPARATE from OOXML Wordprocessingml::DocumentRoot. MHTML uses MIME multipart format with HTML content, not ZIP + OOXML XML.

Structure:

Mhtml::Document
  ├── html_part (HtmlPart) — main document HTML
  ├── parts[] (MimePart) — all MIME parts (images, XML, theme, etc.)
  ├── document_properties (Metadata::DocumentProperties)
  ├── word_document_settings (Metadata::WordDocumentSettings)
  └── filelist_xml (String)

Instance Method Summary collapse

Instance Method Details

#add_part(part) ⇒ Object

Add a MIME part



154
155
156
157
# File 'lib/uniword/mhtml/document.rb', line 154

# Append a MIME part to this document's +parts+ collection.
#
# @param part [Object] the part to append (added with +<<+)
# @return [self] the receiver, so calls can be chained
def add_part(part)
  tap { parts << part }
end

#body_html ⇒ Object

Body inner HTML



56
57
58
# File 'lib/uniword/mhtml/document.rb', line 56

# Body inner HTML, delegated to the main HTML part.
#
# @return [Object, nil] the result of +html_part.body_html+, or nil when
#   +html_part+ is nil
def body_html
  main = html_part
  return if main.nil?

  main.body_html
end

#color_scheme_mapping_part ⇒ XmlPart?

Returns Color scheme mapping part.

Returns:

  • (XmlPart, nil)

    Color scheme mapping part



109
110
111
112
113
# File 'lib/uniword/mhtml/document.rb', line 109

# Locate the color scheme mapping part: the first XmlPart in +parts+ whose
# filename contains "colorschememapping".
#
# @return [XmlPart, nil] the matching part, or nil when none matches
def color_scheme_mapping_part
  parts.find do |candidate|
    next false unless candidate.is_a?(XmlPart)

    name = candidate.filename
    # Parts without a filename can never match.
    !name.nil? && name.include?("colorschememapping")
  end
end

#color_scheme_mapping_xml ⇒ String?

Returns Color scheme mapping XML.

Returns:

  • (String, nil)

    Color scheme mapping XML



116
117
118
# File 'lib/uniword/mhtml/document.rb', line 116

# Decoded content of the color scheme mapping part.
#
# @return [String, nil] +decoded_content+ of +color_scheme_mapping_part+,
#   or nil when there is no such part
def color_scheme_mapping_xml
  mapping = color_scheme_mapping_part
  mapping.decoded_content unless mapping.nil?
end

#css_styles ⇒ Object

CSS styles from HTML head



61
62
63
# File 'lib/uniword/mhtml/document.rb', line 61

# CSS styles from the HTML head, delegated to the main HTML part.
#
# @return [Object, nil] the result of +html_part.css_styles+, or nil when
#   +html_part+ is nil
def css_styles
  main = html_part
  return if main.nil?

  main.css_styles
end

#filelist_part ⇒ XmlPart?

Returns Filelist XML part.

Returns:

  • (XmlPart, nil)

    Filelist XML part



99
100
101
# File 'lib/uniword/mhtml/document.rb', line 99

# Locate the file list part: the first XmlPart in +parts+ whose filename is
# exactly "filelist.xml".
#
# @return [XmlPart, nil] the matching part, or nil when none matches
def filelist_part
  parts.find do |candidate|
    next false unless candidate.is_a?(XmlPart)

    candidate.filename == "filelist.xml"
  end
end

#filelist_xml ⇒ String?

Returns Filelist XML content.

Returns:

  • (String, nil)

    Filelist XML content



104
105
106
# File 'lib/uniword/mhtml/document.rb', line 104

# Decoded content of the file list part.
#
# @return [String, nil] +decoded_content+ of +filelist_part+, or nil when
#   there is no such part
def filelist_xml
  listing = filelist_part
  listing.decoded_content unless listing.nil?
end

#footer_html ⇒ String?

Returns Footer HTML (placeholder).

Returns:

  • (String, nil)

    Footer HTML (placeholder)



133
134
135
136
137
# File 'lib/uniword/mhtml/document.rb', line 133

# Decoded content of the first header/footer part whose filename contains
# "footer".
#
# @return [String, nil] the part's +decoded_content+, or nil when no
#   header/footer part matches
def footer_html
  footer = header_footer_parts.find { |part| part.filename&.include?("footer") }
  footer&.decoded_content
end

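#header_footer_parts ⇒ Array<HeaderFooterPart>

Returns Header/footer HTML parts.

Returns:

  • (Array<HeaderFooterPart>)

    Header/footer HTML parts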



121
122
123
# File 'lib/uniword/mhtml/document.rb', line 121

# All entries of +parts+ that are HeaderFooterPart instances (subclasses
# included), in their original order.
#
# @return [Array<HeaderFooterPart>] possibly empty
def header_footer_parts
  parts.select { |part| part.is_a?(HeaderFooterPart) }
end

#header_html ⇒ String?

Returns Header HTML.

Returns:

  • (String, nil)

    Header HTML



126
127
128
129
130
# File 'lib/uniword/mhtml/document.rb', line 126

# Decoded content of the first header/footer part whose filename contains
# "header".
#
# @return [String, nil] the part's +decoded_content+, or nil when no
#   header/footer part matches
def header_html
  header = header_footer_parts.find { |part| part.filename&.include?("header") }
  header&.decoded_content
end

#html ⇒ HtmlPart

Returns The main HTML part.

Returns:

  • (HtmlPart)

    The main HTML part



39
40
41
# File 'lib/uniword/mhtml/document.rb', line 39

# The main HTML part.
#
# Reads the +@html_part+ instance variable directly instead of calling the
# +html_part+ reader that the other methods use.
# NOTE(review): presumably both yield the same object — confirm against the
# attribute definition.
#
# @return [HtmlPart, nil] the stored HTML part; nil if the variable is unset
def html
  @html_part
end

#image_parts ⇒ Array<ImagePart>

Returns All image parts.

Returns:

  • (Array<ImagePart>)

    All image parts



89
90
91
# File 'lib/uniword/mhtml/document.rb', line 89

# All entries of +parts+ that are ImagePart instances (subclasses
# included), in their original order.
#
# @return [Array<ImagePart>] possibly empty
def image_parts
  parts.select { |part| part.is_a?(ImagePart) }
end

#images ⇒ Hash

Returns Images as filename => decoded data.

Returns:

  • (Hash)

    Images as filename => decoded data



147
148
149
150
151
# File 'lib/uniword/mhtml/document.rb', line 147

# Map each image part's filename to its decoded content.
#
# Parts with a nil/false filename are skipped. When several parts share a
# filename, the last one wins.
#
# @return [Hash] filename => decoded content
def images
  named = image_parts.select(&:filename)
  named.map { |part| [part.filename, part.decoded_content] }.to_h
end

#inspect ⇒ Object

Build a summary of the document structure



160
161
162
163
# File 'lib/uniword/mhtml/document.rb', line 160

def inspect
  "#<#{self.class} parts=#{parts.length} images=#{image_parts.length} " \
    "xml=#{xml_parts.length} theme=#{theme_part ? 'yes' : 'no'}>"
end

#placeholder_html ⇒ String?

Returns Placeholder header HTML.

Returns:

  • (String, nil)

    Placeholder header HTML



140
141
142
143
144
# File 'lib/uniword/mhtml/document.rb', line 140

# Decoded content of the first header/footer part whose filename contains
# "plchdr".
#
# @return [String, nil] the part's +decoded_content+, or nil when no
#   header/footer part matches
def placeholder_html
  placeholder = header_footer_parts.find { |part| part.filename&.include?("plchdr") }
  placeholder&.decoded_content
end

#raw_html ⇒ Object

Raw HTML string of the main HTML part



44
45
46
# File 'lib/uniword/mhtml/document.rb', line 44

# Raw HTML of the main HTML part, i.e. its +decoded_content+.
#
# @return [Object, nil] the result of +html_part.decoded_content+, or nil
#   when +html_part+ is nil
def raw_html
  main = html_part
  return if main.nil?

  main.decoded_content
end

#raw_html=(value) ⇒ Object



48
49
50
51
52
53
# File 'lib/uniword/mhtml/document.rb', line 48

# Store +value+ as the raw content of the main HTML part.
#
# Creates the HTML part with +HtmlPart.new+ when none is set yet, then
# (re)sets its content type to "text/html" and its transfer encoding to
# "quoted-printable" before assigning the content.
#
# @param value [Object] assigned to the part's +raw_content+
def raw_html=(value)
  self.html_part ||= HtmlPart.new

  main = html_part
  main.content_type = "text/html"
  main.content_transfer_encoding = "quoted-printable"
  main.raw_content = value
end

#text ⇒ Object

Text content (stripped of HTML tags)



66
67
68
69
70
71
72
73
74
75
76
77
78
79
# File 'lib/uniword/mhtml/document.rb', line 66

# Plain-text rendering of the main HTML part.
#
# Every tag is replaced by a space, a small set of character entities is
# decoded, and runs of whitespace are collapsed to single spaces.
#
# Entities are decoded in a single pass. Decoding them one after another
# (with "&amp;" before "&quot;", "&#39;" and "&nbsp;") would decode escaped
# entity text twice: "&amp;nbsp;" would become a space instead of the
# literal "&nbsp;".
#
# @return [String] the stripped text; "" when there is no raw HTML
def text
  markup = raw_html
  return "" unless markup

  entities = {
    "&lt;" => "<",
    "&gt;" => ">",
    "&amp;" => "&",
    "&quot;" => '"',
    "&#39;" => "'",
    "&nbsp;" => " "
  }

  markup
    .gsub(/<[^>]+>/, " ")
    .gsub(/&(?:lt|gt|amp|quot|#39|nbsp);/, entities)
    .gsub(/\s+/, " ")
    .strip
end

#theme_part ⇒ ThemePart?

Returns Theme data part.

Returns:

  • (ThemePart, nil)

    Theme data part



94
95
96
# File 'lib/uniword/mhtml/document.rb', line 94

# The first entry of +parts+ that is a ThemePart (subclasses included).
#
# @return [ThemePart, nil] nil when +parts+ holds no theme part
def theme_part
  parts.grep(ThemePart).first
end

#xml_parts ⇒ Array<XmlPart>

Returns All XML parts.

Returns:

  • (Array<XmlPart>)

    All XML parts



84
85
86
# File 'lib/uniword/mhtml/document.rb', line 84

# All entries of +parts+ that are XmlPart instances (subclasses included),
# in their original order.
#
# @return [Array<XmlPart>] possibly empty
def xml_parts
  parts.select { |part| part.is_a?(XmlPart) }
end