Class: Uniword::Mhtml::Document

Inherits:
Lutaml::Model::Serializable
  • Object
show all
Includes:
DocumentInput
Defined in:
lib/uniword/mhtml/document.rb

Overview

MHTML Document — top-level model for .mht/.mhtml/.doc files.

This is COMPLETELY SEPARATE from OOXML Wordprocessingml::DocumentRoot. MHTML uses MIME multipart format with HTML content, not ZIP + OOXML XML.

Structure:

Mhtml::Document
  ├── html_part (HtmlPart) — main document HTML
  ├── parts[] (MimePart) — all MIME parts (images, XML, theme, etc.)
  ├── document_properties (Metadata::DocumentProperties)
  ├── word_document_settings (Metadata::WordDocumentSettings)
  └── filelist_xml (String)

Instance Method Summary collapse

Instance Method Details

#add_part(part) ⇒ Object

Add a MIME part



158
159
160
161
# File 'lib/uniword/mhtml/document.rb', line 158

# Append a MIME part to this document's list of parts.
#
# @param part [Object] the part to append to +parts+
# @return [self] the receiver, so calls can be chained
def add_part(part)
  tap { parts << part }
end

#body_html ⇒ Object

Body inner HTML



60
61
62
# File 'lib/uniword/mhtml/document.rb', line 60

# Body inner HTML, as reported by the main HTML part.
#
# @return [Object, nil] whatever +html_part.body_html+ returns, or nil
#   when the document has no HTML part
def body_html
  main_part = html_part
  main_part.body_html unless main_part.nil?
end

#color_scheme_mapping_part ⇒ XmlPart?

Returns Color scheme mapping part.

Returns:

  • (XmlPart, nil)

    Color scheme mapping part



113
114
115
116
117
# File 'lib/uniword/mhtml/document.rb', line 113

# Locate the color scheme mapping part: the first XmlPart in +parts+
# whose filename contains "colorschememapping".
#
# @return [XmlPart, nil] the matching part, or nil when none matches
def color_scheme_mapping_part
  parts.grep(XmlPart).find do |xml|
    # Parts without a filename can never match.
    xml.filename&.include?("colorschememapping")
  end
end

#color_scheme_mapping_xml ⇒ String?

Returns Color scheme mapping XML.

Returns:

  • (String, nil)

    Color scheme mapping XML



120
121
122
# File 'lib/uniword/mhtml/document.rb', line 120

# Decoded content of the color scheme mapping part.
#
# @return [String, nil] the part's +decoded_content+, or nil when the
#   document has no color scheme mapping part
def color_scheme_mapping_xml
  mapping = color_scheme_mapping_part
  mapping.nil? ? nil : mapping.decoded_content
end

#css_styles ⇒ Object

CSS styles from HTML head



65
66
67
# File 'lib/uniword/mhtml/document.rb', line 65

# CSS styles from the HTML head, as reported by the main HTML part.
#
# @return [Object, nil] whatever +html_part.css_styles+ returns, or nil
#   when the document has no HTML part
def css_styles
  return nil if html_part.nil?

  html_part.css_styles
end

#document_stats ⇒ Hash

Returns Document statistics (paragraphs, tables, images).

Returns:

  • (Hash)

    Document statistics (paragraphs, tables, images)



170
171
172
173
174
175
176
177
178
179
180
181
# File 'lib/uniword/mhtml/document.rb', line 170

# Count opening <p>, <table> and <img> tags in the document HTML.
#
# The HTML is taken from +raw_html+, falling back to +html_content+ when
# +raw_html+ is nil. Matching is case-insensitive and purely textual
# (regex scans, no HTML parsing).
#
# @return [Hash{Symbol => Integer}] counts under +:paragraphs+,
#   +:tables+ and +:images+; all zero when no HTML is available
def document_stats
  markup = raw_html || html_content
  return { paragraphs: 0, tables: 0, images: 0 } unless markup

  paragraph_count = markup.scan(/<p[\s>]/i).count
  table_count = markup.scan(/<table/i).count
  image_count = markup.scan(/<img/i).count

  { paragraphs: paragraph_count, tables: table_count, images: image_count }
end

#filelist_part ⇒ XmlPart?

Returns Filelist XML part.

Returns:

  • (XmlPart, nil)

    Filelist XML part



103
104
105
# File 'lib/uniword/mhtml/document.rb', line 103

# Locate the filelist part: the first XmlPart in +parts+ whose filename
# is exactly "filelist.xml".
#
# @return [XmlPart, nil] the matching part, or nil when none matches
def filelist_part
  parts.grep(XmlPart).find { |xml| xml.filename == "filelist.xml" }
end

#filelist_xml ⇒ String?

Returns Filelist XML content.

Returns:

  • (String, nil)

    Filelist XML content



108
109
110
# File 'lib/uniword/mhtml/document.rb', line 108

# Decoded content of the filelist part.
#
# @return [String, nil] the part's +decoded_content+, or nil when the
#   document has no filelist part
def filelist_xml
  listing = filelist_part
  listing.decoded_content unless listing.nil?
end

#footer_html ⇒ String?

Returns Footer HTML (placeholder).

Returns:

  • (String, nil)

    Footer HTML (placeholder)



137
138
139
140
141
# File 'lib/uniword/mhtml/document.rb', line 137

# Decoded content of the first header/footer part whose filename
# contains "footer".
#
# @return [String, nil] the part's +decoded_content+, or nil when no
#   such part exists
def footer_html
  footer = header_footer_parts.find { |part| part.filename&.include?("footer") }
  footer&.decoded_content
end

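#header_footer_parts ⇒ Array<HeaderFooterPart>

Returns Header/footer HTML parts.

Returns:

  • (Array<HeaderFooterPart>)

    Header/footer HTML parts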



125
126
127
# File 'lib/uniword/mhtml/document.rb', line 125

# All parts that are HeaderFooterPart instances (subclasses included),
# in their original order.
#
# @return [Array<HeaderFooterPart>] possibly empty
def header_footer_parts
  parts.select { |part| part.is_a?(HeaderFooterPart) }
end

#header_html ⇒ String?

Returns Header HTML.

Returns:

  • (String, nil)

    Header HTML



130
131
132
133
134
# File 'lib/uniword/mhtml/document.rb', line 130

# Decoded content of the first header/footer part whose filename
# contains "header".
#
# @return [String, nil] the part's +decoded_content+, or nil when no
#   such part exists
def header_html
  header_footer_parts.each do |part|
    return part.decoded_content if part.filename&.include?("header")
  end

  nil
end

#html ⇒ HtmlPart

Returns The main HTML part.

Returns:

  • (HtmlPart)

    The main HTML part



43
44
45
# File 'lib/uniword/mhtml/document.rb', line 43

# The main HTML part.
#
# Reads through the +html_part+ accessor, the same way +raw_html+,
# +body_html+ and +css_styles+ do, instead of touching the instance
# variable directly, so all readers agree on a single source.
#
# @return [HtmlPart, nil] whatever +html_part+ currently returns
def html
  html_part
end

#image_parts ⇒ Array<ImagePart>

Returns All image parts.

Returns:

  • (Array<ImagePart>)

    All image parts



93
94
95
# File 'lib/uniword/mhtml/document.rb', line 93

# All parts that are ImagePart instances (subclasses included), in
# their original order.
#
# @return [Array<ImagePart>] possibly empty
def image_parts
  parts.select { |part| part.is_a?(ImagePart) }
end

#images ⇒ Hash

Returns Images as filename => decoded data.

Returns:

  • (Hash)

    Images as filename => decoded data



151
152
153
154
155
# File 'lib/uniword/mhtml/document.rb', line 151

# Map each named image part's filename to its decoded content.
#
# Parts with a nil filename are skipped. If two parts share a filename,
# the later part's content wins.
#
# @return [Hash] filename => decoded data
def images
  named = image_parts.select(&:filename)
  named.to_h { |image| [image.filename, image.decoded_content] }
end

#inspect ⇒ Object

Build a summary of the document structure



164
165
166
167
# File 'lib/uniword/mhtml/document.rb', line 164

def inspect
  "#<#{self.class} parts=#{parts.length} images=#{image_parts.length} " \
    "xml=#{xml_parts.length} theme=#{theme_part ? 'yes' : 'no'}>"
end

#placeholder_html ⇒ String?

Returns Placeholder header HTML.

Returns:

  • (String, nil)

    Placeholder header HTML



144
145
146
147
148
# File 'lib/uniword/mhtml/document.rb', line 144

# Decoded content of the first header/footer part whose filename
# contains "plchdr".
#
# @return [String, nil] the part's +decoded_content+, or nil when no
#   such part exists
def placeholder_html
  placeholder = header_footer_parts.find { |part| part.filename&.include?("plchdr") }
  placeholder.decoded_content unless placeholder.nil?
end

#raw_html ⇒ Object

Raw HTML string of the main HTML part



48
49
50
# File 'lib/uniword/mhtml/document.rb', line 48

# Raw HTML string of the main HTML part.
#
# @return [Object, nil] the HTML part's +decoded_content+, or nil when
#   the document has no HTML part
def raw_html
  main_part = html_part
  main_part.nil? ? nil : main_part.decoded_content
end

#raw_html=(value) ⇒ Object



52
53
54
55
56
57
# File 'lib/uniword/mhtml/document.rb', line 52

# Store +value+ as the raw content of the main HTML part.
#
# Creates the HTML part when none is set yet, then stamps it as
# "text/html" with "quoted-printable" transfer encoding before
# assigning +raw_content+.
#
# @param value [Object] assigned to the part's +raw_content+ as-is
def raw_html=(value)
  self.html_part = HtmlPart.new unless html_part

  target = html_part
  target.content_type = "text/html"
  target.content_transfer_encoding = "quoted-printable"
  target.raw_content = value
end

#text ⇒ Object

Text content (stripped of HTML tags)



70
71
72
73
74
75
76
77
78
79
80
81
82
83
# File 'lib/uniword/mhtml/document.rb', line 70

# Plain-text rendering of the main HTML: tags are replaced by spaces,
# a small fixed set of entities is decoded, and whitespace is collapsed.
#
# Entities are decoded in a single pass. Decoding "&amp;" in an earlier
# pass than "&quot;", "&#39;" or "&nbsp;" would turn escaped entity text
# such as "&amp;quot;" into a quote character instead of the literal
# text "&quot;".
#
# @return [String] the stripped text; "" when there is no raw HTML
def text
  markup = raw_html
  return "" unless markup

  entities = {
    "&lt;" => "<",
    "&gt;" => ">",
    "&amp;" => "&",
    "&quot;" => '"',
    "&#39;" => "'",
    "&nbsp;" => " "
  }

  markup
    .gsub(/<[^>]+>/, " ")
    # Hash form of gsub: each match is replaced by its looked-up value.
    .gsub(/&(?:lt|gt|amp|quot|#39|nbsp);/, entities)
    .gsub(/\s+/, " ")
    .strip
end

#theme_part ⇒ ThemePart?

Returns Theme data part.

Returns:

  • (ThemePart, nil)

    Theme data part



98
99
100
# File 'lib/uniword/mhtml/document.rb', line 98

# The first part in +parts+ that is a ThemePart (subclasses included).
#
# @return [ThemePart, nil] nil when the document has no theme part
def theme_part
  parts.grep(ThemePart).first
end

#xml_parts ⇒ Array<XmlPart>

Returns All XML parts.

Returns:

  • (Array<XmlPart>)

    All XML parts



88
89
90
# File 'lib/uniword/mhtml/document.rb', line 88

# All parts that are XmlPart instances (subclasses included), in their
# original order.
#
# @return [Array<XmlPart>] possibly empty
def xml_parts
  parts.select { |part| part.is_a?(XmlPart) }
end