Class: Uniword::Mhtml::HtmlPart

Inherits:
MimePart
  • Object
show all
Defined in:
lib/uniword/mhtml/html_part.rb

Overview

HTML MIME part — the main document content in an MHTML file.

Contains the Word HTML document with embedded XML metadata (DocumentProperties, OfficeDocumentSettings, WordDocument settings, LatentStyles).

Instance Method Summary collapse

Methods inherited from MimePart

#decoded_content, #decoded_content=, #filename, #html_content?, #image_content?, #text_content?, #theme_content?, #xml_content?

Instance Method Details

#body_html ⇒ Object

Extract the inner HTML of the <body> element (an empty string when there is no <body>)



22
23
24
25
# File 'lib/uniword/mhtml/html_part.rb', line 22

# Returns the inner HTML of the parsed document's <body> element.
#
# @return [String] the markup inside <body>, or "" when the document
#   returned by #html_document has no <body> element
def body_html
  body = html_document.at_css("body")
  return "" unless body

  body.inner_html
end

#body_inner_html ⇒ Object

Get the body inner HTML



73
74
75
# File 'lib/uniword/mhtml/html_part.rb', line 73

# Returns the body's inner HTML by delegating directly to #body_html;
# the two methods always return the same value.
def body_inner_html
  body_html
end

#css_styles ⇒ Object

Extract the embedded CSS from all <style> elements, joined with newlines



28
29
30
# File 'lib/uniword/mhtml/html_part.rb', line 28

# Collects the text content of every <style> element in the parsed
# document and joins the pieces with a newline.
#
# @return [String] the concatenated stylesheet text; "" when the document
#   contains no <style> elements
def css_styles
  style_nodes = html_document.css("style")
  style_nodes.map(&:content).join("\n")
end

#document_properties_xml ⇒ Object

Extract DocumentProperties XML from HTML head comments.

Returns the <o:DocumentProperties> element as a string with namespace declarations for lutaml-model parsing.



36
37
38
39
# File 'lib/uniword/mhtml/html_part.rb', line 36

# Extracts the DocumentProperties XML by delegating to extract_office_xml
# with the name "DocumentProperties", the Office namespace URI and the
# prefix "o".
#
# @return [Object] whatever extract_office_xml returns for these arguments
#   (presumably the <o:DocumentProperties> element as a string — confirm
#   against extract_office_xml)
def document_properties_xml
  office_ns = "urn:schemas-microsoft-com:office:office"
  extract_office_xml("DocumentProperties", office_ns, "o")
end

#head_html ⇒ Object

Extract the <head> element as a string (an empty string when there is no <head>)



16
17
18
19
# File 'lib/uniword/mhtml/html_part.rb', line 16

# Returns the parsed document's <head> element serialized via #to_s.
#
# @return [String] the string form of the <head> node, or "" when the
#   document returned by #html_document has no <head> element
def head_html
  head = html_document.at_css("head")
  return "" unless head

  head.to_s
end

#html_document ⇒ Object

Parse the decoded HTML with Nokogiri



11
12
13
# File 'lib/uniword/mhtml/html_part.rb', line 11

# Parses decoded_content with Nokogiri::HTML and memoizes the result in
# @html_document, so repeated calls reuse the first parse.
#
# NOTE(review): nothing in this method invalidates the cache; if
# decoded_content is reassigned after the first call, the cached document
# would presumably be stale — confirm against the decoded_content= setter.
#
# @return [Object] the document object returned by Nokogiri::HTML
def html_document
  @html_document ||= Nokogiri::HTML(decoded_content)
end

#latent_styles_xml ⇒ Object

Extract LatentStyles XML from HTML head comments.



54
55
56
57
# File 'lib/uniword/mhtml/html_part.rb', line 54

# Extracts the LatentStyles XML by delegating to extract_office_xml with
# the name "LatentStyles", the Word namespace URI and the prefix "w".
#
# @return [Object] whatever extract_office_xml returns for these arguments
def latent_styles_xml
  word_ns = "urn:schemas-microsoft-com:office:word"
  extract_office_xml("LatentStyles", word_ns, "w")
end

#office_document_settings_xml ⇒ Object

Extract OfficeDocumentSettings XML from HTML head comments.



42
43
44
45
# File 'lib/uniword/mhtml/html_part.rb', line 42

# Extracts the OfficeDocumentSettings XML by delegating to
# extract_office_xml with the name "OfficeDocumentSettings", the Office
# namespace URI and the prefix "o".
#
# @return [Object] whatever extract_office_xml returns for these arguments
def office_document_settings_xml
  office_ns = "urn:schemas-microsoft-com:office:office"
  extract_office_xml("OfficeDocumentSettings", office_ns, "o")
end

#to_html ⇒ Object

Get the full HTML string



68
69
70
# File 'lib/uniword/mhtml/html_part.rb', line 68

# Returns the full HTML as a string by calling #to_s on the parsed
# document from #html_document.
#
# NOTE(review): the result is the parser's serialization of the document,
# which may not be byte-identical to decoded_content — confirm if exact
# round-tripping matters to callers.
def to_html
  html_document.to_s
end

#word_document_xml ⇒ Object

Extract WordDocument XML from HTML head comments.



48
49
50
51
# File 'lib/uniword/mhtml/html_part.rb', line 48

# Extracts the WordDocument XML by delegating to extract_office_xml with
# the name "WordDocument", the Word namespace URI and the prefix "w".
#
# @return [Object] whatever extract_office_xml returns for these arguments
def word_document_xml
  word_ns = "urn:schemas-microsoft-com:office:word"
  extract_office_xml("WordDocument", word_ns, "w")
end

#xml_blocks ⇒ Object

Extract all <xml> blocks from head



60
61
62
63
64
65
# File 'lib/uniword/mhtml/html_part.rb', line 60

# Extracts the contents of every <xml>...</xml> block found in comment
# nodes that are direct children of <head>.
#
# Each comment's text is scanned for all non-overlapping
# <xml>...</xml> spans (the match may cross line breaks), so a comment
# holding several blocks yields one entry per block rather than only the
# first. Surrounding whitespace is stripped from each entry.
#
# @return [Array<String>] the inner text of each block, in document order;
#   [] when the document has no <head> or no comment contains a block
def xml_blocks
  head = html_document.at_css("head")
  return [] unless head

  head.xpath("comment()").flat_map do |comment|
    # scan returns one single-capture array per match; unwrap and trim each.
    comment.text.scan(%r{<xml>(.*?)</xml>}m).map { |(inner)| inner.strip }
  end
end