Class: Uniword::StreamingParser::DocumentSaxHandler

Inherits:
Nokogiri::XML::SAX::Document
  • Object
show all
Defined in:
lib/uniword/streaming_parser.rb

Overview

SAX handler for streaming document parsing

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(document, paragraph_limit = nil, table_limit = nil) ⇒ DocumentSaxHandler

Returns a new instance of DocumentSaxHandler.



68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
# File 'lib/uniword/streaming_parser.rb', line 68

# Builds a handler with empty parse state.
#
# Only stores the arguments and resets the bookkeeping fields; no parsing
# happens here.
#
# @param document [Object] stored in +@document+ (returned by +#document+)
# @param paragraph_limit [Integer, nil] stored as-is; presumably caps how many
#   paragraphs are parsed (nil = no cap) -- enforcement is not in this method
# @param table_limit [Integer, nil] stored as-is; presumably caps how many
#   tables are parsed (nil = no cap) -- enforcement is not in this method
def initialize(document, paragraph_limit = nil, table_limit = nil)
  super()

  @document = document
  @paragraph_limit = paragraph_limit
  @table_limit = table_limit

  # Running totals start at zero.
  @paragraph_count = 0
  @table_count = 0

  # Nothing is open yet, so every "current node" slot starts out empty.
  %i[
    @current_element @current_paragraph @current_run @current_text
    @current_table @current_row @current_cell
  ].each { |slot| instance_variable_set(slot, nil) }

  @in_properties = false
  @element_stack = []
end

Instance Attribute Details

#document ⇒ Object (readonly)

Returns the value of attribute document.



66
67
68
# File 'lib/uniword/streaming_parser.rb', line 66

# Reader for the +@document+ instance variable.
#
# @return [Object] whatever is currently stored in +@document+
def document
  @document
end

Instance Method Details

#characters(string) ⇒ Object

Called when text content is encountered



134
135
136
137
138
# File 'lib/uniword/streaming_parser.rb', line 134

# SAX callback invoked when text content is encountered.
#
# The chunk is appended to +@current_text+ when one is set; otherwise the
# chunk is ignored.
#
# @param string [String] character data reported by the parser
# @return [Object, nil] result of the append, or nil when the chunk is ignored
def characters(string)
  @current_text << string if @current_text
end

#end_element(name) ⇒ Object

Called when an element ends



112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
# File 'lib/uniword/streaming_parser.rb', line 112

# Element names that wrap formatting properties rather than content.
PROPERTY_CONTAINER_ELEMENTS = %w[pPr rPr tblPr trPr tcPr].freeze

# SAX callback invoked when an element ends.
#
# Pops the element from +@element_stack+, then dispatches to the matching
# +end_*+ handler. Closing a property container recomputes +@in_properties+
# from the elements that are still open, so a nested container (for example
# +rPr+ inside +pPr+) no longer clears the flag while its parent is open.
#
# NOTE(review): names are compared without a namespace prefix. Nokogiri's
# default SAX document forwards qualified names (e.g. "w:p") to this
# callback -- confirm how names reach this handler.
#
# @param name [String] name of the element being closed
# @return [Object, nil] value of the dispatched branch; nil for other names
def end_element(name)
  @element_stack.pop

  case name
  when "p"
    end_paragraph
  when "r"
    end_run
  when "t"
    end_text
  when "tbl"
    end_table
  when "tr"
    end_table_row
  when "tc"
    end_table_cell
  when *PROPERTY_CONTAINER_ELEMENTS
    # The stack now holds only ancestors; stay "in properties" while any of
    # them is itself a property container.
    @in_properties = @element_stack.any? do |open_name|
      PROPERTY_CONTAINER_ELEMENTS.include?(open_name)
    end
  end
end

#start_element(name, attributes = []) ⇒ Object

Called when an element starts



89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
# File 'lib/uniword/streaming_parser.rb', line 89

# SAX callback invoked when an element starts.
#
# Converts the attribute pairs to a Hash, records the element on
# +@element_stack+, then dispatches on the element name. Only the text
# handler receives the attributes; opening a property container sets
# +@in_properties+.
#
# NOTE(review): names are compared without a namespace prefix. Nokogiri's
# default SAX document forwards qualified names (e.g. "w:p") to this
# callback -- confirm how names reach this handler.
#
# @param name [String] name of the element being opened
# @param attributes [Array<Array>] attribute name/value pairs
# @return [Object, nil] value of the dispatched branch; nil for other names
def start_element(name, attributes = [])
  attribute_map = attributes.to_h
  @element_stack << name

  case name
  when "p" then start_paragraph
  when "r" then start_run
  when "t" then start_text(attribute_map)
  when "tbl" then start_table
  when "tr" then start_table_row
  when "tc" then start_table_cell
  when "pPr", "rPr", "tblPr", "trPr", "tcPr" then @in_properties = true
  end
end