Module: Metanorma::Standoc::Text
- Included in:
- Cleanup
- Defined in:
- lib/metanorma/cleanup/text.rb
Constant Summary collapse
- IGNORE_QUOTES_ELEMENTS =
%w(pre tt sourcecode stem asciimath figure bibdata passthrough identifier metanorma-extension boilerplate).freeze
- PRESERVE_LINEBREAK_ELEMENTS =
%w(pre sourcecode passthrough metanorma-extension stem).freeze
- STRIP_LINEBREAK_ELEMENTS =
%w(title name variant-title figure example annotation admonition note li th td dt dd p quote label annotation preferred admitted related deprecates field-of-application usage-info expression pronunciation grammar-value domain definition termnote termexample modification description newcontent floating-title).freeze
- IGNORE_TEXT_ELEMENTS =
%w(index fn).freeze
Instance Method Summary collapse
- #ancestor_include?(elem, ancestors) ⇒ Boolean
- #block?(elem) ⇒ Boolean
- #dumb2smart_quotes(xmldoc) ⇒ Object
- #dumb2smart_quotes1(curr, prev) ⇒ Object
- #dumbquote_cleanup(xmldoc) ⇒ Object
- #empty_tag_with_text_content?(elem) ⇒ Boolean
- #gather_text_for_linebreak_cleanup(block) ⇒ Object
- #gather_text_for_linebreak_cleanup1(block) ⇒ Object
- #ignoretext?(elem) ⇒ Boolean
-
#linebreak_cleanup(xmldoc) ⇒ Object
process example/p, example/sourcecode, not example on its own: this is about stripping lines for blocks containing inline elems & text.
- #linebreak_cleanup_block(block) ⇒ Object
- #lines_strip_textspan(span, nextspan) ⇒ Object
- #smartquotes_cleanup(xmldoc) ⇒ Object
- #smartquotes_cleanup1(xmldoc) ⇒ Object
-
#uninterrupt_quotes_around_xml(xmldoc) ⇒ Object
“abc<tag/>”, def => “abc”,<tag/> def.
-
#uninterrupt_quotes_around_xml1(elem) ⇒ Object
“abc<tag/>”, def => “abc”,<tag/> def.
- #uninterrupt_quotes_around_xml_skip(elem) ⇒ Object
Instance Method Details
#ancestor_include?(elem, ancestors) ⇒ Boolean
4 5 6 7 |
# File 'lib/metanorma/cleanup/text.rb', line 4
# Reports whether any ancestor of +elem+ carries one of the given names.
#
# Ancestor names are derived from +elem.path+: positional predicates such
# as "[2]" are removed, the path is split on "/", and both the leading empty
# segment and the node's own (final) segment are discarded.
#
# @param elem [#path] node whose path string is inspected
# @param ancestors [Array<String>] element names to look for
# @return [Boolean] true if at least one ancestor name is in +ancestors+
def ancestor_include?(elem, ancestors)
  segments = elem.path.gsub(/\[\d+\]/, "").split(%r{/})
  # A path with no segments at all (e.g. "/") makes the slice nil rather
  # than []; treat that as "no ancestors" instead of raising NoMethodError.
  (segments[1..-2] || []).intersect?(ancestors)
end
#block?(elem) ⇒ Boolean
132 133 134 135 136 137 138 139 140 |
# File 'lib/metanorma/cleanup/text.rb', line 132
# Tells whether +elem+ is named after one of the elements this module
# treats as a block.
#
# @param elem [#name] node whose name is checked
# @return [Boolean] true if +elem.name+ is in the list below
def block?(elem)
  block_names = %w(
    title name variant-title clause figure annex example introduction
    foreword acknowledgements executivesummary note li th td dt dd p quote
    label abstract preferred admitted related deprecates
    field-of-application usage-info expression pronunciation grammar-value
    domain definition termnote termexample modification description
    newcontent floating-title tab annotation admonition callout-annotation
  )
  block_names.include?(elem.name)
end
#dumb2smart_quotes(xmldoc) ⇒ Object
146 147 148 149 150 151 152 153 154 155 156 |
# File 'lib/metanorma/cleanup/text.rb', line 146
# Walks every node of +xmldoc+ and hands each eligible text node to
# #dumb2smart_quotes1, together with the text that preceded it.
#
# The "preceding text" is reset to "" whenever a block element is visited,
# and set to the placeholder "dummy" whenever an eref/xref/termref/link
# element is visited. Text nodes inside IGNORE_QUOTES_ELEMENTS are skipped
# and do not update the preceding text.
#
# @param xmldoc [#traverse] document (or subtree) to process
def dumb2smart_quotes(xmldoc)
  preceding = ""
  xmldoc.traverse do |node|
    preceding = "" if block?(node)
    preceding = "dummy" if empty_tag_with_text_content?(node)
    next unless node.text?
    next if ancestor_include?(node, IGNORE_QUOTES_ELEMENTS)

    dumb2smart_quotes1(node, preceding)
    preceding = node.text
  end
end
#dumb2smart_quotes1(curr, prev) ⇒ Object
158 159 160 161 162 163 164 |
# File 'lib/metanorma/cleanup/text.rb', line 158
# Smart-formats a single text node.
#
# Nothing happens unless the text contains one of - ' " ( < > or ".." or a
# digit followed by "x". When the text starts with a straight quote and the
# preceding text ends in a non-space character, that leading quote is
# rewritten first; the node is then replaced with the output of
# Metanorma::Utils.smartformat.
#
# NOTE(review): a leading " becomes U+201D (closing) but a leading '
# becomes U+2018 (opening) -- confirm the asymmetry is intended.
#
# @param curr [#text, #content=, #replace] text node to rewrite
# @param prev [String] text seen immediately before +curr+
def dumb2smart_quotes1(curr, prev)
  return unless /[-'"(<>]|\.\.|\dx/.match?(curr.text)

  if /\A["']/.match?(curr.text) && prev.match?(/\S\Z/)
    curr.content = curr.text.sub(/\A"/, "”").sub(/\A'/, "‘")
  end
  curr.replace(Metanorma::Utils::smartformat(curr.text))
end
#dumbquote_cleanup(xmldoc) ⇒ Object
166 167 168 169 170 171 172 173 174 175 176 |
# File 'lib/metanorma/cleanup/text.rb', line 166
# Turns U+2019 back into a straight apostrophe wherever it sits between an
# alphanumeric character and a following letter.
#
# Only text nodes containing U+2019 are touched. Each one is decoded with
# @c, rewritten, re-encoded with @c (:basic, :hexadecimal) and substituted
# for the original node.
# NOTE(review): @c is presumably an HTMLEntities coder set up elsewhere --
# confirm.
#
# @param xmldoc [#traverse] document (or subtree) to process
def dumbquote_cleanup(xmldoc)
  xmldoc.traverse do |node|
    next unless node.text?
    next unless node.text.include?("\u2019")

    decoded = @c.decode(node.text)
    straightened = decoded.gsub(/(?<=\p{Alnum})\u2019(?=\p{Alpha})/, "'")
    node.replace(@c.encode(straightened, :basic, :hexadecimal))
  end
end
#empty_tag_with_text_content?(elem) ⇒ Boolean
142 143 144 |
# File 'lib/metanorma/cleanup/text.rb', line 142
# Tells whether +elem+ is an eref, xref, termref or link element.
#
# @param elem [#name] node whose name is checked
# @return [Boolean]
def empty_tag_with_text_content?(elem)
  case elem.name
  when "eref", "xref", "termref", "link" then true
  else false
  end
end
#gather_text_for_linebreak_cleanup(block) ⇒ Object
42 43 44 45 46 47 48 49 50 51 52 53 |
# File 'lib/metanorma/cleanup/text.rb', line 42
# Collects the spans of +block+ (via #gather_text_for_linebreak_cleanup1)
# and marks which of them linebreak cleanup should leave alone.
#
# A span keeps an already-truthy :skip; otherwise it is skipped when its
# text has no newline, when the span after it is inside stem (linebreaks
# before stem are not treated as meaningful), or when its node is not a
# text node. The final span is flagged with last: true.
#
# @param block [Object] passed through to #gather_text_for_linebreak_cleanup1
# @return [Array<Hash>] spans with :skip settled and :last set on the final one
def gather_text_for_linebreak_cleanup(block)
  spans = gather_text_for_linebreak_cleanup1(block)
  return spans if spans.empty?

  spans.each_with_index do |span, idx|
    following = spans[idx + 1]
    span[:skip] ||= !span[:text].include?("\n") ||
      following&.dig(:stem) ||
      !span[:elem].text?
  end
  spans.last[:last] = true
  spans
end
#gather_text_for_linebreak_cleanup1(block) ⇒ Object
55 56 57 58 59 60 61 62 63 |
# File 'lib/metanorma/cleanup/text.rb', line 55
# Lists, in the order returned by the XPath query, every text node under
# +block+ plus every eref/xref/termref/link element that has no text child,
# and describes each as a span hash.
#
# @param block [#xpath] element whose descendants are gathered
# @return [Array<Hash>] one hash per node with keys
#   :elem (the node), :text (its text), :stem (inside a stem element?) and
#   :skip (inside one of PRESERVE_LINEBREAK_ELEMENTS?)
def gather_text_for_linebreak_cleanup1(block)
  query = ".//text() | .//eref[not(text())] | " \
          ".//xref[not(text())] | .//termref[not(text())] | " \
          ".//link[not(text())] "
  block.xpath(query).map do |node|
    in_stem = ancestor_include?(node, %w(stem))
    preserved = ancestor_include?(node, PRESERVE_LINEBREAK_ELEMENTS)
    { elem: node, text: node.text, stem: in_stem, skip: preserved }
  end
end
#ignoretext?(elem) ⇒ Boolean
128 129 130 |
# File 'lib/metanorma/cleanup/text.rb', line 128
# Tells whether +elem+ is named in IGNORE_TEXT_ELEMENTS.
#
# @param elem [#name] node whose name is checked
# @return [Boolean]
def ignoretext?(elem)
  tag_name = elem.name
  IGNORE_TEXT_ELEMENTS.any? { |ignored| ignored == tag_name }
end
#linebreak_cleanup(xmldoc) ⇒ Object
process example/p, example/sourcecode, not example on its own: this is about stripping lines for blocks containing inline elems & text
11 12 13 14 15 16 17 18 |
# File 'lib/metanorma/cleanup/text.rb', line 11
# Runs linebreak cleanup on every innermost STRIP_LINEBREAK_ELEMENTS
# element of +xmldoc+.
#
# An element that itself contains another STRIP_LINEBREAK_ELEMENTS element
# is passed over: e.g. example/p is processed, not the enclosing example.
# The cleanup concerns blocks made of inline elements and text.
#
# @param xmldoc [#xpath] document to process
def linebreak_cleanup(xmldoc)
  anywhere = STRIP_LINEBREAK_ELEMENTS.map { |tag| "//#{tag}" }.join(" | ")
  nested = STRIP_LINEBREAK_ELEMENTS.map { |tag| ".//#{tag}" }.join(" | ")
  xmldoc.xpath(anywhere).each do |candidate|
    # only innermost candidates are cleaned
    next unless candidate.xpath(nested).empty?

    linebreak_cleanup_block(gather_text_for_linebreak_cleanup(candidate))
  end
end
#linebreak_cleanup_block(block) ⇒ Object
20 21 22 23 24 25 26 27 28 29 30 |
# File 'lib/metanorma/cleanup/text.rb', line 20
# Rewrites the text of every non-skipped span in +block+ with its line
# breaks resolved.
#
# For each span, the lines produced by #lines_strip_textspan are passed
# through Metanorma::Utils.line_sanitise (defined elsewhere). Unless the
# span is the last one, the final entry -- the context line borrowed from
# the next span -- is dropped again. Trailing whitespace on the result is
# removed unless the original text itself ended in whitespace.
#
# @param block [Array<Hash>] spans as built by
#   #gather_text_for_linebreak_cleanup
def linebreak_cleanup_block(block)
  block.each_with_index do |span, idx|
    next if span[:skip]

    raw_lines = lines_strip_textspan(span, block[idx + 1])
    merged = Metanorma::Utils.line_sanitise(raw_lines)
    merged.pop unless span[:last]
    merged[-1].rstrip! unless /\s$/.match?(span[:text][-1])
    # encode before re-inserting, so that text containing "<" is not
    # accidentally read back into the document as XML tags
    span[:elem].replace(@c.encode(merged.join, :hexadecimal, :basic))
  end
end
#lines_strip_textspan(span, nextspan) ⇒ Object
32 33 34 35 36 37 38 39 40 |
# File 'lib/metanorma/cleanup/text.rb', line 32
# Splits the text of +span+ into lines ready for joining.
#
# Every line but the last is right-stripped. The last line only loses a
# trailing newline, because a trailing space there may be what links it to
# the following span. Unless +span+ is flagged :last, the first line of
# +nextspan+ is appended as context for the next token.
#
# @param span [Hash] span with :text and optionally :last
# @param nextspan [Hash, nil] following span; read only when +span+ is not
#   flagged :last
# @return [Array<String, nil>] the prepared lines
def lines_strip_textspan(span, nextspan)
  prepared = []
  if span[:text]
    *leading, final = span[:text].lines
    prepared = leading.map(&:rstrip)
    prepared << final&.sub(/\n$/, "")
  end
  prepared << nextspan[:text].lines.first unless span[:last]
  prepared
end
#smartquotes_cleanup(xmldoc) ⇒ Object
65 66 67 68 69 70 71 72 73 |
# File 'lib/metanorma/cleanup/text.rb', line 65
# Entry point for quote cleanup on +xmldoc+.
#
# 1. Every date element is handed to Metanorma::Utils.endash_date.
# 2. Depending on @smartquotes, either #smartquotes_cleanup1 or
#    #dumbquote_cleanup is run.
# 3. Every passthrough element with formats="straightquotes" is replaced
#    by its own children.
#
# @param xmldoc [#xpath] document to process
def smartquotes_cleanup(xmldoc)
  xmldoc.xpath("//date").each do |date|
    Metanorma::Utils::endash_date(date)
  end
  @smartquotes ? smartquotes_cleanup1(xmldoc) : dumbquote_cleanup(xmldoc)
  wrappers = xmldoc.xpath("//passthrough[@formats = 'straightquotes']")
  wrappers.each { |wrapper| wrapper.replace(wrapper.children) }
end
#smartquotes_cleanup1(xmldoc) ⇒ Object
75 76 77 78 79 |
# File 'lib/metanorma/cleanup/text.rb', line 75
# Smart-quote pass: first moves quotes that were separated from their text
# by an XML element, then converts straight quotes to smart quotes.
#
# The original source carries the remark "prevent normalising boilerplate
# text twice" here.
# NOTE(review): nothing in this method shows how that is enforced --
# confirm where it happens.
#
# @param xmldoc [Object] document handed to both passes
def smartquotes_cleanup1(xmldoc)
  uninterrupt_quotes_around_xml(xmldoc)

  dumb2smart_quotes(xmldoc)
end
#uninterrupt_quotes_around_xml(xmldoc) ⇒ Object
“abc<tag/>”, def => “abc”,<tag/> def
82 83 84 85 86 87 88 89 |
# File 'lib/metanorma/cleanup/text.rb', line 82
# Finds text nodes that directly follow an element and, unless
# #uninterrupt_quotes_around_xml_skip rules them out, lets
# #uninterrupt_quotes_around_xml1 rejoin the quote around that element:
#   "abc<tag/>", def  =>  "abc",<tag/> def
#
# @param xmldoc [#traverse] document (or subtree) to process
def uninterrupt_quotes_around_xml(xmldoc)
  xmldoc.traverse do |node|
    next unless node.text?

    before = node.previous
    next unless before&.element?
    next if uninterrupt_quotes_around_xml_skip(node)

    uninterrupt_quotes_around_xml1(node.previous)
  end
end
#uninterrupt_quotes_around_xml1(elem) ⇒ Object
“abc<tag/>”, def => “abc”,<tag/> def
115 116 117 118 119 120 121 122 123 124 |
# File 'lib/metanorma/cleanup/text.rb', line 115
# Moves a quote (plus any punctuation right after it) from the text that
# follows +elem+ to the end of the text that precedes it:
#   "abc<tag/>", def  =>  "abc",<tag/> def
#
# Nothing is done when there is no preceding text, when the preceding text
# does not end in a non-space character, when the element is itself wrapped
# in straight double quotes ("<tag/>"), or when the following text (decoded
# with @c) does not begin with a quote run followed by whitespace or the
# end of the text.
#
# @param elem [#at] element sitting between the two text nodes
def uninterrupt_quotes_around_xml1(elem)
  prev = elem.at(".//preceding::text()[1]")
  return unless prev
  return unless /\S\Z/.match?(prev.text)

  foll = elem.at(".//following::text()[1]")
  # "<tag/>": the quotes enclose the element itself, leave them in place
  return if /"$/.match?(prev.text) && /^"/.match?(foll&.text)

  m = /\A(["'][[:punct:]]*)(\s|\Z)/.match(@c.decode(foll&.text))
  return unless m

  foll.content = foll.text.sub(/\A(["'][[:punct:]]*)/, "")
  prev.content = "#{prev.text}#{m[1]}"
end
#uninterrupt_quotes_around_xml_skip(elem) ⇒ Object
106 107 108 109 110 111 112 |
# File 'lib/metanorma/cleanup/text.rb', line 106
# Decides whether the text node +elem+ should be left out of the
# quote-rejoining step.
#
# The node is processed (false is returned) only when all of these hold:
# its text starts with a straight quote; the element before it is not
# inside IGNORE_QUOTES_ELEMENTS; and that element is either named in
# IGNORE_TEXT_ELEMENTS, or has blank text while not being an
# eref/xref/termref/link.
#
# @param elem [#text, #previous] text node that follows an element
# @return [Boolean] true when the node must be skipped
def uninterrupt_quotes_around_xml_skip(elem)
  return true unless /\A['"]/.match?(elem.text)

  prior = elem.previous
  return true if ancestor_include?(prior, IGNORE_QUOTES_ELEMENTS)

  blank_plain_tag = prior.text.strip.empty? &&
    !empty_tag_with_text_content?(prior)
  !(blank_plain_tag || ignoretext?(prior))
end