Module: Metanorma::Standoc::Text

Included in:
Cleanup
Defined in:
lib/metanorma/cleanup/text.rb

Constant Summary collapse

# Element names whose descendant text is exempt from quote processing
# (passed to ancestor_include? by dumb2smart_quotes and
# uninterrupt_quotes_around_xml_skip).
IGNORE_QUOTES_ELEMENTS =
  %w(pre tt sourcecode stem asciimath figure bibdata passthrough
     identifier metanorma-extension boilerplate).freeze

# Element names whose descendant text is marked as skipped when text spans
# are gathered for line-break cleanup.
PRESERVE_LINEBREAK_ELEMENTS =
  %w(pre sourcecode passthrough metanorma-extension stem).freeze

# Element names selected by linebreak_cleanup as candidate blocks.
# Each name appears exactly once ("annotation" used to be listed twice).
STRIP_LINEBREAK_ELEMENTS =
  %w(title name variant-title figure example annotation admonition
     note li th td dt dd p quote label
     preferred admitted related deprecates field-of-application
     usage-info expression pronunciation grammar-value domain
     definition termnote termexample modification description
     newcontent floating-title).freeze

# Element names recognised by ignoretext?.
IGNORE_TEXT_ELEMENTS =
  %w(index fn).freeze

Instance Method Summary collapse

Instance Method Details

#ancestor_include?(elem, ancestors) ⇒ Boolean

Returns:

  • (Boolean)


4
5
6
7
# File 'lib/metanorma/cleanup/text.rb', line 4

# Tests whether any ancestor step in +elem+'s path carries one of the
# given names.
#
# The check is purely textual: positional predicates such as "[2]" are
# removed from +elem.path+, the path is split on "/", and the first piece
# (empty, from the leading slash) and the last piece (the node itself) are
# dropped before comparing against +ancestors+.
#
# @param elem [#path] node whose +path+ is a "/"-separated string
# @param ancestors [Array<String>] names to look for
# @return [Boolean] true if at least one ancestor step matches
def ancestor_include?(elem, ancestors)
  steps = elem.path.gsub(/\[\d+\]/, "").split(%r{/})
  # A path with no steps (e.g. "/") makes the slice return nil; treat that
  # as "no ancestors" instead of raising NoMethodError.
  (steps[1..-2] || []).intersect?(ancestors)
end

#block?(elem) ⇒ Boolean

Returns:

  • (Boolean)


132
133
134
135
136
137
138
139
140
# File 'lib/metanorma/cleanup/text.rb', line 132

# Reports whether +elem+ is one of the element kinds treated as a block
# boundary (dumb2smart_quotes resets its preceding-text context on these).
#
# @param elem [#name] node to classify
# @return [Boolean] true if +elem.name+ is in the block list
def block?(elem)
  block_names = %w(
    title name variant-title clause figure annex example introduction
    foreword acknowledgements executivesummary note li th td dt dd p
    quote label abstract preferred admitted related deprecates
    field-of-application usage-info expression pronunciation
    grammar-value domain definition termnote termexample modification
    description newcontent floating-title tab annotation admonition
    callout-annotation
  )
  block_names.include?(elem.name)
end

#dumb2smart_quotes(xmldoc) ⇒ Object



146
147
148
149
150
151
152
153
154
155
156
# File 'lib/metanorma/cleanup/text.rb', line 146

# Walks every node of +xmldoc+ and hands each eligible text node to
# dumb2smart_quotes1, together with the text that preceded it.
#
# The preceding-text context is reset to "" whenever a block element is
# visited, and set to the placeholder "dummy" when an eref/xref/termref/link
# element is visited. Text nodes that have an ancestor listed in
# IGNORE_QUOTES_ELEMENTS are skipped and do not update the context.
#
# @param xmldoc [#traverse] document or node to walk
def dumb2smart_quotes(xmldoc)
  preceding = ""
  xmldoc.traverse do |node|
    preceding = "" if block?(node)
    preceding = "dummy" if empty_tag_with_text_content?(node)
    next unless node.text?
    next if ancestor_include?(node, IGNORE_QUOTES_ELEMENTS)

    dumb2smart_quotes1(node, preceding)
    preceding = node.text
  end
end

#dumb2smart_quotes1(curr, prev) ⇒ Object



158
159
160
161
162
163
164
# File 'lib/metanorma/cleanup/text.rb', line 158

# Smart-formats a single text node.
#
# Nothing happens unless the text contains one of - ' " ( < > or "..", or a
# digit followed by "x". When the text opens with a straight quote and the
# preceding text ends in a non-space character, the leading double quote
# and then a leading single quote are removed first. The node is then
# replaced by Metanorma::Utils::smartformat applied to its text.
#
# NOTE(review): both substitutions below replace with an empty string —
# confirm this is intended and not a lost closing-quote character.
#
# @param curr [#text, #content=, #replace] text node to rewrite
# @param prev [String] text that preceded +curr+
def dumb2smart_quotes1(curr, prev)
  return unless /[-'"(<>]|\.\.|\dx/.match?(curr.text)

  if /\A["']/.match?(curr.text) && prev.match?(/\S\Z/)
    curr.content = curr.text.sub(/\A"/, "").sub(/\A'/, "")
  end
  curr.replace(Metanorma::Utils::smartformat(curr.text))
end

#dumbquote_cleanup(xmldoc) ⇒ Object



166
167
168
169
170
171
172
173
174
175
176
# File 'lib/metanorma/cleanup/text.rb', line 166

# Turns U+2019 back into a straight apostrophe wherever it sits between an
# alphanumeric character and an alphabetic character.
#
# Only text nodes containing U+2019 are touched. Their text is decoded with
# @c, rewritten, re-encoded with the :basic and :hexadecimal options, and
# the node is replaced by the result.
# NOTE(review): @c is presumably an HTML entities coder — confirm.
#
# @param xmldoc [#traverse] document or node to walk
def dumbquote_cleanup(xmldoc)
  xmldoc.traverse do |node|
    next unless node.text?
    next unless node.text.include?("\u2019")

    decoded = @c.decode(node.text)
    straightened = decoded.gsub(/(?<=\p{Alnum})\u2019(?=\p{Alpha})/, "'")
    node.replace(@c.encode(straightened, :basic, :hexadecimal))
  end
end

#empty_tag_with_text_content?(elem) ⇒ Boolean

Returns:

  • (Boolean)


142
143
144
# File 'lib/metanorma/cleanup/text.rb', line 142

# Reports whether +elem+ is an eref, xref, termref or link element.
#
# @param elem [#name] node to classify
# @return [Boolean]
def empty_tag_with_text_content?(elem)
  case elem.name
  when "eref", "xref", "termref", "link" then true
  else false
  end
end

#gather_text_for_linebreak_cleanup(block) ⇒ Object



42
43
44
45
46
47
48
49
50
51
52
53
# File 'lib/metanorma/cleanup/text.rb', line 42

# Collects the spans of +block+ (via gather_text_for_linebreak_cleanup1)
# and decides which of them line-break cleanup should skip.
#
# A span not already marked :skip becomes skipped when its text has no
# newline, when the span after it lies inside a stem, or when its node is
# not a text node. The final span is flagged with :last.
#
# @param block [Object] block element passed on to the gathering helper
# @return [Array<Hash>] the gathered span hashes, updated in place
def gather_text_for_linebreak_cleanup(block)
  spans = gather_text_for_linebreak_cleanup1(block)
  return spans if spans.empty?

  spans.each_with_index do |span, idx|
    next if span[:skip]

    span[:skip] = !span[:text].include?("\n") ||
      # a stem coming next means this span's line break is not meaningful
      spans[idx + 1]&.dig(:stem) ||
      !span[:elem].text?
  end
  spans.last[:last] = true
  spans
end

#gather_text_for_linebreak_cleanup1(block) ⇒ Object



55
56
57
58
59
60
61
62
63
# File 'lib/metanorma/cleanup/text.rb', line 55

# Builds one span hash per text node, and per eref/xref/termref/link
# element without text children, found under +block+.
#
# Each hash holds the node (:elem), its text (:text), whether it sits
# inside a stem (:stem), and an initial :skip flag that is true when the
# node has an ancestor listed in PRESERVE_LINEBREAK_ELEMENTS.
#
# @param block [#xpath] block element to search
# @return [Array<Hash>] span hashes in the order the query returns them
def gather_text_for_linebreak_cleanup1(block)
  selector = ".//text() | .//eref[not(text())] |  " \
             ".//xref[not(text())] | .//termref[not(text())] | " \
             ".//link[not(text())] "
  block.xpath(selector).map do |node|
    content = node.text
    in_stem = ancestor_include?(node, %w(stem))
    preserved = ancestor_include?(node, PRESERVE_LINEBREAK_ELEMENTS)
    { elem: node, text: content, stem: in_stem, skip: preserved }
  end
end

#ignoretext?(elem) ⇒ Boolean

Returns:

  • (Boolean)


128
129
130
# File 'lib/metanorma/cleanup/text.rb', line 128

# Reports whether +elem+'s name is listed in IGNORE_TEXT_ELEMENTS.
#
# @param elem [#name] node to classify
# @return [Boolean]
def ignoretext?(elem)
  tag = elem.name
  IGNORE_TEXT_ELEMENTS.include?(tag)
end

#linebreak_cleanup(xmldoc) ⇒ Object

Processes example/p and example/sourcecode rather than example on its own: the purpose is to strip line breaks in blocks that contain inline elements and text.



11
12
13
14
15
16
17
18
# File 'lib/metanorma/cleanup/text.rb', line 11

# Runs line-break cleanup over the innermost strippable blocks of +xmldoc+.
#
# Every element named in STRIP_LINEBREAK_ELEMENTS is a candidate; a
# candidate that itself contains another such element is passed over, so
# that e.g. the p inside an example is handled instead of the example.
#
# @param xmldoc [#xpath] document to clean up
def linebreak_cleanup(xmldoc)
  # Both expressions depend only on the constant, so build each one once
  # rather than once per matched block.
  anywhere = STRIP_LINEBREAK_ELEMENTS.map { |e| "//#{e}" }.join(" | ")
  nested = STRIP_LINEBREAK_ELEMENTS.map { |e| ".//#{e}" }.join(" | ")
  xmldoc.xpath(anywhere).each do |b|
    next unless b.xpath(nested).empty?

    linebreak_cleanup_block(gather_text_for_linebreak_cleanup(b))
  end
end

#linebreak_cleanup_block(block) ⇒ Object



20
21
22
23
24
25
26
27
28
29
30
# File 'lib/metanorma/cleanup/text.rb', line 20

# Rewrites every non-skipped span in +block+ with its line breaks cleaned.
#
# For each span the lines from lines_strip_textspan are passed through
# Metanorma::Utils.line_sanitise. Unless the span is the last one, the
# final entry of the result is dropped. Trailing whitespace on the
# remaining final entry is removed unless the original text itself ended
# in whitespace. The joined result is encoded with @c and replaces the
# span's node.
#
# @param block [Array<Hash>] span hashes from
#   gather_text_for_linebreak_cleanup
def linebreak_cleanup_block(block)
  block.each_with_index do |span, idx|
    next if span[:skip]

    following = block[idx + 1]
    cleaned = Metanorma::Utils.line_sanitise(
      lines_strip_textspan(span, following),
    )
    cleaned.pop unless span[:last]
    cleaned[-1].rstrip! unless /\s$/.match?(span[:text][-1])
    # encode so that text containing &lt; does not accidentally become
    # XML tags when it is inserted back into the document
    span[:elem].replace(@c.encode(cleaned.join, :hexadecimal, :basic))
  end
end

#lines_strip_textspan(span, nextspan) ⇒ Object



32
33
34
35
36
37
38
39
40
# File 'lib/metanorma/cleanup/text.rb', line 32

# Splits a span's text into lines ready for sanitising.
#
# All lines but the final one are right-stripped. The final line only
# loses a trailing newline, since its trailing space may be what joins it
# to the next line. Unless the span is flagged :last, the first line of
# the following span is appended as context.
#
# @param span [Hash] span hash with :text and optionally :last
# @param nextspan [Hash, nil] the span after +span+; its :text is read
#   only when +span+ is not flagged :last
# @return [Array<String, nil>] the prepared lines
def lines_strip_textspan(span, nextspan)
  text = span[:text]
  prepared =
    if text
      *leading, final = text.lines
      leading.map(&:rstrip) << final&.sub(/\n$/, "")
    else
      []
    end
  # next token context
  prepared << nextspan[:text].lines.first unless span[:last]
  prepared
end

#smartquotes_cleanup(xmldoc) ⇒ Object



65
66
67
68
69
70
71
72
73
# File 'lib/metanorma/cleanup/text.rb', line 65

# Entry point for quote cleanup on +xmldoc+.
#
# Every date element is first passed to Metanorma::Utils::endash_date.
# Then smartquotes_cleanup1 runs when @smartquotes is set, and
# dumbquote_cleanup otherwise. Finally each passthrough element with
# formats = 'straightquotes' is replaced by its children.
#
# @param xmldoc [#xpath] document to clean up
def smartquotes_cleanup(xmldoc)
  xmldoc.xpath("//date").each do |date|
    Metanorma::Utils::endash_date(date)
  end

  if @smartquotes
    smartquotes_cleanup1(xmldoc)
  else
    dumbquote_cleanup(xmldoc)
  end

  wrappers = xmldoc.xpath("//passthrough[@formats = 'straightquotes']")
  wrappers.each { |wrapper| wrapper.replace(wrapper.children) }
end

#smartquotes_cleanup1(xmldoc) ⇒ Object



75
76
77
78
79
# File 'lib/metanorma/cleanup/text.rb', line 75

# Smart-quote pass over +xmldoc+: first moves quotes that were separated
# from their text by an intervening element, then converts straight quotes
# in the remaining text nodes.
def smartquotes_cleanup1(xmldoc)
  # prevent normalising boilerplate text twice
  # NOTE(review): nothing in this method excludes boilerplate itself;
  # presumably that relies on "boilerplate" being listed in
  # IGNORE_QUOTES_ELEMENTS — confirm.
  uninterrupt_quotes_around_xml(xmldoc)
  dumb2smart_quotes(xmldoc)
end

#uninterrupt_quotes_around_xml(xmldoc) ⇒ Object

“abc<tag/>”, def => “abc”,<tag/> def



82
83
84
85
86
87
88
89
# File 'lib/metanorma/cleanup/text.rb', line 82

# “abc<tag/>”, def => “abc”,<tag/> def
#
# Visits every text node of +xmldoc+ whose previous sibling is an element
# and, unless uninterrupt_quotes_around_xml_skip says otherwise, passes
# that sibling element to uninterrupt_quotes_around_xml1.
#
# @param xmldoc [#traverse] document or node to walk
def uninterrupt_quotes_around_xml(xmldoc)
  xmldoc.traverse do |node|
    next unless node.text?
    next unless node.previous&.element?
    next if uninterrupt_quotes_around_xml_skip(node)

    uninterrupt_quotes_around_xml1(node.previous)
  end
end

#uninterrupt_quotes_around_xml1(elem) ⇒ Object

“abc<tag/>”, def => “abc”,<tag/> def



115
116
117
118
119
120
121
122
123
124
# File 'lib/metanorma/cleanup/text.rb', line 115

# “abc<tag/>”, def => “abc”,<tag/> def
#
# Looks up the text node before +elem+ and the one after it. When the
# earlier text ends in a non-space character and the later text starts
# with a straight quote (plus any punctuation) followed by whitespace or
# the end of the text, that quote-and-punctuation run is moved from the
# start of the later text to the end of the earlier text.
#
# Nothing is changed when there is no earlier text, or when the earlier
# text ends in a double quote and the later text starts with one.
# NOTE(review): that second check uses the line anchors $ and ^, so it is
# evaluated per line of the text — confirm that is intended.
#
# @param elem [#at] element sitting between the two text nodes
def uninterrupt_quotes_around_xml1(elem)
  before = elem.at(".//preceding::text()[1]")
  return unless before
  return unless /\S\Z/.match?(before.text)

  after = elem.at(".//following::text()[1]")
  # "<tag/>"
  return if /"$/.match?(before.text) && /^"/.match?(after&.text)

  lead = /\A(["'][[:punct:]]*)(\s|\Z)/.match(@c.decode(after&.text))
  return unless lead

  after.content = after.text.sub(/\A(["'][[:punct:]]*)/, "")
  before.content = "#{before.text}#{lead[1]}"
end

#uninterrupt_quotes_around_xml_skip(elem) ⇒ Object



106
107
108
109
110
111
112
# File 'lib/metanorma/cleanup/text.rb', line 106

# Decides whether uninterrupt_quotes_around_xml should leave the text node
# +elem+ alone.
#
# The node is processed (false is returned) only when its text starts with
# a straight quote, its previous sibling has no ancestor listed in
# IGNORE_QUOTES_ELEMENTS, and that sibling either has blank text without
# being an eref/xref/termref/link, or is an element accepted by ignoretext?.
#
# @param elem [#text, #previous] text node under consideration
# @return [Boolean] true to skip the node
def uninterrupt_quotes_around_xml_skip(elem)
  return true unless /\A['"]/.match?(elem.text)
  return true if ancestor_include?(elem.previous, IGNORE_QUOTES_ELEMENTS)

  blank_plain_tag = elem.previous.text.strip.empty? &&
    !empty_tag_with_text_content?(elem.previous)
  return false if blank_plain_tag

  !ignoretext?(elem.previous)
end