Class: Coradoc::Input::Html::Cleaner

Inherits:
Object
  • Object
show all
Defined in:
lib/coradoc/html/input/cleaner.rb

Constant Summary collapse

INNER_WHITESPACE_REGEX_1 =

Pre-compiled regexes for performance

/\n stem:\[/
# Matches a `stem:[...]` expression (capture group 1) that is followed by a
# newline and then a non-whitespace character.
INNER_WHITESPACE_REGEX_2 =
/(stem:\[([^\]]|\\\])*\])\n(?=\S)/
# Matches a `stem:[...]` expression (capture group 1) that is followed by
# whitespace and then a `^` or `-` character.
INNER_WHITESPACE_REGEX_3 =
/(stem:\[([^\]]|\\\])*\])\s+(?=[\^-])/
# Three or more consecutive newlines.
NEWLINES_REGEX =
/\n{3,}/
# One or more newlines at the very start of the string.
LEADING_NEWLINE_REGEX =
/\A\n+/
# A run of ASCII whitespace (space, tab, carriage return, newline).
WHITESPACE_REGEX =
/[ \t\r\n]+/
# A run of ASCII whitespace at the very end of the string.
TRAILING_WHITESPACE_REGEX =
/[ \t\r\n]+\z/
# Two or more consecutive spaces and/or tabs.
MULTIPLE_WHITESPACE_REGEX =
/[ \t]{2,}/
# Border patterns keyed by marker type. Each of the first three matches an
# optional whitespace character followed by two or more marker characters;
# `bracket` matches a bracketed span (non-greedy) with optional whitespace on
# either side.
# NOTE(review): no method visible in this listing references this hash —
# confirm where it is used.
TAG_BORDER_REGEXES =
{
  asterisk: /\s?\*{2,}/,
  underscore: /\s?_{2,}/,
  tilde: /\s?~{2,}/,
  bracket: /\s?\[.*?\]\s?/
}.freeze

Instance Method Summary collapse

Instance Method Details

#clean_headings(string) ⇒ Object

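Cleans up heading markup (the in-code remarks attribute it to Libre Office): empty headings are replaced with a space, and headings styled with `vertical-align: super` are turned into `<sup>` elements. Modifies the given string in place.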



129
130
131
132
133
134
135
136
137
138
# File 'lib/coradoc/html/input/cleaner.rb', line 129

# Cleans up heading markup in +string+, modifying it in place.
#
# Two rewrites are applied, in order:
# 1. a heading element (h1-h9) with no content becomes a single space;
# 2. a heading whose attributes contain ` style="vertical-align: super;`
#    becomes a `<sup>` element wrapping the heading's content.
# The original in-code remarks attribute both patterns to Libre Office output.
#
# @param string [String] HTML to clean; mutated by this method
# @return [String] the same (mutated) string object
def clean_headings(string)
  empty_heading = %r{<h([1-9])[^>]*></h\1>}
  superscript_heading = %r{<h([1-9])[^>]* style="vertical-align: super;[^>]*>(.+?)</h\1>}

  string.tap do |html|
    html.gsub!(empty_heading, ' ')
    # Group 2 is the heading's inner content.
    html.gsub!(superscript_heading, '<sup>\\2</sup>')
  end
end

#clean_punctuation_characters(string) ⇒ Object



110
111
112
# File 'lib/coradoc/html/input/cleaner.rb', line 110

# Drops the single whitespace character that separates a `**`, `~~` or `__`
# marker from an immediately following `.`, `!`, `?`, `'` or `"`.
#
# @param string [String] text to clean (not modified)
# @return [String] a new string with the separating whitespace removed
def clean_punctuation_characters(string)
  marker_then_punctuation = /(\*\*|~~|__)\s([.!?'"])/

  string.gsub(marker_then_punctuation) do
    marker, punctuation = Regexp.last_match.captures
    "#{marker}#{punctuation}"
  end
end

#clean_tag_borders(string) ⇒ Object

Finds content that is enclosed by two or more tildes (`~~`) or by square brackets, and ensures that only one whitespace character occurs in the border area. The equivalent handling for asterisks and underscores is currently commented out in the method body.



81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
# File 'lib/coradoc/html/input/cleaner.rb', line 81

# Tidies the whitespace at the borders of `~~...~~` spans and `[...]` spans.
#
# Each matched span (including one optional whitespace character on either
# side) is stripped, the space just inside the opening/closing marker is
# removed, and the result is passed through +preserve_border_whitespaces+
# (defined elsewhere in this class), which receives the raw match.
#
# NOTE: the equivalent passes for `**` runs (/\s?\*{2,}.*?\*{2,}\s?/) and
# `__` runs (/\s?_{2,}.*?_{2,}\s?/) are currently disabled; only tilde and
# bracket borders are processed here.
#
# @param string [String] text to clean (not modified)
# @return [String] a new string with cleaned borders
def clean_tag_borders(string)
  tilde_cleaned = string.gsub(/\s?~{2,}.*?~{2,}\s?/) do |span|
    preserve_border_whitespaces(span, default_border: Coradoc::Html::Input.config.tag_border) do
      span.strip.sub('~~ ', '~~').sub(' ~~', '~~')
    end
  end

  # Bracket spans use the helper's own default border.
  tilde_cleaned.gsub(/\s?\[.*?\]\s?/) do |span|
    preserve_border_whitespaces(span) { span.strip.sub('[ ', '[').sub(' ]', ']') }
  end
end

#preprocess_word_html(string) ⇒ Object

Preprocesses the raw HTML (whitespace scrubbing followed by heading clean-up) on a copy of the input, rather than postprocessing converted output.



115
116
117
# File 'lib/coradoc/html/input/cleaner.rb', line 115

# Runs the HTML pre-processing steps on a copy of +string+: whitespace
# scrubbing first, then heading clean-up. The caller's string is left
# untouched because the work is done on a duplicate.
#
# @param string [String] raw HTML
# @return [String] the pre-processed HTML
def preprocess_word_html(string)
  working_copy = string.dup
  scrubbed = scrub_whitespace(working_copy)
  clean_headings(scrubbed)
end

#remove_block_leading_newlines(string) ⇒ Object



45
46
47
# File 'lib/coradoc/html/input/cleaner.rb', line 45

# Removes the blank line that directly follows a `****` delimiter line when
# that delimiter comes right after a line ending in `]`.
#
# @param string [String] text to clean (not modified)
# @return [String] a new string with the blank line removed
def remove_block_leading_newlines(string)
  padded_opening = "]\n****\n\n"
  tight_opening = "]\n****\n"

  string.gsub(padded_opening, tight_opening)
end

#remove_inner_whitespaces(string) ⇒ Object



61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# File 'lib/coradoc/html/input/cleaner.rb', line 61

# Normalises whitespace around `stem:[...]` expressions and inside each line.
#
# Steps, in order:
# 1. a space between a newline and `stem:[` is removed;
# 2. a `stem:[...]` expression followed by a newline and a non-whitespace
#    character is joined to that next line with a single space;
# 3. whitespace between a `stem:[...]` expression and a following `^` or `-`
#    is removed;
# 4. every line is stripped of leading/trailing ASCII whitespace and has runs
#    of spaces/tabs collapsed to one space, with the line passed through
#    +preserve_border_whitespaces+ (defined elsewhere in this class).
#
# Steps 2 and 3 previously used the wrong pattern/replacement pairing (the
# constants were shifted by one), which glued stem expressions to the next
# line without a space and left INNER_WHITESPACE_REGEX_3 unused.
#
# @param string [String, nil] text to clean; a non-nil string is modified in
#   place by steps 1-3
# @return [String] a new string assembled from the cleaned lines; an empty
#   string when +string+ is nil
def remove_inner_whitespaces(string)
  # The former nil guard only skipped the stem rewrites and then crashed on
  # `each_line`; return early instead.
  return +'' if string.nil?

  string.gsub!("\n stem:[", "\nstem:[")
  string.gsub!(INNER_WHITESPACE_REGEX_2, '\\1 ')
  string.gsub!(INNER_WHITESPACE_REGEX_3, '\\1')

  string.each_line.with_object(+'') do |line, result|
    cleaned = preserve_border_whitespaces(line) do
      # Use ASCII-only strip to preserve CJK fullwidth spaces
      line.gsub(/\A[ \t\r\n]+/, '')
          .gsub(TRAILING_WHITESPACE_REGEX, '')
          .gsub(MULTIPLE_WHITESPACE_REGEX, ' ')
    end
    result << cleaned
  end
end

#remove_leading_newlines(string) ⇒ Object



57
58
59
# File 'lib/coradoc/html/input/cleaner.rb', line 57

# Strips every newline found at the very beginning of +string+.
#
# @param string [String] text to clean (not modified)
# @return [String] a new string without leading newlines
def remove_leading_newlines(string)
  # The pattern is anchored with \A, so it can match at most once.
  string.sub(LEADING_NEWLINE_REGEX, '')
end

#remove_newlines(string) ⇒ Object



53
54
55
# File 'lib/coradoc/html/input/cleaner.rb', line 53

# Collapses every run of three or more newlines into exactly two.
#
# @param string [String] text to clean (not modified)
# @return [String] a new string with at most two consecutive newlines
def remove_newlines(string)
  paragraph_break = "\n\n"
  string.gsub(NEWLINES_REGEX, paragraph_break)
end

#remove_section_attribute_newlines(string) ⇒ Object



49
50
51
# File 'lib/coradoc/html/input/cleaner.rb', line 49

# Removes the blank line between a line ending in `]` and a following line
# that starts with `==`.
#
# @param string [String] text to clean (not modified)
# @return [String] a new string with the blank line removed
def remove_section_attribute_newlines(string)
  separated = "]\n\n=="
  adjacent = "]\n=="

  string.gsub(separated, adjacent)
end

#scrub_whitespace(string) ⇒ Object



119
120
121
122
123
124
125
126
# File 'lib/coradoc/html/input/cleaner.rb', line 119

# Normalises whitespace in raw HTML before conversion.
#
# The non-breaking-space rewrite is applied to +string+ in place; the
# remaining rewrites are applied to whatever +Coradoc.strip_unicode+ returns
# (defined outside this file — whether that is a new object is not visible
# here).
#
# @param string [String] raw HTML; mutated by the first step
# @return [String] the scrubbed text
def scrub_whitespace(string)
  # HTML encoded spaces: unify every spelling into the `&#xA0;` entity.
  string.gsub!(/&nbsp;|&#xA0;|\u00a0/i, '&#xA0;')

  # Strip document-level leading and trailing whitespace, then tidy the rest.
  Coradoc.strip_unicode(string).tap do |trimmed|
    trimmed.gsub!(/( +)$/, ' ') # line-trailing runs of spaces become one space
    trimmed.gsub!("\n\n\n\n", "\n\n") # quadruple line breaks
    # Disabled step, kept for reference:
    # trimmed.delete!('?| ')               # Unicode non-breaking spaces, injected as tabs
  end
end

#tidy(string) ⇒ Object



23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
# File 'lib/coradoc/html/input/cleaner.rb', line 23

# Runs the full post-conversion clean-up pipeline over +string+.
#
# A Hash is handled by tidying each of its values recursively. Otherwise a
# copy of the input goes through, in order: inner-whitespace removal,
# newline collapsing, leading-newline removal, tag-border cleaning and
# punctuation cleaning (each wrapped in +HtmlConverter.track_time+, whose
# return value is used as the step's result), followed by the untimed
# block-leading-newline and section-attribute-newline passes.
#
# @param string [String, Hash] text to tidy, or a Hash whose values are tidied
# @return [String, Hash] the tidied text, or a Hash of tidied values
def tidy(string)
  return string.transform_values { |value| tidy(value) } if string.is_a? Hash

  # The copy is made inside the timed block, as part of the first step.
  text = HtmlConverter.track_time('Removing inner whitespace') do
    remove_inner_whitespaces(String.new(string))
  end

  [
    ['Removing newlines', :remove_newlines],
    ['Removing leading newlines', :remove_leading_newlines],
    ['Cleaning tag borders', :clean_tag_borders],
    ['Cleaning punctuation characters', :clean_punctuation_characters]
  ].each do |label, pass|
    text = HtmlConverter.track_time(label) { send(pass, text) }
  end

  remove_section_attribute_newlines(remove_block_leading_newlines(text))
end