Class: Coradoc::Input::Html::Cleaner

Inherits:
Object
  • Object
show all
Defined in:
lib/coradoc/html/input/cleaner.rb

Constant Summary collapse

INNER_WHITESPACE_REGEX_1 =

Pre-compiled regexes for performance

/\n stem:\[/
INNER_WHITESPACE_REGEX_2 =
/(stem:\[([^\]]|\\\])*\])\n(?=\S)/
NEWLINES_REGEX =
/\n{3,}/
LEADING_NEWLINE_REGEX =
/\A\n+/
WHITESPACE_REGEX =
/[ \t\r\n]+/
TRAILING_WHITESPACE_REGEX =
/[ \t\r\n]+\z/

Instance Method Summary collapse

Instance Method Details

#clean_headings(string) ⇒ Object



102
103
104
105
106
107
108
109
# File 'lib/coradoc/html/input/cleaner.rb', line 102

# Rewrites two heading shapes in +string+, mutating it in place.
#
# * A heading element (+h1+..+h9+) with no content at all is replaced by a
#   single space.
# * A heading whose attributes contain +style="vertical-align: super;+ is
#   replaced by a +<sup>+ element wrapping the heading's content.
#
# @param string [String] mutable HTML text; modified in place
# @return [String] the same +string+ object that was passed in
def clean_headings(string)
  empty_heading = %r{<h([1-9])[^>]*></h\1>}
  superscript_heading = %r{<h([1-9])[^>]* style="vertical-align: super;[^>]*>(.+?)</h\1>}

  # tap hands back the receiver, so the caller always gets the input object
  # even when gsub! returns nil because nothing matched.
  string.tap do |html|
    html.gsub!(empty_heading, ' ')
    html.gsub!(superscript_heading, '<sup>\\2</sup>')
  end
end

#clean_punctuation_characters(string) ⇒ Object



86
87
88
# File 'lib/coradoc/html/input/cleaner.rb', line 86

# Removes the single whitespace character that separates a +**+, +~~+ or
# +__+ marker from a following punctuation character (one of . ! ? ' ").
#
# @param string [String] text to clean; not modified
# @return [String] a new string with those gaps closed
def clean_punctuation_characters(string)
  marker_then_punctuation = /(\*\*|~~|__)\s([.!?'"])/
  string.gsub(marker_then_punctuation) do
    found = Regexp.last_match
    "#{found[1]}#{found[2]}"
  end
end

#clean_tag_borders(string) ⇒ Object



69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
# File 'lib/coradoc/html/input/cleaner.rb', line 69

# Tidies the inner edges of +~~...~~+ spans and +[...]+ spans.
#
# Each span (including at most one whitespace character on either side) is
# handed to +preserve_border_whitespaces+ together with a block that yields
# the stripped span with one space removed just inside the opening and
# closing delimiters. For the tilde pass the configured +tag_border+ is
# passed as +default_border+.
#
# NOTE(review): +preserve_border_whitespaces+ is defined outside this view;
# presumably it re-attaches the surrounding whitespace — confirm there.
#
# @param string [String] text to clean; not modified
# @return [String] a new string with both passes applied
def clean_tag_borders(string)
  tilde_pass = string.gsub(/\s?~{2,}.*?~{2,}\s?/) do |span|
    # Looked up per match, so the config is only consulted when a span exists.
    border = Coradoc::Html::Input.config.tag_border
    preserve_border_whitespaces(span, default_border: border) do
      span.strip.sub('~~ ', '~~').sub(' ~~', '~~')
    end
  end

  tilde_pass.gsub(/\s?\[.*?\]\s?/) do |span|
    preserve_border_whitespaces(span) { span.strip.sub('[ ', '[').sub(' ]', ']') }
  end
end

#preprocess_word_html(string) ⇒ Object



90
91
92
# File 'lib/coradoc/html/input/cleaner.rb', line 90

# Runs +scrub_whitespace+ and then +clean_headings+ over a copy of +string+.
#
# @param string [String] raw HTML text; left untouched because a duplicate
#   is processed
# @return [Object] whatever +clean_headings+ returns for the scrubbed copy
def preprocess_word_html(string)
  working_copy = string.dup
  scrubbed = scrub_whitespace(working_copy)
  clean_headings(scrubbed)
end

#remove_block_leading_newlines(string) ⇒ Object



37
38
39
# File 'lib/coradoc/html/input/cleaner.rb', line 37

# Removes the blank line that directly follows a "]" line and a "****" line.
#
# @param string [String] text to clean; not modified
# @return [String] a new string in which every "]\n****\n\n" has become
#   "]\n****\n"
def remove_block_leading_newlines(string)
  block_open = "]\n****\n"
  string.gsub("#{block_open}\n", block_open)
end

#remove_inner_whitespaces(string) ⇒ Object



53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# File 'lib/coradoc/html/input/cleaner.rb', line 53

# Normalizes whitespace around +stem:[...]+ macros and within each line.
#
# Steps, in order:
# 1. "\n stem:[" becomes "\nstem:[" (in place on +string+).
# 2. A newline that directly follows a +stem:[...]+ macro and precedes a
#    non-whitespace character is removed (in place on +string+).
# 3. Every line is passed to +preserve_border_whitespaces+ with a block that
#    yields the line with leading/trailing ASCII whitespace removed and runs
#    of two or more spaces/tabs collapsed to a single space.
#
# NOTE(review): +preserve_border_whitespaces+ is defined outside this view;
# what it does with the yielded value should be confirmed there.
#
# @param string [String, nil] text to clean; mutated in place by steps 1-2
# @return [String] a newly built string; empty when +string+ is nil
def remove_inner_whitespaces(string)
  # The nil check used to wrap only the stem fix-ups, so a nil input still
  # crashed on each_line further down. Bail out before touching it.
  return +'' if string.nil?

  string.gsub!("\n stem:[", "\nstem:[")
  # A former gsub!(INNER_WHITESPACE_REGEX_1, '\\1 ') was dropped here: that
  # pattern matches exactly the text rewritten on the line above, so it
  # could never match, and it has no capture group for \1 to refer to.
  string.gsub!(INNER_WHITESPACE_REGEX_2, '\\1')

  result = +''
  string.each_line do |line|
    result << preserve_border_whitespaces(line) do
      # Use ASCII-only strip to preserve CJK fullwidth spaces
      line.gsub(/\A[ \t\r\n]+/, '').gsub(/[ \t\r\n]+\z/, '').gsub(/[ \t]{2,}/, ' ')
    end
  end
  result
end

#remove_leading_newlines(string) ⇒ Object



49
50
51
# File 'lib/coradoc/html/input/cleaner.rb', line 49

# Deletes whatever +LEADING_NEWLINE_REGEX+ matches in +string+.
#
# @param string [String] text to clean; not modified
# @return [String] a new string with the matched text removed
def remove_leading_newlines(string)
  string.gsub(LEADING_NEWLINE_REGEX) { '' }
end

#remove_newlines(string) ⇒ Object



45
46
47
# File 'lib/coradoc/html/input/cleaner.rb', line 45

# Replaces every match of +NEWLINES_REGEX+ in +string+ with two newlines.
#
# @param string [String] text to clean; not modified
# @return [String] a new string with the replacements applied
def remove_newlines(string)
  paragraph_break = "\n\n"
  string.gsub(NEWLINES_REGEX) { paragraph_break }
end

#remove_section_attribute_newlines(string) ⇒ Object



41
42
43
# File 'lib/coradoc/html/input/cleaner.rb', line 41

# Removes the blank line between a line ending in "]" and a following line
# that starts with "==".
#
# @param string [String] text to clean; not modified
# @return [String] a new string in which every "]\n\n==" has become "]\n=="
def remove_section_attribute_newlines(string)
  spaced = "]\n\n=="
  tight = "]\n=="
  string.gsub(spaced, tight)
end

#scrub_whitespace(string) ⇒ Object



94
95
96
97
98
99
100
# File 'lib/coradoc/html/input/cleaner.rb', line 94

# Normalizes non-breaking spaces and excess whitespace in +string+.
#
# Steps, in order:
# 1. +&nbsp;+, +&#xA0;+ (any letter case) and the literal U+00A0 character
#    are all rewritten to +&#xA0;+ (in place on the argument).
# 2. The text is passed through +Coradoc.strip_unicode+.
# 3. Runs of spaces at the end of a line are reduced to one space.
# 4. Four consecutive newlines are reduced to two.
#
# NOTE(review): +Coradoc.strip_unicode+ is defined outside this view;
# steps 3-4 mutate whatever object it returns.
#
# @param string [String] mutable text; step 1 modifies it in place
# @return [String] the object returned by +Coradoc.strip_unicode+ after
#   steps 3-4
def scrub_whitespace(string)
  # The non-breaking space is spelled as an escape: a raw U+00A0 in the
  # pattern is visually identical to an ordinary space, and an ordinary
  # space there would rewrite every space in the document.
  string.gsub!(/&nbsp;|&#xA0;|\u00A0/i, '&#xA0;')
  string = Coradoc.strip_unicode(string)
  string.gsub!(/( +)$/, ' ')
  string.gsub!("\n\n\n\n", "\n\n")
  string
end

#tidy(string) ⇒ Object



15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
# File 'lib/coradoc/html/input/cleaner.rb', line 15

# Runs the full cleanup pipeline over +string+.
#
# A Hash is handled recursively: each value is tidied and a new Hash with
# the same keys is returned. Anything else is copied with +String.new+ and
# passed through these steps, in order: inner-whitespace removal, newline
# collapsing, leading-newline removal, tag-border cleanup, punctuation
# cleanup, block-leading-newline removal and section-attribute-newline
# removal. The first five run inside +HtmlConverter.track_time+ with a label.
#
# NOTE(review): +HtmlConverter.track_time+ is defined outside this view; the
# pipeline relies on it returning the value of the block it is given.
#
# @param string [String, Hash] text to clean, or a Hash whose values are
#   cleaned recursively
# @return [String, Hash] the cleaned text, or a Hash of cleaned values
def tidy(string)
  if string.is_a?(Hash)
    return string.transform_values { |value| tidy(value) }
  end

  cleaned = HtmlConverter.track_time('Removing inner whitespace') do
    remove_inner_whitespaces(String.new(string))
  end
  cleaned = HtmlConverter.track_time('Removing newlines') { remove_newlines(cleaned) }
  cleaned = HtmlConverter.track_time('Removing leading newlines') { remove_leading_newlines(cleaned) }
  cleaned = HtmlConverter.track_time('Cleaning tag borders') { clean_tag_borders(cleaned) }
  cleaned = HtmlConverter.track_time('Cleaning punctuation characters') do
    clean_punctuation_characters(cleaned)
  end

  # The last two steps are not timed; the final call's value is the result.
  cleaned = remove_block_leading_newlines(cleaned)
  remove_section_attribute_newlines(cleaned)
end