Class: Coradoc::Input::Html::Cleaner
- Inherits:
-
Object
- Object
- Coradoc::Input::Html::Cleaner
- Defined in:
- lib/coradoc/html/input/cleaner.rb
Constant Summary collapse
- INNER_WHITESPACE_REGEX_1 =
  Pre-compiled regexes for performance
  /\n stem:\[/
- INNER_WHITESPACE_REGEX_2 =
  /(stem:\[([^\]]|\\\])*\])\n(?=\S)/
- NEWLINES_REGEX =
  /\n{3,}/
- LEADING_NEWLINE_REGEX =
  /\A\n+/
- WHITESPACE_REGEX =
  /[ \t\r\n]+/
- TRAILING_WHITESPACE_REGEX =
  /[ \t\r\n]+\z/
Instance Method Summary collapse
- #clean_headings(string) ⇒ Object
- #clean_punctuation_characters(string) ⇒ Object
- #clean_tag_borders(string) ⇒ Object
- #preprocess_word_html(string) ⇒ Object
- #remove_block_leading_newlines(string) ⇒ Object
- #remove_inner_whitespaces(string) ⇒ Object
- #remove_leading_newlines(string) ⇒ Object
- #remove_newlines(string) ⇒ Object
- #remove_section_attribute_newlines(string) ⇒ Object
- #scrub_whitespace(string) ⇒ Object
- #tidy(string) ⇒ Object
Instance Method Details
#clean_headings(string) ⇒ Object
102 103 104 105 106 107 108 109 |
# File 'lib/coradoc/html/input/cleaner.rb', line 102
#
# Normalises heading markup, editing +string+ in place.
#
# * An empty heading element (+<hN ...></hN>+, N in 1..9) becomes a single
#   space.
# * A heading whose attributes contain ` style="vertical-align: super;` is
#   rewritten as +<sup>content</sup>+.
#
# @param string [String] mutable HTML text; it is modified by this call
# @return [String] the same (mutated) string object
def clean_headings(string)
  empty_heading = %r{<h([1-9])[^>]*></h\1>}
  superscript_heading = %r{<h([1-9])[^>]* style="vertical-align: super;[^>]*>(.+?)</h\1>}

  string.gsub!(empty_heading, ' ')
  # Group 2 is the heading's inner content; group 1 is only the level digit.
  string.gsub!(superscript_heading, '<sup>\\2</sup>')
  string
end
#clean_punctuation_characters(string) ⇒ Object
86 87 88 |
# File 'lib/coradoc/html/input/cleaner.rb', line 86
#
# Closes the gap between an inline marker and the punctuation after it:
# a `**`, `~~` or `__` followed by exactly one whitespace character and then
# one of . ! ? ' " has that whitespace character removed.
#
# @param string [String] text to clean (not modified)
# @return [String] a new string with the whitespace removed
def clean_punctuation_characters(string)
  string.gsub(/(\*\*|~~|__)\s([.!?'"])/) do
    marker = Regexp.last_match(1)
    punctuation = Regexp.last_match(2)
    "#{marker}#{punctuation}"
  end
end
#clean_tag_borders(string) ⇒ Object
69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 |
# File 'lib/coradoc/html/input/cleaner.rb', line 69
#
# Tidies the inside edges of `~~...~~` spans and `[...]` spans.
#
# Each span is matched together with at most one whitespace character on
# either side, stripped, and has the first space just inside its opening and
# closing delimiter removed. The result is passed through
# +preserve_border_whitespaces+ (defined elsewhere in this class).
# NOTE(review): presumably that helper re-attaches the outer whitespace of the
# match — confirm against its definition.
#
# @param string [String] text to clean (not modified)
# @return [String] a new string with cleaned span borders
def clean_tag_borders(string)
  tildes_cleaned = string.gsub(/\s?~{2,}.*?~{2,}\s?/) do |span|
    # The configured border is looked up per match, only when a span exists.
    preserve_border_whitespaces(span, default_border: Coradoc::Html::Input.config.tag_border) do
      trimmed = span.strip
      trimmed.sub('~~ ', '~~').sub(' ~~', '~~')
    end
  end

  tildes_cleaned.gsub(/\s?\[.*?\]\s?/) do |span|
    preserve_border_whitespaces(span) do
      trimmed = span.strip
      trimmed.sub('[ ', '[').sub(' ]', ']')
    end
  end
end
#preprocess_word_html(string) ⇒ Object
90 91 92 |
# File 'lib/coradoc/html/input/cleaner.rb', line 90
#
# Runs the pre-processing steps on a copy of the input: first
# +scrub_whitespace+, then +clean_headings+. The caller's string is left
# untouched because the work is done on a +dup+.
#
# @param string [String] raw HTML
# @return [String] the result of +clean_headings+ on the scrubbed copy
def preprocess_word_html(string)
  working_copy = string.dup
  scrubbed = scrub_whitespace(working_copy)
  clean_headings(scrubbed)
end
#remove_block_leading_newlines(string) ⇒ Object
37 38 39 |
# File 'lib/coradoc/html/input/cleaner.rb', line 37
#
# Drops one newline from every occurrence of "]", newline, "****" and two
# newlines, so the "****" line is followed by a single newline.
#
# @param string [String] text to clean (not modified)
# @return [String] a new string with the extra newline removed
def remove_block_leading_newlines(string)
  padded_opening = "]\n****\n\n"
  tight_opening = "]\n****\n"
  string.gsub(padded_opening, tight_opening)
end
#remove_inner_whitespaces(string) ⇒ Object
53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 |
# File 'lib/coradoc/html/input/cleaner.rb', line 53
#
# Normalises whitespace line by line.
#
# 1. Removes the single space between a newline and `stem:[`.
# 2. Applies INNER_WHITESPACE_REGEX_2, keeping only its first capture group.
# 3. For every line, strips leading/trailing ASCII whitespace and collapses
#    runs of two or more spaces/tabs into one space, passing the line through
#    +preserve_border_whitespaces+ (defined elsewhere in this class).
#    NOTE(review): presumably that helper restores the line's border
#    whitespace, including its newline — confirm against its definition.
#
# The nil guard now covers the whole method: previously only the +gsub!+
# calls were guarded, so nil still raised NoMethodError at +each_line+.
#
# The former +gsub!(INNER_WHITESPACE_REGEX_1, '\\1 ')+ step was removed: with
# that pattern being /\n stem:\[/, nothing is left for it to match after
# step 1, and its replacement referred to a capture group the pattern lacks.
#
# @param string [String, nil] mutable text; modified in place by steps 1-2
# @return [String] a newly built string (empty when +string+ is nil)
def remove_inner_whitespaces(string)
  return +'' if string.nil?

  string.gsub!("\n stem:[", "\nstem:[")
  string.gsub!(INNER_WHITESPACE_REGEX_2, '\\1')

  result = +''
  string.each_line do |line|
    result << preserve_border_whitespaces(line) do
      # Use ASCII-only strip to preserve CJK fullwidth spaces
      line.gsub(/\A[ \t\r\n]+/, '').gsub(/[ \t\r\n]+\z/, '').gsub(/[ \t]{2,}/, ' ')
    end
  end
  result
end
#remove_leading_newlines(string) ⇒ Object
49 50 51 |
# File 'lib/coradoc/html/input/cleaner.rb', line 49
#
# Deletes whatever LEADING_NEWLINE_REGEX matches (the constant is defined at
# class level as /\A\n+/, i.e. the newlines at the very start of the text).
#
# @param string [String] text to clean (not modified)
# @return [String] a new string without the matched prefix
def remove_leading_newlines(string)
  string.gsub(LEADING_NEWLINE_REGEX) { '' }
end
#remove_newlines(string) ⇒ Object
45 46 47 |
# File 'lib/coradoc/html/input/cleaner.rb', line 45
#
# Replaces every match of NEWLINES_REGEX (defined at class level as /\n{3,}/,
# i.e. three or more consecutive newlines) with exactly two newlines.
#
# @param string [String] text to clean (not modified)
# @return [String] a new string with the matched runs collapsed
def remove_newlines(string)
  paragraph_break = "\n\n"
  string.gsub(NEWLINES_REGEX) { paragraph_break }
end
#remove_section_attribute_newlines(string) ⇒ Object
41 42 43 |
# File 'lib/coradoc/html/input/cleaner.rb', line 41
#
# Removes the blank line between a closing "]" and a following "==", so the
# "==" starts on the line directly after the "]".
#
# @param string [String] text to clean (not modified)
# @return [String] a new string with the blank line removed
def remove_section_attribute_newlines(string)
  separated = "]\n\n=="
  adjacent = "]\n=="
  string.gsub(separated, adjacent)
end
#scrub_whitespace(string) ⇒ Object
94 95 96 97 98 99 100 |
# File 'lib/coradoc/html/input/cleaner.rb', line 94
#
# Normalises whitespace in raw HTML before conversion.
#
# 1. Rewrites non-breaking spaces (the `&nbsp;` / `&#xA0;` entities, in any
#    letter case, and the literal U+00A0 character) to the `&#xA0;` entity.
# 2. Passes the text through +Coradoc.strip_unicode+ (defined elsewhere).
# 3. Replaces each run of spaces at the end of a line with a single space.
# 4. Replaces each occurrence of four consecutive newlines with two.
#
# NOTE(review): the published listing showed step 1 as a whitespace-for-
# whitespace substitution, which is a no-op; that looks like the entities
# having been decoded when the listing was rendered. The pattern is written
# here with explicit entities and a \u escape so that it cannot be mangled
# again — confirm against the repository source.
#
# @param string [String] mutable HTML text; step 1 modifies it in place
# @return [String] the cleaned string as returned by Coradoc.strip_unicode and
#   then further edited in place
def scrub_whitespace(string)
  string.gsub!(/&nbsp;|&#xA0;|\u00a0/i, '&#xA0;')
  string = Coradoc.strip_unicode(string)
  string.gsub!(/( +)$/, ' ')
  string.gsub!("\n\n\n\n", "\n\n")
  string
end
#tidy(string) ⇒ Object
15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 |
# File 'lib/coradoc/html/input/cleaner.rb', line 15
#
# Runs the full clean-up pipeline over converted text.
#
# A Hash is handled by tidying each of its values recursively. Any other input
# is copied with +String.new+ and then passed, in order, through:
# remove_inner_whitespaces, remove_newlines, remove_leading_newlines,
# clean_tag_borders and clean_punctuation_characters (each wrapped in
# +HtmlConverter.track_time+ with a descriptive label), followed by
# remove_block_leading_newlines and remove_section_attribute_newlines.
#
# @param string [String, Hash] text to tidy, or a Hash whose values are tidied
# @return [String, Hash] the tidied text, or a Hash of tidied values
def tidy(string)
  return string.transform_values { |value| tidy(value) } if string.is_a?(Hash)

  # The defensive copy is made inside the first timed step.
  text = HtmlConverter.track_time('Removing inner whitespace') do
    remove_inner_whitespaces(String.new(string))
  end

  # Remaining timed steps, applied in insertion order.
  {
    'Removing newlines' => :remove_newlines,
    'Removing leading newlines' => :remove_leading_newlines,
    'Cleaning tag borders' => :clean_tag_borders,
    'Cleaning punctuation characters' => :clean_punctuation_characters
  }.each do |label, step|
    text = HtmlConverter.track_time(label) { send(step, text) }
  end

  remove_section_attribute_newlines(remove_block_leading_newlines(text))
end