Class: Coradoc::Html::Cleaner
- Inherits:
-
Object
- Object
- Coradoc::Html::Cleaner
- Defined in:
- lib/coradoc/html/cleaner.rb
Constant Summary collapse
- INNER_WHITESPACE_REGEX_1 =
/\n stem:\[/
- INNER_WHITESPACE_REGEX_2 =
/(stem:\[([^\]]|\\\])*\])\n(?=\S)/
- NEWLINES_REGEX =
/\n{3,}/
- LEADING_NEWLINE_REGEX =
/\A\n+/
- WHITESPACE_REGEX =
/[ \t\r\n]+/
- TRAILING_WHITESPACE_REGEX =
/[ \t\r\n]+\z/
Instance Method Summary collapse
- #clean_headings(string) ⇒ Object
- #clean_punctuation_characters(string) ⇒ Object
- #clean_tag_borders(string) ⇒ Object
- #preprocess_word_html(string) ⇒ Object
- #remove_block_leading_newlines(string) ⇒ Object
- #remove_inner_whitespaces(string) ⇒ Object
- #remove_leading_newlines(string) ⇒ Object
- #remove_newlines(string) ⇒ Object
- #remove_section_attribute_newlines(string) ⇒ Object
- #scrub_whitespace(string) ⇒ Object
- #tidy(string) ⇒ Object
Instance Method Details
#clean_headings(string) ⇒ Object
99 100 101 102 103 104 105 106 |
# File 'lib/coradoc/html/cleaner.rb', line 99

# Rewrites two kinds of heading markup, modifying the given string in place.
#
# * A heading element (h1..h9) with no content at all is replaced by a
#   single space.
# * A heading element whose tag carries ` style="vertical-align: super;`
#   is replaced by a `<sup>` element wrapping the heading's content.
#
# @param string [String] HTML text; mutated by this method
# @return [String] the same string object that was passed in
def clean_headings(string)
  empty_heading = %r{<h([1-9])[^>]*></h\1>}
  superscript_heading = %r{<h([1-9])[^>]* style="vertical-align: super;[^>]*>(.+?)</h\1>}

  # gsub! returns nil when nothing matched, so the calls are kept separate
  # and the receiver is returned explicitly.
  string.gsub!(empty_heading, ' ')
  string.gsub!(superscript_heading, '<sup>\\2</sup>')
  string
end
#clean_punctuation_characters(string) ⇒ Object
83 84 85 |
# File 'lib/coradoc/html/cleaner.rb', line 83

# Removes the single whitespace character that sits between a `**`, `~~`
# or `__` marker and a following punctuation character (one of . ! ? ' ").
#
# @param string [String] text to clean; not modified
# @return [String] a new string with the whitespace removed
def clean_punctuation_characters(string)
  marker_then_punctuation = /(\*\*|~~|__)\s([.!?'"])/

  string.gsub(marker_then_punctuation) do
    marker = Regexp.last_match(1)
    punctuation = Regexp.last_match(2)
    "#{marker}#{punctuation}"
  end
end
#clean_tag_borders(string) ⇒ Object
66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 |
# File 'lib/coradoc/html/cleaner.rb', line 66

# Tightens the inner borders of `~~...~~` spans and `[...]` spans.
#
# Each match (including at most one whitespace character on either side) is
# handed to `preserve_border_whitespaces`, with a block that strips the match
# and removes the first space just inside the opening and closing delimiter.
# For `~~` spans, `Html.input_config.tag_border` is passed as the
# `default_border:` option.
#
# NOTE(review): `preserve_border_whitespaces` is defined outside this
# listing; presumably it re-attaches the surrounding whitespace of the match
# around the block's result -- confirm against its definition.
#
# @param string [String] text to clean; not modified
# @return [String] a new string with both passes applied
def clean_tag_borders(string)
  tilde_pass = string.gsub(/\s?~{2,}.*?~{2,}\s?/) do |span|
    preserve_border_whitespaces(span, default_border: Html.input_config.tag_border) do
      span.strip.sub('~~ ', '~~').sub(' ~~', '~~')
    end
  end

  tilde_pass.gsub(/\s?\[.*?\]\s?/) do |span|
    preserve_border_whitespaces(span) do
      span.strip.sub('[ ', '[').sub(' ]', ']')
    end
  end
end
#preprocess_word_html(string) ⇒ Object
87 88 89 |
# File 'lib/coradoc/html/cleaner.rb', line 87

# Runs `scrub_whitespace` and then `clean_headings` over a copy of the input,
# so the caller's string is left untouched even though both helpers mutate
# their argument.
#
# @param string [String] HTML text; not modified
# @return [String] the processed copy
def preprocess_word_html(string)
  working_copy = string.dup
  scrubbed = scrub_whitespace(working_copy)
  clean_headings(scrubbed)
end
#remove_block_leading_newlines(string) ⇒ Object
35 36 37 |
# File 'lib/coradoc/html/cleaner.rb', line 35

# Drops one newline after every `]` + newline + `****` + newline sequence
# that is immediately followed by a second newline.
#
# @param string [String] text to clean; not modified
# @return [String] a new string with the extra newline removed
def remove_block_leading_newlines(string)
  padded_opening = "]\n****\n\n"
  tight_opening = "]\n****\n"

  string.gsub(padded_opening, tight_opening)
end
#remove_inner_whitespaces(string) ⇒ Object
51 52 53 54 55 56 57 58 59 60 61 62 63 64 |
# File 'lib/coradoc/html/cleaner.rb', line 51

# Normalises whitespace around `stem:[...]` expressions and inside each line.
#
# Steps, in order:
# 1. A single space between a newline and `stem:[` is removed (in place).
# 2. Matches of INNER_WHITESPACE_REGEX_2 are replaced by their first capture
#    group (in place).
# 3. Every line is passed to `preserve_border_whitespaces` with a block that
#    strips leading/trailing spaces, tabs, CRs and LFs and collapses runs of
#    two or more spaces/tabs into one space; the results are concatenated.
#
# Changes from the previous implementation:
# * `nil` input now returns an empty string. Previously the nil check only
#   covered the in-place substitutions and the following `each_line` call
#   raised NoMethodError.
# * The `gsub!(INNER_WHITESPACE_REGEX_1, '\\1 ')` call was removed. The
#   literal substitution in step 1 already eliminates every match of that
#   pattern, and the pattern has no capture group for `\1` to refer to, so
#   the call could never have any effect.
#
# NOTE(review): step 2 deletes the newline after a stem expression without
# inserting a space, and the removed call carried a `'\\1 '` replacement.
# The replacements may have been shifted by one pattern at some point --
# confirm the intended output of step 2.
#
# NOTE(review): `preserve_border_whitespaces` is defined outside this
# listing; its exact contract is not visible here.
#
# @param string [String, nil] text to clean; mutated in place by steps 1-2
# @return [String] a newly built string (empty when +string+ is nil)
def remove_inner_whitespaces(string)
  return +'' if string.nil?

  string.gsub!("\n stem:[", "\nstem:[")
  string.gsub!(INNER_WHITESPACE_REGEX_2, '\\1')

  string.each_line.each_with_object(+'') do |line, result|
    result << preserve_border_whitespaces(line) do
      line.gsub(/\A[ \t\r\n]+/, '').gsub(/[ \t\r\n]+\z/, '').gsub(/[ \t]{2,}/, ' ')
    end
  end
end
#remove_leading_newlines(string) ⇒ Object
47 48 49 |
# File 'lib/coradoc/html/cleaner.rb', line 47

# Deletes every match of LEADING_NEWLINE_REGEX from the text.
#
# @param string [String] text to clean; not modified
# @return [String] a new string without the matched newlines
def remove_leading_newlines(string)
  string.gsub(LEADING_NEWLINE_REGEX) { '' }
end
#remove_newlines(string) ⇒ Object
43 44 45 |
# File 'lib/coradoc/html/cleaner.rb', line 43

# Replaces every match of NEWLINES_REGEX with exactly two newlines.
#
# @param string [String] text to clean; not modified
# @return [String] a new string with the matches collapsed
def remove_newlines(string)
  string.gsub(NEWLINES_REGEX) { "\n\n" }
end
#remove_section_attribute_newlines(string) ⇒ Object
39 40 41 |
# File 'lib/coradoc/html/cleaner.rb', line 39

# Removes the blank line between a closing `]` and a following line that
# starts with `==`.
#
# @param string [String] text to clean; not modified
# @return [String] a new string with the blank line removed
def remove_section_attribute_newlines(string)
  separated = "]\n\n=="
  adjacent = "]\n=="

  string.gsub(separated, adjacent)
end
#scrub_whitespace(string) ⇒ Object
91 92 93 94 95 96 97 |
# File 'lib/coradoc/html/cleaner.rb', line 91

# Normalises whitespace in an HTML string, modifying the argument in place
# for the first step.
#
# Steps, in order:
# 1. Every spelling of a non-breaking space -- the named entity, the
#    hexadecimal numeric entity (any letter case), or a literal U+00A0
#    character -- is rewritten to `&nbsp;`.
# 2. The text is passed through `Coradoc.strip_unicode`.
# 3. A run of spaces at the end of any line is reduced to a single space.
# 4. Four consecutive newlines are reduced to two.
#
# NOTE(review): in the rendered listing this was taken from, the pattern and
# replacement of step 1 appeared as three identical whitespace alternatives
# replaced by a single whitespace character, i.e. a no-op with a meaningless
# `/i` flag. That looks like HTML entities decoded by the documentation
# renderer; the entity spellings are restored here -- confirm against the
# repository file.
#
# NOTE(review): `Coradoc.strip_unicode` is defined outside this listing;
# presumably it trims leading/trailing Unicode whitespace -- confirm.
#
# @param string [String] HTML text; mutated by step 1
# @return [String] the string returned by `Coradoc.strip_unicode`, after
#   steps 3 and 4 have been applied to it in place
def scrub_whitespace(string)
  string.gsub!(/&nbsp;|&#xA0;|\u00a0/i, '&nbsp;')
  string = Coradoc.strip_unicode(string)
  string.gsub!(/( +)$/, ' ')
  string.gsub!("\n\n\n\n", "\n\n")
  string
end
#tidy(string) ⇒ Object
13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 |
# File 'lib/coradoc/html/cleaner.rb', line 13

# Runs the full clean-up pipeline over a string, or over every value of a
# Hash (recursively, keeping the keys).
#
# For a string, the passes run in this order: inner whitespace, repeated
# newlines, leading newlines, tag borders, punctuation characters (each
# wrapped in `HtmlConverter.track_time` with a descriptive label), followed
# by block-leading newlines and section-attribute newlines (not timed).
# The first pass works on a fresh copy made with `String.new`.
#
# @param string [String, Hash] text to tidy, or a Hash whose values are
#   tidied individually
# @return [String, Hash] the tidied text, or a Hash with tidied values
def tidy(string)
  return string.transform_values { |value| tidy(value) } if string.is_a?(Hash)

  text = HtmlConverter.track_time('Removing inner whitespace') do
    remove_inner_whitespaces(String.new(string))
  end

  # Remaining timed passes, applied in insertion order.
  {
    'Removing newlines' => :remove_newlines,
    'Removing leading newlines' => :remove_leading_newlines,
    'Cleaning tag borders' => :clean_tag_borders,
    'Cleaning punctuation characters' => :clean_punctuation_characters
  }.each do |label, pass|
    text = HtmlConverter.track_time(label) { send(pass, text) }
  end

  remove_section_attribute_newlines(remove_block_leading_newlines(text))
end