Class: Coradoc::Input::Html::Cleaner
- Inherits:
-
Object
- Object
- Coradoc::Input::Html::Cleaner
- Defined in:
- lib/coradoc/html/input/cleaner.rb
Constant Summary collapse
- INNER_WHITESPACE_REGEX_1 =
Pre-compiled regexes for performance
/\n stem:\[/
- INNER_WHITESPACE_REGEX_2 =
/(stem:\[([^\]]|\\\])*\])\n(?=\S)/
- INNER_WHITESPACE_REGEX_3 =
/(stem:\[([^\]]|\\\])*\])\s+(?=[\^-])/
- NEWLINES_REGEX =
/\n{3,}/
- LEADING_NEWLINE_REGEX =
/\A\n+/
- WHITESPACE_REGEX =
/[ \t\r\n]+/
- TRAILING_WHITESPACE_REGEX =
/[ \t\r\n]+\z/
- MULTIPLE_WHITESPACE_REGEX =
/[ \t]{2,}/
- TAG_BORDER_REGEXES =
{ asterisk: /\s?\*{2,}/, underscore: /\s?_{2,}/, tilde: /\s?~{2,}/, bracket: /\s?\[.*?\]\s?/ }.freeze
Instance Method Summary collapse
-
#clean_headings(string) ⇒ Object
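Removes empty heading elements and rewrites headings styled as superscript into <sup> elements (both are LibreOffice export artefacts).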
- #clean_punctuation_characters(string) ⇒ Object
-
#clean_tag_borders(string) ⇒ Object
Find non-asterisk content that is enclosed by two or more asterisks.
-
#preprocess_word_html(string) ⇒ Object
preprocesses HTML, rather than postprocessing it.
- #remove_block_leading_newlines(string) ⇒ Object
- #remove_inner_whitespaces(string) ⇒ Object
- #remove_leading_newlines(string) ⇒ Object
- #remove_newlines(string) ⇒ Object
- #remove_section_attribute_newlines(string) ⇒ Object
- #scrub_whitespace(string) ⇒ Object
- #tidy(string) ⇒ Object
Instance Method Details
#clean_headings(string) ⇒ Object
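Removes empty heading elements and rewrites headings styled as superscript into <sup> elements (both are LibreOffice export artefacts). The string is modified in place and returned.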
129 130 131 132 133 134 135 136 137 138 |
# File 'lib/coradoc/html/input/cleaner.rb', line 129

# Strips heading artefacts from HTML, modifying +string+ in place.
#
# Two substitutions run in order:
# 1. an empty heading element (<h1>..<h9>, with or without attributes) is
#    replaced by a single space;
# 2. a heading whose attributes contain +style="vertical-align: super;+ is
#    rewritten as a <sup> element wrapping the same content.
#
# @param string [String] HTML markup; must not be frozen
# @return [String] the very object that was passed in
def clean_headings(string)
  substitutions = [
    # I don't know why Libre Office is inserting them, but they need to go
    [%r{<h([1-9])[^>]*></h\1>}, ' '],
    # I absolutely don't know why Libre Office is rendering superscripts as h1
    [%r{<h([1-9])[^>]* style="vertical-align: super;[^>]*>(.+?)</h\1>}, '<sup>\\2</sup>']
  ]

  # gsub! returns nil when nothing matched, so its result is ignored and the
  # receiver itself is handed back.
  substitutions.each { |pattern, replacement| string.gsub!(pattern, replacement) }
  string
end
#clean_punctuation_characters(string) ⇒ Object
110 111 112 |
# File 'lib/coradoc/html/input/cleaner.rb', line 110

# Removes the single whitespace character that separates a formatting marker
# (+**+, +~~+ or +__+) from a directly following punctuation character
# (one of . ! ? ' ").
#
# @param string [String] converted text
# @return [String] a new string; the argument is not modified
def clean_punctuation_characters(string)
  marker_gap_punctuation = /(\*\*|~~|__)\s([.!?'"])/
  marker_and_punctuation = '\\1\\2'
  string.gsub(marker_gap_punctuation, marker_and_punctuation)
end
#clean_tag_borders(string) ⇒ Object
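Finds non-asterisk content that is enclosed by two or more asterisks and ensures that only one whitespace character occurs in the border area. The same is intended for underscores, tildes and brackets; in the listing below only the tilde and bracket rules are active, while the asterisk and underscore rules are commented out.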
81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 |
# File 'lib/coradoc/html/input/cleaner.rb', line 81

# Tidies the whitespace just inside +~~ … ~~+ spans and +[ … ]+ spans.
#
# Every match (including one optional whitespace character on either side)
# is passed to +preserve_border_whitespaces+ together with a block that
# strips the match and drops one space directly after the opening delimiter
# and one directly before the closing delimiter.
#
# NOTE(review): +preserve_border_whitespaces+ is not visible here; presumably
# it re-attaches the match's outer whitespace around the block result.
# NOTE(review): the config is read from +Coradoc::Html::Input+ while this
# class lives under +Coradoc::Input::Html+ — confirm the constant resolves.
#
# @param string [String] converted text
# @return [String] a new string with both rules applied (tilde rule first)
def clean_tag_borders(string)
  # The asterisk and underscore rules below are disabled:
  #
  # result = string.gsub(/\s?\*{2,}.*?\*{2,}\s?/) do |match|
  #   preserve_border_whitespaces(match, default_border: Coradoc::Input::HTML.config.tag_border) do
  #     match.strip.sub("** ", "**").sub(" **", "**")
  #   end
  # end
  # result = string.gsub(/\s?_{2,}.*?_{2,}\s?/) do |match|
  #   preserve_border_whitespaces(match, default_border: Coradoc::Input::HTML.config.tag_border) do
  #     match.strip.sub("__ ", "__").sub(" __", "__")
  #   end
  # end

  tilde_cleaned = string.gsub(/\s?~{2,}.*?~{2,}\s?/) do |span|
    # The border default is looked up per match, exactly when a span is found.
    border = Coradoc::Html::Input.config.tag_border
    preserve_border_whitespaces(span, default_border: border) do
      span.strip.sub('~~ ', '~~').sub(' ~~', '~~')
    end
  end

  tilde_cleaned.gsub(/\s?\[.*?\]\s?/) do |span|
    preserve_border_whitespaces(span) { span.strip.sub('[ ', '[').sub(' ]', ']') }
  end
end
#preprocess_word_html(string) ⇒ Object
preprocesses HTML, rather than postprocessing it
115 116 117 |
# File 'lib/coradoc/html/input/cleaner.rb', line 115

# Preprocesses HTML, rather than postprocessing it: works on a copy of the
# input, scrubbing whitespace first and cleaning headings second.
#
# @param string [String] raw HTML; left unmodified because a duplicate is used
# @return [String] the cleaned copy
def preprocess_word_html(string)
  working_copy = string.dup
  scrubbed = scrub_whitespace(working_copy)
  clean_headings(scrubbed)
end
#remove_block_leading_newlines(string) ⇒ Object
45 46 47 |
# File 'lib/coradoc/html/input/cleaner.rb', line 45

# Drops the blank line that directly follows a +****+ delimiter line when
# that delimiter comes right after a line ending in +]+.
#
# @param string [String] converted text
# @return [String] a new string; the argument is not modified
def remove_block_leading_newlines(string)
  padded_opening = "]\n****\n\n"
  tight_opening = "]\n****\n"
  string.gsub(padded_opening, tight_opening)
end
#remove_inner_whitespaces(string) ⇒ Object
61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 |
# File 'lib/coradoc/html/input/cleaner.rb', line 61

# Normalises whitespace around +stem:[...]+ macros and inside every line.
#
# The stem clean-up mutates +string+ in place and runs in three steps, one
# per pre-compiled constant:
# 1. INNER_WHITESPACE_REGEX_1 — a stem macro indented by one space at the
#    start of a line loses that space;
# 2. INNER_WHITESPACE_REGEX_2 — a newline directly after a stem macro that is
#    followed by non-whitespace becomes a single space, so the macro stays
#    separated from the text that follows;
# 3. INNER_WHITESPACE_REGEX_3 — whitespace between a stem macro and a
#    following +^+ or +-+ is removed.
#
# Afterwards each line is trimmed of leading/trailing ASCII whitespace and
# runs of spaces/tabs are collapsed to one space, inside
# +preserve_border_whitespaces+.
# NOTE(review): that helper is not visible here; presumably it re-attaches
# the line's original border whitespace (including the newline) — confirm.
#
# @param string [String, nil] converted text; mutated by the stem clean-up
# @return [String] a newly built string (empty when +string+ is nil)
def remove_inner_whitespaces(string)
  # A nil input used to skip the stem clean-up and then fail on #each_line.
  return +'' if string.nil?

  string.gsub!(INNER_WHITESPACE_REGEX_1, "\nstem:[")
  string.gsub!(INNER_WHITESPACE_REGEX_2, '\\1 ')
  string.gsub!(INNER_WHITESPACE_REGEX_3, '\\1')

  result = +''
  string.each_line do |line|
    result << preserve_border_whitespaces(line) do
      # Use ASCII-only strip to preserve CJK fullwidth spaces
      line.gsub(/\A[ \t\r\n]+/, '').gsub(/[ \t\r\n]+\z/, '').gsub(/[ \t]{2,}/, ' ')
    end
  end
  result
end
#remove_leading_newlines(string) ⇒ Object
57 58 59 |
# File 'lib/coradoc/html/input/cleaner.rb', line 57

# Removes whatever LEADING_NEWLINE_REGEX matches (the run of newlines at the
# very start of the text).
#
# @param string [String] converted text
# @return [String] a new string; the argument is not modified
def remove_leading_newlines(string)
  nothing = ''
  string.gsub(LEADING_NEWLINE_REGEX, nothing)
end
#remove_newlines(string) ⇒ Object
53 54 55 |
# File 'lib/coradoc/html/input/cleaner.rb', line 53

# Replaces every match of NEWLINES_REGEX (three or more consecutive
# newlines) with exactly two newlines.
#
# @param string [String] converted text
# @return [String] a new string; the argument is not modified
def remove_newlines(string)
  paragraph_break = "\n\n"
  string.gsub(NEWLINES_REGEX, paragraph_break)
end
#remove_section_attribute_newlines(string) ⇒ Object
49 50 51 |
# File 'lib/coradoc/html/input/cleaner.rb', line 49

# Removes the blank line between a line ending in +]+ and a following line
# that starts with +==+, so the two lines become adjacent.
#
# @param string [String] converted text
# @return [String] a new string; the argument is not modified
def remove_section_attribute_newlines(string)
  separated = "]\n\n=="
  adjacent = "]\n=="
  string.gsub(separated, adjacent)
end
#scrub_whitespace(string) ⇒ Object
119 120 121 122 123 124 125 126 |
# File 'lib/coradoc/html/input/cleaner.rb', line 119

# Normalises whitespace in raw HTML before it is converted.
#
# Steps, in order:
# 1. every spelling of a non-breaking space (the named entity, the numeric
#    entity in any letter case, or the literal U+00A0 character) is unified
#    to the numeric entity;
# 2. the text is passed through +Coradoc.strip_unicode+;
# 3. a run of spaces at the end of a line is collapsed to one space;
# 4. four consecutive newlines are reduced to two.
#
# NOTE(review): the entity spellings in the first pattern and the numeric
# entity used as its replacement were reconstructed from the "HTML encoded
# spaces" comment — confirm against the library source.
#
# @param string [String] raw HTML; mutated in place by the first step
# @return [String] the string returned by +Coradoc.strip_unicode+, further
#   modified in place by the later steps
def scrub_whitespace(string)
  string.gsub!(/&nbsp;|&#xA0;|\u00a0/i, '&#xA0;') # HTML encoded spaces
  string = Coradoc.strip_unicode(string) # Strip document-level leading and trailing whitespace
  string.gsub!(/( +)$/, ' ') # line trailing whitespace
  string.gsub!("\n\n\n\n", "\n\n") # Quadruple line breaks
  # string.delete!('?| ') # Unicode non-breaking spaces, injected as tabs
  string
end
#tidy(string) ⇒ Object
23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 |
# File 'lib/coradoc/html/input/cleaner.rb', line 23

# Runs the whole clean-up pipeline over converted text.
#
# A Hash is handled by tidying each of its values recursively (keys are kept).
# Anything else is copied with +String.new+ and then passed, in order, through
# inner-whitespace removal, newline collapsing, leading-newline removal,
# tag-border cleaning, punctuation cleaning, block-leading-newline removal and
# section-attribute-newline removal. The first five steps are wrapped in
# +HtmlConverter.track_time+ with a descriptive label.
#
# NOTE(review): +HtmlConverter.track_time+ is not visible here; this relies on
# it returning the value of the block it is given — confirm.
#
# @param string [String, Hash] text to tidy, or a hash whose values are tidied
# @return [String, Hash] the tidied text, or a hash of tidied values
def tidy(string)
  return string.transform_values { |value| tidy(value) } if string.is_a?(Hash)

  text = HtmlConverter.track_time('Removing inner whitespace') do
    remove_inner_whitespaces(String.new(string))
  end
  text = HtmlConverter.track_time('Removing newlines') { remove_newlines(text) }
  text = HtmlConverter.track_time('Removing leading newlines') { remove_leading_newlines(text) }
  text = HtmlConverter.track_time('Cleaning tag borders') { clean_tag_borders(text) }
  text = HtmlConverter.track_time('Cleaning punctuation characters') { clean_punctuation_characters(text) }

  # The last two steps are not timed.
  remove_section_attribute_newlines(remove_block_leading_newlines(text))
end