Class: Coradoc::ReverseAdoc::Cleaner
- Inherits:
-
Object
- Object
- Coradoc::ReverseAdoc::Cleaner
- Defined in:
- lib/coradoc/reverse_adoc/cleaner.rb
Instance Method Summary collapse
-
#clean_headings(string) ⇒ Object
Removes empty heading tags and rewrites superscript-styled headings as <sup> elements (a later addition by the author).
- #clean_punctuation_characters(string) ⇒ Object
-
#clean_tag_borders(string) ⇒ Object
Find non-asterisk content that is enclosed by two or more asterisks.
-
#preprocess_word_html(string) ⇒ Object
Preprocesses the input HTML (whitespace scrubbing, then heading cleanup) rather than postprocessing it.
- #remove_inner_whitespaces(string) ⇒ Object
- #remove_leading_newlines(string) ⇒ Object
- #remove_newlines(string) ⇒ Object
- #scrub_whitespace(string) ⇒ Object
- #tidy(string) ⇒ Object
Instance Method Details
#clean_headings(string) ⇒ Object
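Removes empty heading tags and rewrites headings styled with "vertical-align: super" as <sup> elements. This method is a later addition by the author.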
95 96 97 98 99 100 101 102 |
# File 'lib/coradoc/reverse_adoc/cleaner.rb', line 95

# Tidies heading markup in place and returns the same string object.
#
# Two substitutions are applied, in this order:
# 1. A heading element with no content (<hN ...></hN>) becomes a single space.
# 2. A heading whose attributes contain style="vertical-align: super;..." is
#    rewritten as <sup>...</sup> around its content. The content must sit on
#    one line, because the pattern's "." does not match a newline.
#
# @param string [String] mutable HTML text; it is modified by this call
# @return [String] the very object that was passed in
def clean_headings(string)
  empty_heading = %r{<h([1-9])[^>]*></h\1>}
  superscript_heading = %r{<h([1-9])[^>]* style="vertical-align: super;[^>]*>(.+?)</h\1>}

  # LibreOffice inserts empty headings for no known reason; they need to go.
  string.gsub!(empty_heading, " ")
  # LibreOffice also renders superscripts as headings; turn them back into <sup>.
  string.gsub!(superscript_heading) { "<sup>#{Regexp.last_match(2)}</sup>" }
  string
end
#clean_punctuation_characters(string) ⇒ Object
75 76 77 |
# File 'lib/coradoc/reverse_adoc/cleaner.rb', line 75

# Closes the gap between a formatting marker and the punctuation after it:
# a single whitespace character that separates **, ~~ or __ from a directly
# following ., !, ?, ' or " is dropped.
#
# @param string [String] text to scan; it is not modified
# @return [String] a new string with those gaps removed
def clean_punctuation_characters(string)
  marker_then_punctuation = /(\*\*|~~|__)\s([.!?'"])/

  string.gsub(marker_then_punctuation) do
    marker, punctuation = Regexp.last_match.captures
    "#{marker}#{punctuation}"
  end
end
#clean_tag_borders(string) ⇒ Object
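Find non-asterisk content that is enclosed by two or more asterisks. Ensure that only one whitespace occurs in the border area. Same for underscores and brackets. Note: in the listing below, the asterisk and underscore passes are commented out; only spans delimited by two or more tildes (~~) and bracketed spans ([...]) are actually processed.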
48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 |
# File 'lib/coradoc/reverse_adoc/cleaner.rb', line 48

# Trims the whitespace just inside delimiter borders.
#
# Active passes, in order:
# 1. Spans delimited by runs of two or more tildes: after stripping the match,
#    the first "~~ " and the first " ~~" lose their space.
# 2. Bracketed spans: after stripping the match, the first "[ " and the first
#    " ]" lose their space.
#
# Each match is routed through preserve_border_whitespaces, which is defined
# outside this listing (presumably it restores the whitespace surrounding the
# match -- confirm against its definition). The tilde pass passes
# Coradoc::ReverseAdoc.config.tag_border as default_border.
#
# @param string [String] text to clean; it is not modified
# @return [String] a new string with the cleaned borders
def clean_tag_borders(string)
  # Disabled passes for asterisk- and underscore-delimited spans, kept for reference:
  #
  # result = string.gsub(/\s?\*{2,}.*?\*{2,}\s?/) do |match|
  #   preserve_border_whitespaces(match, default_border: Coradoc::ReverseAdoc.config.tag_border) do
  #     match.strip.sub("** ", "**").sub(" **", "**")
  #   end
  # end
  #
  # result = string.gsub(/\s?_{2,}.*?_{2,}\s?/) do |match|
  #   preserve_border_whitespaces(match, default_border: Coradoc::ReverseAdoc.config.tag_border) do
  #     match.strip.sub("__ ", "__").sub(" __", "__")
  #   end
  # end

  tildes_cleaned = string.gsub(/\s?~{2,}.*?~{2,}\s?/) do |span|
    preserve_border_whitespaces(span, default_border: Coradoc::ReverseAdoc.config.tag_border) do
      span.strip.sub("~~ ", "~~").sub(" ~~", "~~")
    end
  end

  tildes_cleaned.gsub(/\s?\[.*?\]\s?/) do |span|
    preserve_border_whitespaces(span) { span.strip.sub("[ ", "[").sub(" ]", "]") }
  end
end
#preprocess_word_html(string) ⇒ Object
Preprocesses the input HTML rather than postprocessing it: works on a copy of the string, scrubbing whitespace first and then cleaning up headings.
80 81 82 |
# File 'lib/coradoc/reverse_adoc/cleaner.rb', line 80

# Preprocesses HTML rather than postprocessing it.
#
# Works on a duplicate of the input, so the caller's string is left alone:
# the copy is passed through scrub_whitespace and then clean_headings.
#
# @param string [String] raw HTML
# @return [Object] whatever clean_headings returns for the scrubbed copy
def preprocess_word_html(string)
  working_copy = string.dup
  scrubbed = scrub_whitespace(working_copy)
  clean_headings(scrubbed)
end
#remove_inner_whitespaces(string) ⇒ Object
29 30 31 32 33 34 35 36 37 38 39 40 41 42 |
# File 'lib/coradoc/reverse_adoc/cleaner.rb', line 29

# Normalises whitespace around stem:[...] macros and inside every line.
#
# Steps, in order:
# 1. A single space between a newline and "stem:[" is removed.
# 2. A newline directly after a stem:[...] macro, when followed by a
#    non-whitespace character, is replaced by one space.
# 3. Whitespace between a stem:[...] macro and a following "^" or "-" is removed.
# 4. Each line is stripped and every run of two or more spaces/tabs inside it
#    is collapsed to one space. The line is routed through
#    preserve_border_whitespaces, which is defined outside this listing
#    (presumably it restores the line's leading/trailing whitespace -- confirm).
#
# Steps 1-3 use gsub!, so a non-nil argument is modified in place.
#
# @param string [String, nil] text to clean
# @return [String] a new string with the cleaned lines; empty when +string+ is nil
def remove_inner_whitespaces(string)
  # The original guarded only the stem fix-ups against nil and then called
  # each_line unconditionally, which raised NoMethodError for nil input.
  return +"" if string.nil?

  string.gsub!(/\n stem:\[/, "\nstem:[")
  string.gsub!(/(stem:\[([^\]]|\\\])*\])\n(?=\S)/, "\\1 ")
  string.gsub!(/(stem:\[([^\]]|\\\])*\])\s+(?=[\^-])/, "\\1")

  string.each_line.with_object(+"") do |line, result|
    cleaned = preserve_border_whitespaces(line) do
      line.strip.gsub(/[ \t]{2,}/, " ")
    end
    result << cleaned
  end
end
#remove_leading_newlines(string) ⇒ Object
25 26 27 |
# File 'lib/coradoc/reverse_adoc/cleaner.rb', line 25

# Strips the run of newline characters at the very start of the text.
# Only "\n" is removed; other leading whitespace is kept.
#
# @param string [String] text to trim; it is not modified
# @return [String] a new string without leading newlines
def remove_leading_newlines(string)
  leading_newlines = /\A\n+/
  # The pattern is anchored to the start of the string, so it can match at
  # most once; a single substitution is sufficient.
  string.sub(leading_newlines, "")
end
#remove_newlines(string) ⇒ Object
21 22 23 |
# File 'lib/coradoc/reverse_adoc/cleaner.rb', line 21

# Collapses every run of three or more consecutive newlines into exactly two,
# i.e. at most one blank line remains between blocks of text.
#
# @param string [String] text to compact; it is not modified
# @return [String] a new string with the collapsed newline runs
def remove_newlines(string)
  excess_newlines = /\n{3,}/
  single_blank_line = "\n\n"

  string.gsub(excess_newlines, single_blank_line)
end
#scrub_whitespace(string) ⇒ Object
84 85 86 87 88 89 90 91 92 |
# scrub_whitespace(string): normalises whitespace in +string+ in place (every
# step uses a bang method) and returns that same string object.
#
# Steps, in the order they appear in the listing below:
# 1. gsub! with the "HTML encoded spaces" pattern (case-insensitive).
# 2. sub! removing [[:space:]] characters at the very start of the document.
# 3. sub! removing [[:space:]] characters at the very end of the document.
# 4. gsub! replacing a run of spaces at the end of a line with one space.
# 5. gsub! replacing four consecutive newlines with two.
# The delete! call near the end is commented out and therefore inactive.
#
# NOTE(review): as rendered here, the first pattern's alternatives and its
# replacement show up as bare whitespace. Given the /i flag and the
# "HTML encoded spaces" comment, the page extraction presumably decoded HTML
# entities that the real source spells out -- confirm against
# lib/coradoc/reverse_adoc/cleaner.rb before copying this line.
# File 'lib/coradoc/reverse_adoc/cleaner.rb', line 84 def scrub_whitespace(string) string.gsub!(/ | |\u00a0/i, " ") # HTML encoded spaces string.sub!(/^\A[[:space:]]+/m, "") # document leading whitespace string.sub!(/[[:space:]]+\z$/m, "") # document trailing whitespace string.gsub!(/( +)$/, " ") # line trailing whitespace string.gsub!(/\n\n\n\n/, "\n\n") # Quadruple line breaks # string.delete!('?| ') # Unicode non-breaking spaces, injected as tabs string end |
#tidy(string) ⇒ Object
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 |
# File 'lib/coradoc/reverse_adoc/cleaner.rb', line 3

# Runs the clean-up pipeline over +string+.
#
# The stages run in a fixed order, each receiving the previous stage's result:
# inner whitespace, newline runs, leading newlines, tag borders, punctuation.
# The first stage works on String.new(string), i.e. a copy of the argument.
# Every stage is wrapped in HtmlConverter.track_time with a descriptive label;
# track_time is defined outside this listing (presumably it times the block
# and returns the block's value -- confirm).
#
# @param string [String] converted text to tidy
# @return [Object] whatever the final track_time call returns
def tidy(string)
  stages = [
    ["Removing inner whitespace", ->(text) { remove_inner_whitespaces(String.new(text)) }],
    ["Removing newlines", ->(text) { remove_newlines(text) }],
    ["Removing leading newlines", ->(text) { remove_leading_newlines(text) }],
    ["Cleaning tag borders", ->(text) { clean_tag_borders(text) }],
    ["Cleaning punctuation characters", ->(text) { clean_punctuation_characters(text) }],
  ]

  stages.reduce(string) do |text, (label, stage)|
    HtmlConverter.track_time(label) { stage.call(text) }
  end
end