Class: Coradoc::ReverseAdoc::Cleaner

Inherits:
Object
  • Object
show all
Defined in:
lib/coradoc/reverse_adoc/cleaner.rb

Instance Method Summary collapse

Instance Method Details

#clean_headings(string) ⇒ Object

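Added by the author: cleans up heading markup by removing empty headings and turning superscript-styled headings into <sup> elements.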



94
95
96
97
98
99
100
101
# File 'lib/coradoc/reverse_adoc/cleaner.rb', line 94

# Cleans up heading markup in place and returns the string.
#
# Two substitutions are applied to +string+ itself (via +gsub!+):
#   1. an empty heading element (<hN ...></hN>, N in 1..9) becomes a single space;
#   2. a heading whose attributes contain style="vertical-align: super;...
#      becomes <sup>...</sup> wrapped around the heading's content.
#
# @param string [String] HTML to clean; it is modified in place
# @return [String] the same string object that was passed in
def clean_headings(string)
  empty_heading = %r{<h([1-9])[^>]*></h\1>}
  superscript_heading =
    %r{<h([1-9])[^>]* style="vertical-align: super;[^>]*>(.+?)</h\1>}

  # Libre Office inserts empty headings for no apparent reason; they need to go.
  string.gsub!(empty_heading, " ")

  # Libre Office also renders superscripts as headings; turn them back
  # into <sup> elements (\\2 is the heading's inner content).
  string.gsub!(superscript_heading, "<sup>\\2</sup>")

  string
end

#clean_punctuation_characters(string) ⇒ Object



75
76
77
# File 'lib/coradoc/reverse_adoc/cleaner.rb', line 75

# Removes the single whitespace character that separates a formatting
# marker (**, ~~ or __) from a directly following punctuation character
# (one of . ! ? ' ").
#
# @param string [String] text to clean; not modified
# @return [String] a new string with those whitespace characters removed
def clean_punctuation_characters(string)
  marker_space_punctuation = /(\*\*|~~|__)\s([.!?'"])/
  # \\1 is the marker, \\2 the punctuation; the whitespace between is dropped.
  string.gsub(marker_space_punctuation, "\\1\\2")
end

#clean_tag_borders(string) ⇒ Object

Find content that is enclosed by two or more tildes and ensure that only one whitespace occurs in the border area. The same is done for brackets. (The equivalent passes for asterisks and underscores are currently commented out in the method body.)



48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
# File 'lib/coradoc/reverse_adoc/cleaner.rb', line 48

# Tidies the whitespace just inside ~~...~~ spans and [...] spans.
#
# Each match (including at most one surrounding whitespace character on
# either side) is handed to +preserve_border_whitespaces+ together with a
# block that strips the match and removes one space after the opening
# delimiter and one space before the closing delimiter.
#
# NOTE(review): +preserve_border_whitespaces+ is defined outside this
# method; presumably it re-attaches the outer whitespace of the match
# around the block's result -- confirm against its definition.
#
# @param string [String] text to clean; not modified
# @return [String] a new string with both passes applied
def clean_tag_borders(string)
  # Disabled passes for ** and __ spans, kept for reference:
  #
  # result = string.gsub(/\s?\*{2,}.*?\*{2,}\s?/) do |match|
  # preserve_border_whitespaces(match, default_border: Coradoc::ReverseAdoc.config.tag_border) do
  #   match.strip.sub("** ", "**").sub(" **", "**")
  # end
  # end
  #
  # result = string.gsub(/\s?_{2,}.*?_{2,}\s?/) do |match|
  #   preserve_border_whitespaces(match, default_border: Coradoc::ReverseAdoc.config.tag_border) do
  #     match.strip.sub("__ ", "__").sub(" __", "__")
  #   end
  # end

  tilde_span = /\s?~{2,}.*?~{2,}\s?/
  bracket_span = /\s?\[.*?\]\s?/

  # Pass 1: ~~...~~ spans, using the configured tag border as default.
  # The config is read per match, exactly where the span is processed.
  tildes_cleaned = string.gsub(tilde_span) do |fragment|
    border = Coradoc::ReverseAdoc.config.tag_border
    preserve_border_whitespaces(fragment, default_border: border) do
      fragment.strip.sub("~~ ", "~~").sub(" ~~", "~~")
    end
  end

  # Pass 2: [...] spans, with no explicit default border.
  tildes_cleaned.gsub(bracket_span) do |fragment|
    preserve_border_whitespaces(fragment) do
      fragment.strip.sub("[ ", "[").sub(" ]", "]")
    end
  end
end

#preprocess_word_html(string) ⇒ Object

Preprocesses the HTML input rather than postprocessing it.



80
81
82
# File 'lib/coradoc/reverse_adoc/cleaner.rb', line 80

# Runs the HTML preprocessing steps on a copy of the input: first
# +scrub_whitespace+, then +clean_headings+ on its result.
#
# @param string [String] HTML input; the caller's object is left untouched
#   because the helpers operate on a +dup+
# @return [Object] whatever +clean_headings+ returns for the scrubbed copy
def preprocess_word_html(string)
  working_copy = string.dup
  scrubbed = scrub_whitespace(working_copy)
  clean_headings(scrubbed)
end

#remove_inner_whitespaces(string) ⇒ Object



29
30
31
32
33
34
35
36
37
38
39
40
41
42
# File 'lib/coradoc/reverse_adoc/cleaner.rb', line 29

# Normalizes whitespace around +stem:[...]+ macros and collapses runs of
# spaces/tabs inside every line.
#
# Steps:
#   1. On +string+ itself (in place):
#      - a single space between a newline and "stem:[" is removed;
#      - a newline directly after a "stem:[...]" macro that is followed by
#        a non-whitespace character becomes a single space;
#      - whitespace after a "stem:[...]" macro that is followed by "^" or
#        "-" is removed.
#   2. Each line is stripped, runs of two or more spaces/tabs become one
#      space, and the line is passed through +preserve_border_whitespaces+
#      before being appended to the result.
#
# NOTE(review): +preserve_border_whitespaces+ is defined outside this
# method; presumably it restores the line's leading/trailing whitespace
# around the block's result -- confirm against its definition.
#
# @param string [String, nil] text to clean; a non-nil string is modified
#   in place by step 1
# @return [String] a new string with the cleaned lines; an empty string
#   when +string+ is nil
def remove_inner_whitespaces(string)
  # The original guarded only the gsub! calls against nil and then called
  # each_line on nil, raising NoMethodError. Nil has no lines, so the
  # result is the same empty string the loop would produce for no input.
  return +"" if string.nil?

  string.gsub!(/\n stem:\[/, "\nstem:[")
  string.gsub!(/(stem:\[([^\]]|\\\])*\])\n(?=\S)/, "\\1 ")
  string.gsub!(/(stem:\[([^\]]|\\\])*\])\s+(?=[\^-])/, "\\1")

  string.each_line.each_with_object(+"") do |line, result|
    cleaned = preserve_border_whitespaces(line) do
      line.strip.gsub(/[ \t]{2,}/, " ")
    end
    result << cleaned
  end
end

#remove_leading_newlines(string) ⇒ Object



25
26
27
# File 'lib/coradoc/reverse_adoc/cleaner.rb', line 25

# Drops every newline character at the very start of the string.
#
# @param string [String] text to clean; not modified
# @return [String] a new string without leading newlines
def remove_leading_newlines(string)
  leading_newlines = /\A\n+/
  # The pattern is anchored at the start of the string, so it can match
  # at most once.
  string.sub(leading_newlines, "")
end

#remove_newlines(string) ⇒ Object



21
22
23
# File 'lib/coradoc/reverse_adoc/cleaner.rb', line 21

# Collapses every run of three or more consecutive newlines into exactly
# two newlines.
#
# @param string [String] text to clean; not modified
# @return [String] a new string with the newline runs collapsed
def remove_newlines(string)
  excess_newlines = /\n{3,}/
  string.gsub(excess_newlines, "\n\n")
end

#scrub_whitespace(string) ⇒ Object



84
85
86
87
88
89
90
91
# File 'lib/coradoc/reverse_adoc/cleaner.rb', line 84

# Normalizes whitespace in an HTML string.
#
# Steps, in order:
#   1. every "&nbsp;", "&#xA0;" (case-insensitively) or literal U+00A0
#      character in +string+ is rewritten, in place, to "&#xA0;";
#   2. the string is passed through +Coradoc.strip_unicode+ (per the
#      original comment: strips document-level leading and trailing
#      whitespace -- the helper is defined outside this method);
#   3. on that result, spaces at the end of a line are reduced to one space;
#   4. on that result, four consecutive newlines are reduced to two.
#
# @param string [String] HTML to scrub; step 1 modifies it in place
# @return [String] the object returned by +Coradoc.strip_unicode+ after
#   steps 3 and 4 have been applied to it in place
def scrub_whitespace(string)
  # HTML encoded spaces
  string.gsub!(/&nbsp;|&#xA0;|\u00a0/i, "&#xA0;")

  Coradoc.strip_unicode(string).tap do |scrubbed|
    scrubbed.gsub!(/( +)$/, " ")        # line trailing whitespace
    scrubbed.gsub!(/\n\n\n\n/, "\n\n")  # quadruple line breaks
    # Disabled in the original:
    # string.delete!('?| ')               # Unicode non-breaking spaces, injected as tabs
  end
end

#tidy(string) ⇒ Object



3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
# File 'lib/coradoc/reverse_adoc/cleaner.rb', line 3

# Runs the full cleanup pipeline over a copy of +string+.
#
# Each stage is wrapped in +HtmlConverter.track_time+ with a descriptive
# label, and the value returned by +track_time+ is fed into the next stage:
#   1. remove_inner_whitespaces (on +String.new(string)+)
#   2. remove_newlines
#   3. remove_leading_newlines
#   4. clean_tag_borders
#   5. clean_punctuation_characters
#
# NOTE(review): this relies on +HtmlConverter.track_time+ returning the
# value of its block -- it is defined outside this method; confirm.
#
# @param string [String] text to tidy; the pipeline works on a copy
# @return [Object] the value returned by the final +track_time+ call
def tidy(string)
  text = HtmlConverter.track_time("Removing inner whitespace") do
    remove_inner_whitespaces(String.new(string))
  end

  text = HtmlConverter.track_time("Removing newlines") do
    remove_newlines(text)
  end

  text = HtmlConverter.track_time("Removing leading newlines") do
    remove_leading_newlines(text)
  end

  text = HtmlConverter.track_time("Cleaning tag borders") do
    clean_tag_borders(text)
  end

  HtmlConverter.track_time("Cleaning punctuation characters") do
    clean_punctuation_characters(text)
  end
end