Class: Coradoc::Input::Html::Cleaner

Inherits:

Object

Object
Coradoc::Input::Html::Cleaner

show all

Defined in: lib/coradoc/html/input/cleaner.rb

Constant Summary collapse

INNER_WHITESPACE_REGEX_1 = Pre-compiled regexes for performance

/\n stem:\[/

# Matches a `stem:[...]` macro (an escaped `\]` is allowed inside) that is
# followed by a newline and then a non-whitespace character. Capture 1 is
# the macro itself.
INNER_WHITESPACE_REGEX_2 =

/(stem:\[([^\]]|\\\])*\])\n(?=\S)/

# Matches a `stem:[...]` macro followed by whitespace when the next
# character is `^` or `-`. Capture 1 is the macro itself.
INNER_WHITESPACE_REGEX_3 =

/(stem:\[([^\]]|\\\])*\])\s+(?=[\^-])/

# Three or more consecutive newlines.
NEWLINES_REGEX =

/\n{3,}/

# One or more newlines at the very start of the string.
LEADING_NEWLINE_REGEX =

/\A\n+/

# A run of ASCII whitespace (space, tab, CR, LF) anywhere in the string.
WHITESPACE_REGEX =

/[ \t\r\n]+/

# A run of ASCII whitespace at the very end of the string.
TRAILING_WHITESPACE_REGEX =

/[ \t\r\n]+\z/

# Two or more consecutive spaces or tabs.
MULTIPLE_WHITESPACE_REGEX =

/[ \t]{2,}/

# Patterns keyed by marker type. The asterisk, underscore and tilde entries
# match an optional whitespace character followed by two or more marker
# characters; the bracket entry matches a `[...]` span with optional
# whitespace on either side.
TAG_BORDER_REGEXES =

{
  asterisk: /\s?\*{2,}/,
  underscore: /\s?_{2,}/,
  tilde: /\s?~{2,}/,
  bracket: /\s?\[.*?\]\s?/
}.freeze

Instance Method Summary collapse

#clean_headings(string) ⇒ Object

following added by me.
#clean_punctuation_characters(string) ⇒ Object
#clean_tag_borders(string) ⇒ Object

Find non-asterisk content that is enclosed by two or more asterisks.
#preprocess_word_html(string) ⇒ Object

preprocesses HTML, rather than postprocessing it.
#remove_block_leading_newlines(string) ⇒ Object
#remove_inner_whitespaces(string) ⇒ Object
#remove_leading_newlines(string) ⇒ Object
#remove_newlines(string) ⇒ Object
#remove_section_attribute_newlines(string) ⇒ Object
#scrub_whitespace(string) ⇒ Object
#tidy(string) ⇒ Object

Instance Method Details

#clean_headings(string) ⇒ `Object`

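Removes empty heading elements and rewrites headings styled with `vertical-align: super` as `<sup>` elements (a workaround for LibreOffice output). This method is a later addition to the original cleaner.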

# File 'lib/coradoc/html/input/cleaner.rb', line 129

# Cleans up heading markup that LibreOffice emits for non-heading content.
#
# The string is modified in place (it must not be frozen) and returned.
#
# @param string [String] HTML source to clean
# @return [String] the same string object after cleanup
def clean_headings(string)
  # LibreOffice inserts empty headings for no apparent reason; each one is
  # replaced with a single space.
  empty_heading = %r{<h([1-9])[^>]*></h\1>}
  string.gsub!(empty_heading, ' ')

  # LibreOffice also renders superscripts as headings carrying a
  # "vertical-align: super" style; those become <sup> elements.
  superscript_heading = %r{<h([1-9])[^>]* style="vertical-align: super;[^>]*>(.+?)</h\1>}
  string.gsub!(superscript_heading, '<sup>\\2</sup>')

  string
end

#clean_punctuation_characters(string) ⇒ `Object`



110
111
112

# File 'lib/coradoc/html/input/cleaner.rb', line 110

# Removes the single whitespace character that separates a `**`, `~~` or
# `__` marker from a following punctuation character (. ! ? ' ").
#
# @param string [String] text to clean
# @return [String] a new string with the separating whitespace removed
def clean_punctuation_characters(string)
  string.gsub(/(\*\*|~~|__)\s([.!?'"])/) do
    marker = Regexp.last_match(1)
    punctuation = Regexp.last_match(2)
    "#{marker}#{punctuation}"
  end
end

#clean_tag_borders(string) ⇒ `Object`

Finds content that is enclosed by two or more tildes, or by square brackets, and ensures that only one whitespace occurs in the border area. The equivalent handling for asterisks and underscores is currently commented out in the implementation.

# File 'lib/coradoc/html/input/cleaner.rb', line 81

# Tightens the whitespace just inside `~~...~~` and `[...]` spans.
#
# Each matched span is handed to +preserve_border_whitespaces+ together
# with a block that strips the span and removes one space directly inside
# its opening and closing marker. Tilde spans additionally pass the
# configured +tag_border+ as +default_border+.
#
# NOTE: asterisk (`**`) and underscore (`__`) spans used to be processed
# the same way; that handling is disabled.
#
# @param string [String] text to clean
# @return [String] a new string with cleaned span borders
def clean_tag_borders(string)
  tildes_cleaned = string.gsub(/\s?~{2,}.*?~{2,}\s?/) do |span|
    border = Coradoc::Html::Input.config.tag_border
    preserve_border_whitespaces(span, default_border: border) do
      span.strip.sub('~~ ', '~~').sub(' ~~', '~~')
    end
  end

  tildes_cleaned.gsub(/\s?\[.*?\]\s?/) do |span|
    preserve_border_whitespaces(span) { span.strip.sub('[ ', '[').sub(' ]', ']') }
  end
end

#preprocess_word_html(string) ⇒ `Object`

Preprocesses Word HTML before conversion (scrubs whitespace, then cleans headings), rather than postprocessing the converted output.



115
116
117

# File 'lib/coradoc/html/input/cleaner.rb', line 115

# Prepares raw HTML before conversion: scrubs whitespace, then cleans
# headings. Works on a duplicate, so the caller's string is not modified.
#
# @param string [String] raw HTML
# @return [String] the preprocessed copy
def preprocess_word_html(string)
  working_copy = string.dup
  scrubbed = scrub_whitespace(working_copy)
  clean_headings(scrubbed)
end

#remove_block_leading_newlines(string) ⇒ `Object`



45
46
47

# File 'lib/coradoc/html/input/cleaner.rb', line 45

# Removes the blank line that directly follows a `****` delimiter line
# when that delimiter comes right after a line ending in `]`.
#
# @param string [String] text to clean
# @return [String] a new string without those blank lines
def remove_block_leading_newlines(string)
  delimiter_then_blank_line = "]\n****\n\n"
  delimiter_only = "]\n****\n"
  string.gsub(delimiter_then_blank_line, delimiter_only)
end

#remove_inner_whitespaces(string) ⇒ `Object`

# File 'lib/coradoc/html/input/cleaner.rb', line 61

# Normalizes whitespace around `stem:[...]` macros and inside every line.
#
# The stem fix-ups modify +string+ in place (it must not be frozen); the
# per-line cleanup is accumulated into a new string, which is returned.
#
# @param string [String, nil] text to clean
# @return [String] the cleaned text; an empty string when +string+ is nil
def remove_inner_whitespaces(string)
  # Nothing to clean; also avoids calling each_line on nil.
  return +'' if string.nil?

  # Drop the stray space in front of a stem macro that starts a line.
  string.gsub!(INNER_WHITESPACE_REGEX_1, "\nstem:[")
  # A line break right after a stem macro becomes a single space ...
  string.gsub!(INNER_WHITESPACE_REGEX_2, '\\1 ')
  # ... while whitespace between a stem macro and a following ^ or - goes away.
  string.gsub!(INNER_WHITESPACE_REGEX_3, '\\1')

  string.each_line.with_object(+'') do |line, result|
    result << preserve_border_whitespaces(line) do
      # Use ASCII-only strip to preserve CJK fullwidth spaces
      line.gsub(/\A[ \t\r\n]+/, '').gsub(/[ \t\r\n]+\z/, '').gsub(/[ \t]{2,}/, ' ')
    end
  end
end

#remove_leading_newlines(string) ⇒ `Object`



57
58
59

# File 'lib/coradoc/html/input/cleaner.rb', line 57

# Drops every newline at the very beginning of the text.
#
# @param string [String] text to clean
# @return [String] a new string that does not start with a newline
def remove_leading_newlines(string)
  # The pattern is anchored with \A, so it can match at most once and a
  # single substitution is sufficient.
  string.sub(LEADING_NEWLINE_REGEX, '')
end

#remove_newlines(string) ⇒ `Object`



53
54
55

# File 'lib/coradoc/html/input/cleaner.rb', line 53

# Collapses every run matched by NEWLINES_REGEX into exactly two newlines.
#
# @param string [String] text to clean
# @return [String] a new string with the runs collapsed
def remove_newlines(string)
  paragraph_break = "\n\n"
  string.gsub(NEWLINES_REGEX) { paragraph_break }
end

#remove_section_attribute_newlines(string) ⇒ `Object`



49
50
51

# File 'lib/coradoc/html/input/cleaner.rb', line 49

# Removes the blank line between a line ending in `]` and a following
# line that starts with `==`.
#
# @param string [String] text to clean
# @return [String] a new string with `]` and `==` on adjacent lines
def remove_section_attribute_newlines(string)
  separated = "]\n\n=="
  adjacent = "]\n=="
  string.gsub(separated, adjacent)
end

#scrub_whitespace(string) ⇒ `Object`

# File 'lib/coradoc/html/input/cleaner.rb', line 119

# Normalizes whitespace in raw HTML.
#
# The non-breaking-space normalization modifies +string+ in place (it must
# not be frozen). The remaining steps operate on whatever
# Coradoc.strip_unicode returns, and that object is the return value.
#
# @param string [String] raw HTML
# @return [String] the scrubbed text
def scrub_whitespace(string)
  # Rewrite every spelling of a non-breaking space (named entity, numeric
  # entity in any letter case, or the literal character) as one entity.
  string.gsub!(/&nbsp;|&#xA0;|\u00a0/i, '&#xA0;')

  # Per the original comment this strips document-level leading and
  # trailing whitespace; the helper is defined outside this method.
  scrubbed = Coradoc.strip_unicode(string)

  # A run of spaces at the end of a line is reduced to one space (it is
  # collapsed, not removed).
  scrubbed.gsub!(/( +)$/, ' ')
  # Quadruple line breaks become double line breaks.
  scrubbed.gsub!("\n\n\n\n", "\n\n")
  # A former step that deleted non-breaking spaces injected as tabs is
  # disabled.
  scrubbed
end

#tidy(string) ⇒ `Object`

# File 'lib/coradoc/html/input/cleaner.rb', line 23

# Runs the complete cleanup pipeline over converted text.
#
# A Hash is processed value by value (recursively) and a new Hash with the
# same keys is returned. Any other input is copied with String.new and the
# copy is passed through each cleanup step in turn. The first five steps
# run inside HtmlConverter.track_time with a label (presumably for timing
# reports; that helper is defined elsewhere).
#
# @param string [String, Hash] text, or a hash whose values are text/hashes
# @return [String, Hash] the tidied text, or a hash of tidied values
def tidy(string)
  return string.transform_values { |value| tidy(value) } if string.is_a?(Hash)

  text = HtmlConverter.track_time('Removing inner whitespace') do
    remove_inner_whitespaces(String.new(string))
  end
  text = HtmlConverter.track_time('Removing newlines') { remove_newlines(text) }
  text = HtmlConverter.track_time('Removing leading newlines') { remove_leading_newlines(text) }
  text = HtmlConverter.track_time('Cleaning tag borders') { clean_tag_borders(text) }
  text = HtmlConverter.track_time('Cleaning punctuation characters') do
    clean_punctuation_characters(text)
  end

  # The last two steps are not timed.
  remove_section_attribute_newlines(remove_block_leading_newlines(text))
end

Class: Coradoc::Input::Html::Cleaner

Constant Summary collapse

Instance Method Summary collapse

Instance Method Details

#clean_headings(string) ⇒ Object

#clean_punctuation_characters(string) ⇒ Object

#clean_tag_borders(string) ⇒ Object

#preprocess_word_html(string) ⇒ Object

#remove_block_leading_newlines(string) ⇒ Object

#remove_inner_whitespaces(string) ⇒ Object

#remove_leading_newlines(string) ⇒ Object

#remove_newlines(string) ⇒ Object

#remove_section_attribute_newlines(string) ⇒ Object

#scrub_whitespace(string) ⇒ Object

#tidy(string) ⇒ Object