Class: SemanticTextChunker::Splitters::SentenceSplitter

Inherits:
Object
  • Object
show all
Defined in:
lib/semantic_text_chunker/splitters/sentence_splitter.rb

Constant Summary collapse

# Abbreviations, written WITHOUT their trailing period, whose following
# period should not be treated as the end of a sentence. Entries that
# contain inner dots (e.g. "e.g", "U.S.A") are stored with those dots.
ABBREVS =
%w[Mr Mrs Dr Prof Sr Jr vs etc e.g i.e U.S U.K U.S.A Fig Vol No].freeze
# Matches any ABBREVS entry that starts at a word boundary and is followed by
# a literal period and exactly one whitespace character. Capture group 1 holds
# the abbreviation without the trailing period. Each entry goes through
# Regexp.escape so its inner dots match literally. The match is case-sensitive.
# Note that the trailing whitespace character is part of the match (it is
# consumed, not a lookahead).
ABBREV_PATTERN =
/\b(#{ABBREVS.map { |a| Regexp.escape(a) }.join("|")})\.\s/

Instance Method Summary collapse

Instance Method Details

#split(text) ⇒ Object



7
8
9
10
11
12
13
14
15
16
17
# File 'lib/semantic_text_chunker/splitters/sentence_splitter.rb', line 7

# Splits +text+ into sentences.
#
# A sentence boundary is a run of whitespace that follows ".", "?" or "!"
# and is followed by an ASCII uppercase letter. Periods that belong to a
# known abbreviation (see ABBREV_PATTERN) are shielded first so that
# "Dr. Smith" is not cut in two.
#
# @param text [String] the text to split
# @return [Array<String>] the sentences, each stripped of surrounding
#   whitespace; blank pieces are dropped, so an empty input yields []
def split(text)
  # Delimited by an ASCII control character so the marker cannot collide with
  # ordinary printable text already present in the input (a plain-text marker
  # such as "__ABBREV__" would be rewritten to "." if the input contained it).
  marker = "\u001FABBREV\u001F"

  # Hide each abbreviation's period. ABBREV_PATTERN also consumes the single
  # whitespace character after the period; re-emit that exact character
  # (hit[-1]) instead of a hard-coded space so newlines/tabs are preserved.
  shielded = text.gsub(ABBREV_PATTERN) do |hit|
    "#{Regexp.last_match(1)}#{marker}#{hit[-1]}"
  end

  shielded
    .split(/(?<=[.?!])\s+(?=[A-Z])/)
    .map { |sentence| sentence.gsub(marker, ".").strip }
    .reject(&:empty?)
end