Module: Licensee::ContentHelper

Includes:
Constants, NormalizationMethods, SimilarityMethods
Included in:
License, ProjectFiles::LicenseFile
Defined in:
lib/licensee/content_helper.rb,
lib/licensee/content_helper/constants.rb,
lib/licensee/content_helper/similarity_methods.rb,
lib/licensee/content_helper/normalization_methods.rb

Overview

Text normalization, hashing, wrapping, and similarity helpers for license content.

Defined Under Namespace

Modules: Constants, NormalizationMethods, SimilarityMethods

Constant Summary

Constants included from Constants

Constants::DIGEST, Constants::END_OF_TERMS_REGEX, Constants::NORMALIZATIONS, Constants::REGEXES, Constants::START_REGEX, Constants::STRIP_METHODS, Constants::VARIETAL_WORDS

Class Method Summary collapse

Instance Method Summary collapse

Methods included from SimilarityMethods

#bigram_similarity, #similarity

Methods included from NormalizationMethods

#content_normalized, #content_without_title_and_version, #normalize_content

Class Method Details

.const_missing(const) ⇒ Object

Provides backwards compatibility for the legacy `*_REGEX` constants by resolving them from `REGEXES`, avoiding a breaking change.



49
50
51
52
# File 'lib/licensee/content_helper.rb', line 49

# Backwards-compatibility shim for the legacy `*_REGEX` constants.
#
# The missing constant's name is lower-cased and stripped of `_regex` to
# form a symbol key into REGEXES (e.g. `TITLE_REGEX` => `:title`). When
# REGEXES has no truthy entry for that key, resolution is handed to `super`.
#
# @param const [Symbol] name of the constant that could not be resolved
# @return [Object] the REGEXES entry stored under the derived key
def self.const_missing(const)
  legacy_key = const.to_s.downcase.gsub('_regex', '').to_sym
  pattern = REGEXES[legacy_key]
  return pattern if pattern

  super
end

.format_percent(float) ⇒ Object



80
81
82
# File 'lib/licensee/content_helper.rb', line 80

# Renders a number as a percentage string with exactly two decimal places,
# e.g. 98.5 => "98.50%". The value is not scaled; only the "%" is appended.
#
# @param float [Numeric] value to render
# @return [String] the formatted value followed by a percent sign
def self.format_percent(float)
  two_decimals = format('%<float>.2f', float: float)
  "#{two_decimals}%"
end

.normalize_for_wrapping(text) ⇒ Object



63
64
65
66
67
68
# File 'lib/licensee/content_helper.rb', line 63

# Prepares text for re-wrapping by collapsing hard-wrapped lines.
#
# Matches of REGEXES[:bullet] are first padded with a newline on each side.
# Then every newline that sits directly between two non-newline characters
# is replaced by a single space. Runs of two or more newlines are untouched.
#
# The input is never mutated and frozen strings are accepted. The earlier
# `clone` + `gsub!` approach raised FrozenError on frozen input, because
# `clone` preserves the frozen state.
#
# @param text [String] raw text
# @return [String] a new string with single line breaks replaced by spaces
def self.normalize_for_wrapping(text)
  padded = text.gsub(REGEXES[:bullet]) { |marker| "\n#{marker}\n" }

  # Lookarounds (instead of consuming the neighbouring characters) let
  # consecutive one-character lines such as "a\nb\nc" be joined in full;
  # a consuming pattern skips every other newline in that case.
  padded.gsub(/(?<=[^\n])\n(?=[^\n])/, ' ')
end

.title_regex ⇒ Object



84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
# File 'lib/licensee/content_helper.rb', line 84

def self.title_regex
  @title_regex ||= begin
    licenses = Licensee::License.all(hidden: true, pseudo: false)
    titles = licenses.map(&:title_regex)

    # Title regex must include the version to support matching within
    # families, but for sake of normalization, we can be less strict
    without_versions = licenses.map do |license|
      next if license.title == license.name_without_version

      Regexp.new Regexp.escape(license.name_without_version), 'i'
    end
    titles.concat(without_versions.compact)

    /#{START_REGEX}\(?(?:the )?#{Regexp.union titles}.*?$/i
  end
end

.wrap(text, line_width = 80) ⇒ Object

Wrap text to the given line length



55
56
57
58
59
60
61
# File 'lib/licensee/content_helper.rb', line 55

# Wrap text to the given line length.
#
# The text is passed through normalize_for_wrapping, re-wrapped with
# wrap_lines, and stripped of leading/trailing whitespace.
#
# @param text [String, nil] text to wrap
# @param line_width [Integer] line width handed to wrap_lines (default 80)
# @return [String, nil] the wrapped text, or nil when text is nil
def self.wrap(text, line_width = 80)
  return if text.nil?

  wrap_lines(normalize_for_wrapping(text), line_width).strip
end

.wrap_line(line, line_width) ⇒ Object



74
75
76
77
78
# File 'lib/licensee/content_helper.rb', line 74

# Wraps a single line so that each piece is at most line_width characters
# where a whitespace break is available.
#
# Lines matching REGEXES[:hrs], and lines that already fit, are returned
# unchanged.
#
# @param line [String] one line of text (no embedded newlines expected)
# @param line_width [Integer] maximum width of each wrapped piece
# @return [String] the line itself, or the line broken up with "\n"
def self.wrap_line(line, line_width)
  leave_untouched = line.match?(REGEXES[:hrs]) || line.length <= line_width
  return line if leave_untouched

  # Greedily take up to line_width characters that end at whitespace or at
  # the end of the line, and terminate each such piece with a newline.
  piece = /(.{1,#{line_width}})(\s+|$)/
  line.gsub(piece, "\\1\n").strip
end

.wrap_lines(text, line_width) ⇒ Object



70
71
72
# File 'lib/licensee/content_helper.rb', line 70

# Wraps each newline-separated line of the text independently with
# wrap_line, then joins the results back together with newlines.
#
# @param text [String] text to wrap
# @param line_width [Integer] width passed through to wrap_line
# @return [String] the re-joined, wrapped text
def self.wrap_lines(text, line_width)
  source_lines = text.split("\n")
  wrapped_lines = source_lines.map { |source_line| wrap_line(source_line, line_width) }
  wrapped_lines.join("\n")
end

Instance Method Details

#bigrams ⇒ Object

A set of consecutive word pairs (bigrams) in the license, without duplicates. Unlike wordset, bigrams are order-sensitive, making similarity scores robust against adversarial word scrambling (see GitHub issue #602).



23
24
25
26
27
28
29
# File 'lib/licensee/content_helper.rb', line 23

# A set of consecutive word pairs (bigrams), each stored as a single
# "first second" string, without duplicates. Because the pairs follow the
# order of `words`, the set is order-sensitive.
#
# The set is empty when `words` is nil or has fewer than two entries.
# The result is memoized in @bigrams.
#
# @return [Set<String>]
def bigrams
  return @bigrams if @bigrams

  pairs = Set.new
  unless words.nil? || words.length < 2
    words.each_cons(2) { |left, right| pairs << "#{left} #{right}" }
  end
  @bigrams = pairs
end

#content_hash ⇒ Object

SHA1 of the normalized content



44
45
46
# File 'lib/licensee/content_helper.rb', line 44

# Hex digest of the normalized content, computed with DIGEST and memoized
# in @content_hash.
#
# @return [String] hexadecimal digest of `content_normalized`
def content_hash
  return @content_hash if @content_hash

  @content_hash = DIGEST.hexdigest(content_normalized)
end

#length ⇒ Object

Number of characters in the normalized content



32
33
34
35
36
# File 'lib/licensee/content_helper.rb', line 32

# Number of characters in the normalized content.
#
# @return [Integer] length of `content_normalized`, or 0 when it is nil/false
def length
  content_normalized ? content_normalized.length : 0
end

#length_delta(other) ⇒ Object

Given another license or project file, calculates the difference in length



39
40
41
# File 'lib/licensee/content_helper.rb', line 39

# Given another license or project file, calculates the absolute difference
# between this object's length and the other's.
#
# @param other [#length] object to compare against
# @return [Integer] non-negative difference in length
def length_delta(other)
  own_length = length
  other_length = other.length
  (own_length - other_length).abs
end

#wordset ⇒ Object

A set of each word in the license, without duplicates



16
17
18
# File 'lib/licensee/content_helper.rb', line 16

# A set of each word in the license, without duplicates. Memoized in
# @wordset once a set has been built.
#
# @return [Set, nil] the unique words, or nil when `words` is nil
def wordset
  return @wordset if @wordset

  tokens = words
  @wordset = tokens.nil? ? nil : tokens.to_set
end