10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
|
# File 'lib/heitt/scanner.rb', line 10
# Scans text (or the contents of a file) for strings matching the database
# patterns and ranks the candidate identifications for each distinct match.
#
# @param input [String] path to an existing regular file, whose contents are
#   scanned; any other string is scanned as literal text
# @param database [Enumerable] entries understood by +get_regex+ and +get_modes+
# @param profiles [Object] forwarded to +HEITT::Analyzer.analyze+
# @param min_entropy [Numeric] threshold forwarded to
#   +HEITT::Analyzer.high_entropy?+ for matches of 8 or more characters
# @return [Array<Hash>] one +{hash:, candidates:}+ hash per distinct matched
#   string; candidates are unique by +:name+, sorted by descending +:score+,
#   and each carries a +:confidence+ value
def self.scan(input, database: HEITT::DATABASE, profiles: HEITT::PROFILES, min_entropy: 3.5)
  # Literal text is not always a legal path: expand_path raises ArgumentError
  # for NUL bytes or an unknown "~user". Such input is treated as plain text.
  path = begin
    File.expand_path(input)
  rescue ArgumentError
    nil
  end
  # File.file? (not File.exist?) so a directory name is scanned as text
  # instead of failing inside File.read.
  text = path && File.file?(path) ? File.read(path) : input

  context_scores = HEITT::Analyzer.analyze(text, profiles: profiles)
  found = {}

  database.each do |entry|
    regex = get_regex(entry)
    modes = get_modes(entry)
    next unless regex && modes && !modes.empty?

    pattern = regex.is_a?(Regexp) ? regex : Regexp.new(regex)
    scanner = StringScanner.new(text)

    while scanner.scan_until(pattern)
      matched = scanner.matched

      # A zero-width match leaves the scan pointer where it was; step over one
      # character (or stop at end of input) so the loop always makes progress.
      if matched.empty?
        break if scanner.eos?

        scanner.getch
        next
      end

      # Matches shorter than 8 characters bypass the entropy filter.
      next if matched.length >= 8 && !HEITT::Analyzer.high_entropy?(matched, min_entropy)

      # Character offset of the match start. charpos and String#length are both
      # in characters, so this stays correct for multibyte text.
      # NOTE(review): assumes the analyzer indexes text by character - confirm.
      offset = scanner.charpos - matched.length

      HEITT::Logger.debug("Extracting prefix..")
      # NOTE(review): the method name looks to be missing from this listing;
      # as written, `.()` invokes HEITT::Analyzer.call - confirm against source.
      delim_prefix = HEITT::Analyzer.(text, offset)
      HEITT::Logger.debug("Extracted prefix: #{delim_prefix.length <= 1 ? "NULL" : delim_prefix}")

      candidates = HEITT::Analyzer.score_candidates(modes, delim_prefix, context_scores)
      found[matched] ||= {hash: matched, candidates: []}
      found[matched][:candidates].concat(candidates)
    end
  end

  found.each_value do |result|
    # Keep only the best-scoring candidate per name, highest score first.
    ranked = result[:candidates]
             .group_by { |c| c[:name] }
             .map { |_name, dupes| dupes.max_by { |c| c[:score] } }
             .sort_by { |c| -c[:score] }

    scores_hash = ranked.map { |c| [c[:name], c[:score]] }.to_h
    confidences = HEITT::Analyzer.assign_confidence(scores_hash)
    result[:candidates] = ranked.map { |c| c.merge(confidence: confidences[c[:name]]) }
  end

  found.values
end
|