philiprehberger-encoding_kit

Tests Gem Version Last updated

Character encoding detection, conversion, and normalization

Requirements

  • Ruby >= 3.1

Installation

Add to your Gemfile:

gem "philiprehberger-encoding_kit"

Or install directly:

gem install philiprehberger-encoding_kit

Usage

require "philiprehberger/encoding_kit"

result = Philiprehberger::EncodingKit.detect(raw_bytes)
result.encoding   # => Encoding::UTF_8
result.confidence # => 0.9
utf8 = Philiprehberger::EncodingKit.to_utf8(raw_bytes)

Encoding Detection with Confidence

require "philiprehberger/encoding_kit"

# Returns a DetectionResult that delegates to Encoding
result = Philiprehberger::EncodingKit.detect("\xEF\xBB\xBFhello".b)
result == Encoding::UTF_8  # => true (backward compatible)
result.confidence          # => 1.0 (BOM detected)
result.name                # => "UTF-8"
result.to_h                # => {encoding: Encoding::UTF_8, confidence: 1.0}

# Heuristic detection returns lower confidence
result = Philiprehberger::EncodingKit.detect("caf\xC3\xA9".b)
result.confidence # => between 0.85 and 0.9

Streaming Detection

require "philiprehberger/encoding_kit"

File.open("data.csv", "rb") do |file|
  result = Philiprehberger::EncodingKit.detect_stream(file, sample_size: 8192)
  result.encoding   # => Encoding::UTF_8
  result.confidence # => 0.9
end

Encoding Analysis

require "philiprehberger/encoding_kit"

analysis = Philiprehberger::EncodingKit.analyze(raw_bytes)
analysis[:encoding]       # => Encoding::UTF_8
analysis[:confidence]     # => 0.9
analysis[:printable_ratio] # => 0.95
analysis[:ascii_ratio]    # => 0.8
analysis[:high_bytes]     # => 12
analysis[:candidates]     # => [{encoding: Encoding::UTF_8, confidence: 0.9}, ...]

Transcode

require "philiprehberger/encoding_kit"

# Auto-detect source, convert to UTF-8
utf8 = Philiprehberger::EncodingKit.transcode(raw_bytes)

# Convert to a specific encoding
latin1 = Philiprehberger::EncodingKit.transcode(utf8_string, to: Encoding::ISO_8859_1)

# Custom fallback behavior
result = Philiprehberger::EncodingKit.transcode(data, to: "UTF-8", fallback: :replace, replace: "?")

Convert to UTF-8

require "philiprehberger/encoding_kit"

# Auto-detect source encoding
utf8 = Philiprehberger::EncodingKit.to_utf8(raw_bytes)

# Specify source encoding
utf8 = Philiprehberger::EncodingKit.to_utf8(latin1_string, from: Encoding::ISO_8859_1)

Normalize

require "philiprehberger/encoding_kit"

# Replace invalid/undefined bytes with U+FFFD
clean = Philiprehberger::EncodingKit.normalize("hello \xFF world".b)

Convert Between Encodings

require "philiprehberger/encoding_kit"

latin1 = Philiprehberger::EncodingKit.convert(utf8_string, from: Encoding::UTF_8, to: Encoding::ISO_8859_1)

BOM Handling

require "philiprehberger/encoding_kit"

Philiprehberger::EncodingKit.bom?("\xEF\xBB\xBFhello")       # => true
Philiprehberger::EncodingKit.strip_bom("\xEF\xBB\xBFhello")  # => "hello"

File Operations

require "philiprehberger/encoding_kit"

# Detect a file's encoding
result = Philiprehberger::EncodingKit.detect_file("data.csv")
result.encoding   # => Encoding::UTF_8
result.confidence # => 0.9

# Read a file as UTF-8 (auto-detects source encoding)
content = Philiprehberger::EncodingKit.read_as_utf8("legacy.txt")
content.encoding # => Encoding::UTF_8

# Read with explicit source encoding
content = Philiprehberger::EncodingKit.read_as_utf8("latin1.txt", from: Encoding::ISO_8859_1)

# Check whether a file's content is valid in the given encoding
Philiprehberger::EncodingKit.file_valid?("data.csv", encoding: Encoding::UTF_8)  # => true

# Guess encoding from a filename hint without reading the bytes
Philiprehberger::EncodingKit.guess_from_filename("data.utf8.csv")    # => Encoding::UTF_8
Philiprehberger::EncodingKit.guess_from_filename("legacy.latin1.txt") # => Encoding::ISO_8859_1
Philiprehberger::EncodingKit.guess_from_filename("report.csv")        # => nil

Validity Check

require "philiprehberger/encoding_kit"

Philiprehberger::EncodingKit.valid?("hello")                                # => true
Philiprehberger::EncodingKit.valid?("\xFF\xFE".force_encoding("UTF-8"))     # => false
Philiprehberger::EncodingKit.valid?("hello", encoding: Encoding::US_ASCII)  # => true

API

| Method | Description |
|--------|-------------|
| `EncodingKit.detect(string)` | Detect encoding via BOM and heuristics; returns a `DetectionResult` with `.encoding` and `.confidence` |
| `EncodingKit.detect_stream(io, sample_size: 4096)` | Detect encoding from an IO stream by sampling bytes |
| `EncodingKit.analyze(string)` | Analyze byte distribution and return encoding candidates with stats |
| `EncodingKit.transcode(string, to:, fallback:, replace:)` | Auto-detect source and convert to target encoding |
| `EncodingKit.to_utf8(string, from: nil)` | Convert to UTF-8, auto-detect source if `from` is `nil` |
| `EncodingKit.normalize(string)` | Force to valid UTF-8, replacing bad bytes with U+FFFD |
| `EncodingKit.valid?(string, encoding: nil)` | Check if string is valid in the given encoding, or in its current encoding if none is given |
| `EncodingKit.convert(string, from:, to:)` | Convert between arbitrary encodings |
| `EncodingKit.strip_bom(string)` | Remove byte order mark if present |
| `EncodingKit.bom?(string)` | Check if string starts with a BOM |
| `EncodingKit.detect_file(path, sample_size: 4096)` | Detect encoding of a file by reading a byte sample |
| `EncodingKit.read_as_utf8(path, from: nil)` | Read a file and return its content as UTF-8 |
| `EncodingKit.file_valid?(path, encoding: nil)` | Check if a file's content is valid in the given encoding |
| `EncodingKit.guess_from_filename(path)` | Guess `Encoding` from filename suffixes (e.g. `.utf8`, `.latin1`), `nil` if unknown |

Development

bundle install
bundle exec rspec
bundle exec rubocop

Support

If you find this project useful:

⭐ Star the repo

🐛 Report issues

💡 Suggest features

❤️ Sponsor development

🌐 All Open Source Projects

💻 GitHub Profile

🔗 LinkedIn Profile

License

MIT