Module: Tracelit::Instrumentation

Defined in:
lib/tracelit/instrumentation.rb

Constant Summary collapse

SETUP_MUTEX =
Mutex.new

Class Method Summary collapse

Class Method Details

.detect_framework ⇒ Object

Detects the web framework in use for the telemetry.sdk.name attribute. This value appears as the `framework` column in the services table.



157
158
159
160
161
162
# File 'lib/tracelit/instrumentation.rb', line 157

# Identifies which web framework is loaded in this process, for the
# telemetry.sdk.name resource attribute (rendered as the "framework"
# column in the services table). Checks the most specific framework
# first; falls back to plain "ruby" when none is present.
def self.detect_framework
  if defined?(::Rails)
    "rails"
  elsif defined?(::Sinatra)
    "sinatra"
  elsif defined?(::Rack)
    "rack"
  else
    "ruby"
  end
end

.error_always_on_sampler(rate) ⇒ Object

Returns an ErrorAlwaysOnSampler wrapped in ParentBased so child spans honour the parent’s sampling decision. ErrorAlwaysOnSampler upgrades DROP → RECORD_ONLY so that ErrorSpanProcessor.on_finish fires for all spans, allowing error spans to be exported even outside the sampling ratio.



168
169
170
171
172
# File 'lib/tracelit/instrumentation.rb', line 168

# Builds the sampler used when sample_rate < 1.0: a ParentBased sampler
# whose root is Tracelit::ErrorAlwaysOnSampler, so child spans honour the
# parent's sampling decision. The root sampler upgrades DROP to
# RECORD_ONLY, which lets ErrorSpanProcessor#on_finish fire for every
# span and export error spans even outside the sampling ratio.
#
# @param rate [Numeric] the base sampling ratio for non-error root spans
def self.error_always_on_sampler(rate)
  root_sampler = Tracelit::ErrorAlwaysOnSampler.new(rate)
  OpenTelemetry::SDK::Trace::Samplers.parent_based(root: root_sampler)
end

.install_fork_hook(config) ⇒ Object

Fix 5: Register a Process._fork hook (Ruby 3.1+) so that background polling threads are restarted inside each forked Puma/Unicorn worker. In the parent (pid != 0) nothing changes. In the child (pid == 0) we restart the metric pollers so each worker reports its own stats.



211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
# File 'lib/tracelit/instrumentation.rb', line 211

# Fix 5: registers a Process._fork hook (Ruby 3.1+) so background polling
# threads are restarted inside each forked Puma/Unicorn worker. In the
# parent (non-zero pid) the hook changes nothing; in the child (pid 0)
# metric pollers are restarted so each worker reports its own stats.
# Best-effort: any failure is logged as a warning, never raised.
def self.install_fork_hook(config)
  return unless Process.respond_to?(:_fork)

  # define_method (not def) so the hook body closes over `config`.
  # Explicit super() is required — implicit super is not allowed inside
  # a define_method block.
  fork_hook = Module.new do
    define_method(:_fork) do
      forked_pid = super()
      # Zero means we are in the forked child — restart this worker's pollers.
      Tracelit::Metrics.restart_pollers(config) if forked_pid == 0
      forked_pid
    end
  end

  Process.singleton_class.prepend(fork_hook)
rescue StandardError => e
  OpenTelemetry.logger.warn("[Tracelit] could not install fork hook: #{e.message}")
end

.reset! ⇒ Object



141
142
143
144
145
146
# File 'lib/tracelit/instrumentation.rb', line 141

# Clears the memoized configuration state under the setup mutex so that
# a subsequent .setup call runs the full initialization again.
def self.reset!
  SETUP_MUTEX.synchronize do
    @config     = nil
    @configured = false
  end
end

.setup(config) ⇒ Object

Sets up the OpenTelemetry SDK with the Tracelit OTLP exporter. Called once at application boot. Idempotent — safe to call multiple times. Never raises — a misconfigured SDK must not crash the host application.



20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
# File 'lib/tracelit/instrumentation.rb', line 20

# Sets up the OpenTelemetry SDK with the Tracelit OTLP exporter.
# Called once at application boot. Idempotent — safe to call multiple
# times. Never raises — a misconfigured SDK must not crash the host app.
#
# @param config [Object] Tracelit configuration (endpoint, api_key,
#   sample_rate, environment, etc. — see usages below)
def self.setup(config)
  SETUP_MUTEX.synchronize do
    return if @configured
    return unless config.enabled

    # Fix 1: Install a clean single-line error handler before any OTel SDK
    # calls so that internal OTel errors never dump raw stack traces into
    # the application's logs.
    OpenTelemetry.error_handler = lambda do |exception:, message:|
      # Join with ": " — joining with an empty string mashed the handler
      # message and the exception message into one unreadable token.
      msg = [message, exception&.message].compact.join(": ")
      OpenTelemetry.logger.warn("[Tracelit] #{msg}")
    end

    # Fix 2/3: Soft validation — warn and bail out rather than raise.
    # An observability SDK must never crash the host application.
    # NOTE(review): config.valid? returns an error list, not a boolean —
    # consider renaming (predicate methods conventionally return booleans).
    errors = config.valid?
    if errors.any?
      OpenTelemetry.logger.warn("[Tracelit] disabled — #{errors.join(', ')}")
      return
    end

    OpenTelemetry::SDK.configure do |otel|
      # Resource attributes identify this service in Tracelit.
      # Keys are coerced to strings; non-primitive values are dropped to
      # prevent ConfigurationError from Resource.create validation.
      base_attrs = {
        OpenTelemetry::SemanticConventions::Resource::SERVICE_NAME         => config.resolved_service_name,
        OpenTelemetry::SemanticConventions::Resource::DEPLOYMENT_ENVIRONMENT => config.environment,
        "telemetry.sdk.language" => "ruby",
        "telemetry.sdk.name"     => detect_framework,
        "telemetry.sdk.version"  => Tracelit::VERSION,
      }
      sha = config.resolved_commit_sha
      base_attrs["service.commit_sha"] = sha if sha

      # Resource.create also reads OTEL_RESOURCE_ATTRIBUTES from the
      # environment and merges in Resource.default (process, OS info, etc.)
      # Any of those sources can carry non-primitive values and raise
      # ConfigurationError / ArgumentError. If that happens we fall back to
      # an empty resource so the SDK configure block can still complete and
      # install the real TracerProvider — traces will work, only the custom
      # labels are missing. A clear warning tells the operator what to fix.
      begin
        otel.resource = OpenTelemetry::SDK::Resources::Resource.create(
          base_attrs.merge(config.sanitized_resource_attributes)
        )
      rescue ArgumentError, OpenTelemetry::SDK::ConfigurationError => e
        OpenTelemetry.logger.warn(
          "[Tracelit] could not set resource attributes: #{e.message}. " \
          "Check OTEL_RESOURCE_ATTRIBUTES and config.resource_attributes for " \
          "non-string/integer/float/boolean values. Continuing with default resource."
        )
      end

      # Build the OTLP exporter once — shared by both processors
      exporter = OpenTelemetry::Exporter::OTLP::Exporter.new(
        endpoint: "#{config.endpoint}/v1/traces",
        headers: {
          "Authorization"  => "Bearer #{config.api_key}",
          "X-Service-Name" => config.resolved_service_name,
          "X-Environment"  => config.environment,
        }
      )

      # Primary processor: batches and exports sampled spans
      otel.add_span_processor(
        OpenTelemetry::SDK::Trace::Export::BatchSpanProcessor.new(exporter)
      )

      # Error processor: always exports error spans regardless of
      # sampling decision — fires on_finish after status is known
      otel.add_span_processor(
        Tracelit::ErrorSpanProcessor.new(exporter)
      )

      # Auto-instrumentation: instruments Rails, Rack, ActiveRecord,
      # Action View, Net::HTTP, Faraday, Redis, Sidekiq, and more.
      # use_all() enables every installed instrumentation gem.
      otel.use_all
    end

    # Guard: if the SDK configure block failed internally (e.g. a bad
    # resource attribute or instrumentation error caught by the error
    # handler), the global tracer provider is still the ProxyTracerProvider
    # and does not respond to .resource. Detect this and bail out cleanly
    # instead of letting setup_logs / Metrics.setup fail with cryptic errors.
    unless OpenTelemetry.tracer_provider.respond_to?(:resource)
      OpenTelemetry.logger.warn(
        "[Tracelit] OTel SDK did not initialize correctly — " \
        "tracer provider is still a proxy. " \
        "Check the configuration errors logged above. " \
        "Logs and metrics pipelines will not start."
      )
      return
    end

    # Set sampler after configure — Configurator does not expose
    # sampler= in OTel SDK 1.x, must be set on the provider directly.
    # Skip at 1.0: the default AlwaysOn sampler is correct and we do not touch it.
    if config.sample_rate < 1.0
      OpenTelemetry.tracer_provider.sampler = error_always_on_sampler(config.sample_rate)
    end

    @configured = true
    @config     = config

    setup_logs(config)
    Tracelit::Metrics.setup(config)

    # Fix 5: Fork safety for Puma cluster mode and Unicorn.
    # Background threads (pollers) are killed in forked worker processes.
    # Process._fork (Ruby 3.1+) fires in the child after every fork so we
    # can restart pollers in each worker without touching the master.
    install_fork_hook(config)

    # Fix 9: Flush and shut down both providers gracefully on process exit
    # so the last metrics/traces batch is not lost during deploys.
    # Registered only once — guarded by @configured above.
    at_exit { shutdown }
  end
end

.setup_logs(config) ⇒ Object

Sets up the OTel Logs SDK: creates a LoggerProvider, attaches a BatchLogRecordProcessor with an OTLP/HTTP exporter, registers it globally, and installs the Rails.logger bridge.



177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
# File 'lib/tracelit/instrumentation.rb', line 177

# Sets up the OTel Logs SDK: builds an OTLP/HTTP log exporter, attaches
# it via a BatchLogRecordProcessor to a new LoggerProvider, registers
# that provider globally, and installs the Rails.logger bridge.
# Best-effort: any StandardError is logged as a warning, never raised.
def self.setup_logs(config)
  auth_headers = {
    "Authorization"  => "Bearer #{config.api_key}",
    "X-Service-Name" => config.resolved_service_name,
    "X-Environment"  => config.environment,
  }
  logs_exporter = OpenTelemetry::Exporter::OTLP::Logs::LogsExporter.new(
    endpoint: "#{config.endpoint}/v1/logs",
    headers: auth_headers
  )

  # Reuse the tracer provider's resource when available so log records
  # carry the same service labels; otherwise fall back to an empty one.
  tracer_provider = OpenTelemetry.tracer_provider
  resource =
    if tracer_provider.respond_to?(:resource)
      tracer_provider.resource
    else
      OpenTelemetry::SDK::Resources::Resource.create({})
    end

  provider = OpenTelemetry::SDK::Logs::LoggerProvider.new(resource: resource)
  provider.add_log_record_processor(
    OpenTelemetry::SDK::Logs::Export::BatchLogRecordProcessor.new(logs_exporter)
  )
  OpenTelemetry.logger_provider = provider

  # Install the Rails.logger → OTel bridge after the provider is ready.
  # Called here (after Rails boot) so Rails.logger is already initialised.
  RailsLoggerBridge.install(provider)
rescue StandardError => e
  OpenTelemetry.logger.warn("[Tracelit] failed to set up logs: #{e.message}")
end

.shutdown ⇒ Object



148
149
150
151
# File 'lib/tracelit/instrumentation.rb', line 148

# Flushes and shuts down every registered provider so the final batch of
# telemetry is exported before the process exits (called from at_exit).
# Best-effort: shutdown failures are swallowed — exiting must never raise.
def self.shutdown
  OpenTelemetry.tracer_provider.shutdown rescue nil
  OpenTelemetry.meter_provider.shutdown  rescue nil
  # setup_logs registers a LoggerProvider with a BatchLogRecordProcessor;
  # without an explicit shutdown its last buffered log batch is lost.
  # Guarded with respond_to? in case the logs API gem is not loaded.
  if OpenTelemetry.respond_to?(:logger_provider)
    OpenTelemetry.logger_provider.shutdown rescue nil
  end
end