Class: Woods::ExtractedUnit

Inherits:

Object

Object
Woods::ExtractedUnit

show all

Defined in: lib/woods/extracted_unit.rb

Overview

ExtractedUnit represents a single meaningful unit of code from the codebase.

This could be a model, controller, service, component, or framework source. Each unit is self-contained with its source code, metadata, and relationship information. Units are serialized to JSON for consumption by the indexing pipeline.

Examples:

Creating a model unit

# Construct the unit from the three required keywords.
unit = ExtractedUnit.new(
  type: :model,
  identifier: "User",
  file_path: "app/models/user.rb"
)
# Everything else is filled in after construction.
unit.source_code = File.read(unit.file_path)
# `[...]` below is a placeholder for elided values, not literal Ruby.
unit.metadata = { associations: [...], callbacks: [...] }
unit.dependencies = [{ type: :service, target: "UserService" }]

Instance Attribute Summary collapse

#chunks ⇒ Object

Array of chunks; initialized empty by the constructor.
#dependencies ⇒ Object

Array of dependency hashes, e.g. { type: :service, target: "UserService" }; initialized empty by the constructor.
#dependents ⇒ Object

Array of dependents; initialized empty by the constructor.
#file_path ⇒ Object

Path of the unit's source file, as passed to the constructor (e.g. "app/models/user.rb").
#identifier ⇒ Object

Identifier of the unit, as passed to the constructor (e.g. "User").
#metadata ⇒ Object

Hash of unit metadata; initialized empty by the constructor.
#namespace ⇒ Object

Namespace of the unit; nil until assigned.
#source_code ⇒ Object

Source code of the unit; nil until assigned.
#type ⇒ Object

Symbol: :model, :controller, :service, :component, :job, :rails_source, :gem_source.

Instance Method Summary collapse

#build_default_chunks(max_tokens: 1500) ⇒ Array<Hash>

Build semantic chunks for large units. Preserves context by including the unit header in each chunk.
#estimated_tokens ⇒ Integer

Estimate token count for chunking decisions.
#initialize(type:, identifier:, file_path:) ⇒ ExtractedUnit constructor

A new instance of ExtractedUnit.
#needs_chunking?(threshold: 1500) ⇒ Boolean

Check if unit needs chunking based on size.
#to_h ⇒ Hash

Serialize to hash for JSON output.

Constructor Details

#initialize(type:, identifier:, file_path:) ⇒ `ExtractedUnit`

Returns a new instance of ExtractedUnit.

# File 'lib/woods/extracted_unit.rb', line 35

# Builds an empty unit identified by its type, identifier and file path.
#
# @param type [Symbol] kind of unit, e.g. :model or :controller
# @param identifier [String] name of the unit, e.g. "User"
# @param file_path [String] path of the file the unit was extracted from
def initialize(type:, identifier:, file_path:)
  @type = type
  @identifier = identifier
  @file_path = file_path
  # No constructor argument supplies these two. Start them at nil explicitly
  # so every instance carries the same set of instance variables and the
  # readers never touch an undefined one.
  @namespace = nil
  @source_code = nil
  # Fresh containers per instance; callers replace or append to them later.
  @metadata = {}
  @dependencies = []
  @dependents = []
  @chunks = []
end

Instance Attribute Details

#chunks ⇒ `Object`

Array of chunks; initialized empty by the constructor



25
26
27

# File 'lib/woods/extracted_unit.rb', line 25

# Reader for @chunks, which the constructor initializes to an empty Array.
def chunks
  @chunks
end

#dependencies ⇒ `Object`

Array of dependency hashes, e.g. { type: :service, target: "UserService" }; initialized empty by the constructor



25
26
27

# File 'lib/woods/extracted_unit.rb', line 25

# Reader for @dependencies, which the constructor initializes to an empty Array.
def dependencies
  @dependencies
end

#dependents ⇒ `Object`

Array of dependents; initialized empty by the constructor



25
26
27

# File 'lib/woods/extracted_unit.rb', line 25

# Reader for @dependents, which the constructor initializes to an empty Array.
def dependents
  @dependents
end

#file_path ⇒ `Object`

Path of the unit's source file, as passed to the constructor (e.g. "app/models/user.rb")



25
26
27

# File 'lib/woods/extracted_unit.rb', line 25

# Reader for @file_path, set from the constructor's file_path: keyword.
def file_path
  @file_path
end

#identifier ⇒ `Object`

Identifier of the unit, as passed to the constructor (e.g. "User")



25
26
27

# File 'lib/woods/extracted_unit.rb', line 25

# Reader for @identifier, set from the constructor's identifier: keyword.
def identifier
  @identifier
end

#metadata ⇒ `Object`

Hash of unit metadata; initialized empty by the constructor



25
26
27

# File 'lib/woods/extracted_unit.rb', line 25

# Reader for @metadata, which the constructor initializes to an empty Hash.
def metadata
  @metadata
end

#namespace ⇒ `Object`

Namespace of the unit; nil until assigned



25
26
27

# File 'lib/woods/extracted_unit.rb', line 25

# Reader for @namespace; nil until a value is assigned.
def namespace
  @namespace
end

#source_code ⇒ `Object`

Source code of the unit; nil until assigned



25
26
27

# File 'lib/woods/extracted_unit.rb', line 25

# Reader for @source_code; nil until a value is assigned.
def source_code
  @source_code
end

#type ⇒ `Object`

Symbol: :model, :controller, :service, :component, :job, :rails_source, :gem_source



25
26
27

# File 'lib/woods/extracted_unit.rb', line 25

# Reader for @type, set from the constructor's type: keyword.
def type
  @type
end

Instance Method Details

#build_default_chunks(max_tokens: 1500) ⇒ `Array<Hash>`

Build semantic chunks for large units. Preserves context by including the unit header in each chunk.

Parameters:

max_tokens (Integer) (defaults to: 1500) —

Maximum tokens per chunk

Returns:

(Array<Hash>) —

List of chunk hashes

# File 'lib/woods/extracted_unit.rb', line 89

# Splits the unit's source into line-aligned chunks, each prefixed with the
# unit header so a chunk still carries its context when read on its own.
#
# Lines are never split: a single line larger than max_tokens becomes a chunk
# of its own. The header's tokens are NOT counted against max_tokens, so a
# chunk's :estimated_tokens may exceed max_tokens by the size of the header.
#
# @param max_tokens [Integer] token budget for the source lines of one chunk;
#   also used as the threshold that decides whether chunking is needed at all
# @return [Array<Hash>] chunk hashes with :chunk_index, :identifier, :content,
#   :content_hash and :estimated_tokens; empty when the unit is small enough
#   or has no source code
def build_default_chunks(max_tokens: 1500)
  # Decide with the caller's budget rather than needs_chunking?'s own default.
  return [] unless needs_chunking?(threshold: max_tokens)
  # Metadata alone can push a unit over the threshold; without source there
  # is nothing to split.
  return [] if source_code.nil? || source_code.empty?

  # build_chunk_header is defined elsewhere; it is used here as a String.
  header = build_chunk_header
  header_tokens = (header.length / 4.0).ceil

  # First pass: group consecutive lines into [lines, token_count] pairs.
  groups = []
  pending_lines = []
  pending_tokens = 0

  source_code.each_line do |line|
    line_tokens = (line.length / 4.0).ceil

    # Close the current group before it would overflow, but never emit an
    # empty group (an oversized single line still gets its own chunk).
    if pending_tokens + line_tokens > max_tokens && pending_lines.any?
      groups << [pending_lines, pending_tokens]
      pending_lines = []
      pending_tokens = 0
    end

    pending_lines << line
    pending_tokens += line_tokens
  end
  groups << [pending_lines, pending_tokens] if pending_lines.any?

  # Second pass: turn each group into its chunk hash.
  groups.each_with_index.map do |(lines, tokens), index|
    content = header + lines.join
    {
      chunk_index: index,
      identifier: "#{identifier}#chunk_#{index}",
      content: content,
      content_hash: Digest::SHA256.hexdigest(content),
      estimated_tokens: tokens + header_tokens
    }
  end
end

#estimated_tokens ⇒ `Integer`

Estimate token count for chunking decisions. Benchmarked against tiktoken (cl100k_base) on 19 Ruby source files. Actual mean is 4.41 chars/token. Uses 4.0 as a conservative floor (~10.6% overestimate). See docs/TOKEN_BENCHMARK.md.

Returns:

(Integer) —

Estimated token count

# File 'lib/woods/extracted_unit.rb', line 70

# Rough token count for the unit, at 4 characters per token (rounded up).
# Source and metadata are estimated separately and then added; a missing
# source or an empty metadata collection contributes nothing.
#
# @return [Integer] estimated number of tokens
def estimated_tokens
  tokens_for = ->(text) { (text.length / 4.0).ceil }

  total = 0
  total += tokens_for.call(source_code) if source_code
  # Metadata is measured in its JSON form.
  total += tokens_for.call(metadata.to_json) if metadata.any?
  total
end

#needs_chunking?(threshold: 1500) ⇒ `Boolean`

Check if unit needs chunking based on size

Parameters:

threshold (Integer) (defaults to: 1500) —

Token threshold for chunking (default: 1500)

Returns:

(Boolean)



80
81
82

# File 'lib/woods/extracted_unit.rb', line 80

# Tells whether the unit's estimated size is strictly above the threshold.
#
# @param threshold [Integer] token count above which chunking is needed
# @return [Boolean] true when estimated_tokens exceeds threshold
def needs_chunking?(threshold: 1500)
  token_estimate = estimated_tokens
  token_estimate > threshold
end

#to_h ⇒ `Hash`

Serialize to hash for JSON output

Returns:

(Hash) —

Complete unit data for indexing pipeline

# File 'lib/woods/extracted_unit.rb', line 48

# Serializes the unit into a plain Hash for JSON output.
#
# :extracted_at is taken from the clock at call time, so two calls on the
# same unit produce different timestamps. :source_hash is the SHA-256 hex
# digest of the source, with a nil source hashed as the empty string.
#
# @return [Hash] the unit's attributes plus :extracted_at and :source_hash
def to_h
  identity = {
    type: type,
    identifier: identifier,
    file_path: file_path,
    namespace: namespace
  }
  content = { source_code: source_code, metadata: metadata }
  relations = { dependencies: dependencies, dependents: dependents, chunks: chunks }
  stamp = {
    extracted_at: Time.now.iso8601,
    source_hash: Digest::SHA256.hexdigest(source_code || '')
  }

  # Merge order fixes the key order of the resulting Hash.
  identity.merge(content).merge(relations).merge(stamp)
end