Class: Woods::ExtractedUnit

Inherits:
Object
  • Object
show all
Defined in:
lib/woods/extracted_unit.rb

Overview

ExtractedUnit represents a single meaningful unit of code from the codebase.

This could be a model, controller, service, component, or framework source. Each unit is self-contained with its source code, metadata, and relationship information. Units are serialized to JSON for consumption by the indexing pipeline.

Examples:

Creating a model unit

unit = ExtractedUnit.new(
  type: :model,
  identifier: "User",
  file_path: "app/models/user.rb"
)
unit.source_code = File.read(unit.file_path)
unit.metadata = { associations: [...], callbacks: [...] }
unit.dependencies = [{ type: :service, target: "UserService" }]

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(type:, identifier:, file_path:) ⇒ ExtractedUnit

Returns a new instance of ExtractedUnit.



35
36
37
38
39
40
41
42
43
# File 'lib/woods/extracted_unit.rb', line 35

# Creates a unit identified by its type, identifier and file path.
#
# Only the three identity fields are taken from the caller. The metadata
# hash and the dependency/dependent/chunk lists start out empty, and
# @source_code and @namespace are not assigned here.
#
# @param type [Symbol] kind of unit (e.g. :model)
# @param identifier [String] name of the unit (e.g. "User")
# @param file_path [String] path of the file the unit was extracted from
def initialize(type:, identifier:, file_path:)
  @type, @identifier, @file_path = type, identifier, file_path

  @metadata = {}
  # Three distinct arrays, so appending to one never affects the others.
  @dependencies, @dependents, @chunks = [], [], []
end

Instance Attribute Details

#chunks ⇒ Object

Returns the value of attribute chunks. The constructor initializes it to an empty Array.



25
26
27
# File 'lib/woods/extracted_unit.rb', line 25

# Reader for the unit's chunk list.
#
# @return [Object] current value of @chunks (an empty Array right after construction)
def chunks
  @chunks
end

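#dependencies ⇒ Object

Returns the value of attribute dependencies. The constructor initializes it to an empty Array; the class example assigns an Array of hashes such as { type: :service, target: "UserService" }.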



25
26
27
# File 'lib/woods/extracted_unit.rb', line 25

# Reader for the unit's outgoing dependency list.
#
# @return [Object] current value of @dependencies (an empty Array right after construction)
def dependencies
  @dependencies
end

#dependents ⇒ Object

Returns the value of attribute dependents. The constructor initializes it to an empty Array.



25
26
27
# File 'lib/woods/extracted_unit.rb', line 25

# Reader for the unit's dependents list.
#
# @return [Object] current value of @dependents (an empty Array right after construction)
def dependents
  @dependents
end

#file_path ⇒ Object

Returns the value of attribute file_path, as passed to the constructor (e.g. "app/models/user.rb").



25
26
27
# File 'lib/woods/extracted_unit.rb', line 25

# Reader for the path of the file this unit was extracted from.
#
# @return [Object] current value of @file_path (set from the constructor's file_path: argument)
def file_path
  @file_path
end

#identifier ⇒ Object

Returns the value of attribute identifier, as passed to the constructor (e.g. "User").



25
26
27
# File 'lib/woods/extracted_unit.rb', line 25

# Reader for the unit's identifier.
#
# @return [Object] current value of @identifier (set from the constructor's identifier: argument)
def identifier
  @identifier
end

#metadata ⇒ Object

Returns the value of attribute metadata. The constructor initializes it to an empty Hash; the class example assigns a Hash with keys such as :associations and :callbacks.



25
26
27
# File 'lib/woods/extracted_unit.rb', line 25

# Reader for the unit's metadata.
#
# @return [Object] current value of @metadata (an empty Hash right after construction)
def metadata
  @metadata
end

#namespace ⇒ Object

Returns the value of attribute namespace. The constructor does not set it, so it is nil until assigned.



25
26
27
# File 'lib/woods/extracted_unit.rb', line 25

# Reader for the unit's namespace.
#
# The constructor does not assign @namespace, so this is nil until it is
# set elsewhere.
#
# @return [Object, nil] current value of @namespace
def namespace
  @namespace
end

#source_code ⇒ Object

Returns the value of attribute source_code. The constructor does not set it, so it is nil until assigned (the class example assigns the contents of the unit's file).



25
26
27
# File 'lib/woods/extracted_unit.rb', line 25

# Reader for the unit's source text.
#
# The constructor does not assign @source_code, so this is nil until it is
# set elsewhere.
#
# @return [Object, nil] current value of @source_code
def source_code
  @source_code
end

#type ⇒ Object

Symbol: :model, :controller, :service, :component, :job, :rails_source, :gem_source



25
26
27
# File 'lib/woods/extracted_unit.rb', line 25

# Reader for the unit's type.
#
# The attribute documentation lists the expected values as :model,
# :controller, :service, :component, :job, :rails_source and :gem_source.
#
# @return [Object] current value of @type (set from the constructor's type: argument)
def type
  @type
end

Instance Method Details

#build_default_chunks(max_tokens: 1500) ⇒ Array<Hash>

Build semantic chunks for large units. Preserves context by including the unit header in each chunk.

Parameters:

  • max_tokens (Integer) (defaults to: 1500)

    Maximum tokens per chunk

Returns:

  • (Array<Hash>)

    List of chunk hashes



89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
# File 'lib/woods/extracted_unit.rb', line 89

# Split the unit's source into line-aligned chunks, each prefixed with the
# unit header so every chunk carries its own context.
#
# Lines are accumulated until adding the next one would push the running
# estimate (4.0 characters per token, rounded up per line) above
# +max_tokens+. A chunk is never left empty, so a single line larger than
# +max_tokens+ becomes a chunk of its own. The header's tokens are added to
# each chunk's :estimated_tokens but are not counted against +max_tokens+.
#
# NOTE(review): the gate below calls needs_chunking? with its default
# threshold, not +max_tokens+ — confirm that is intended for callers that
# pass a custom +max_tokens+.
#
# @param max_tokens [Integer] maximum estimated source tokens per chunk
# @return [Array<Hash>] chunk hashes with :chunk_index, :identifier,
#   :content, :content_hash and :estimated_tokens; empty when the unit has
#   no source code or does not need chunking
def build_default_chunks(max_tokens: 1500)
  # Metadata alone can push a unit over the chunking threshold while
  # source_code is still nil; there is nothing to split in that case.
  return [] if source_code.nil? || !needs_chunking?

  header = build_chunk_header
  header_tokens = (header.length / 4.0).ceil
  chunks = []

  # Packages the accumulated lines as the next chunk hash.
  emit = lambda do |lines, tokens|
    content = header + lines.join
    chunks << {
      chunk_index: chunks.size,
      identifier: "#{identifier}#chunk_#{chunks.size}",
      content: content,
      content_hash: Digest::SHA256.hexdigest(content),
      estimated_tokens: tokens + header_tokens
    }
  end

  pending = []
  pending_tokens = 0

  source_code.lines.each do |line|
    line_tokens = (line.length / 4.0).ceil

    # Flush before the line that would overflow, but never emit an empty chunk.
    if pending_tokens + line_tokens > max_tokens && pending.any?
      emit.call(pending, pending_tokens)
      pending = []
      pending_tokens = 0
    end

    pending << line
    pending_tokens += line_tokens
  end

  # Whatever is left over forms the final chunk.
  emit.call(pending, pending_tokens) if pending.any?

  chunks
end

#estimated_tokens ⇒ Integer

Estimate token count for chunking decisions. Benchmarked against tiktoken (cl100k_base) on 19 Ruby source files. Actual mean is 4.41 chars/token. Uses 4.0 as a conservative floor (~10.6% overestimate). See docs/TOKEN_BENCHMARK.md.

Returns:

  • (Integer)

    Estimated token count



70
71
72
73
74
# File 'lib/woods/extracted_unit.rb', line 70

# Estimate the unit's token count for chunking decisions.
#
# Applies a flat 4.0 characters-per-token ratio, rounded up, separately to
# the source code and to the JSON-serialized metadata, then sums the two.
#
# @return [Integer] estimated token count (0 when there is no source code
#   and the metadata is empty)
def estimated_tokens
  source_tokens = source_code ? (source_code.length / 4.0).ceil : 0
  # Empty metadata counts as zero instead of the 1 token that "{}" would cost.
  metadata_tokens = metadata.any? ? (metadata.to_json.length / 4.0).ceil : 0
  source_tokens + metadata_tokens
end

#needs_chunking?(threshold: 1500) ⇒ Boolean

Check if unit needs chunking based on size

Parameters:

  • threshold (Integer) (defaults to: 1500)

    Token threshold for chunking (default: 1500)

Returns:

  • (Boolean)


80
81
82
# File 'lib/woods/extracted_unit.rb', line 80

# Tells whether the unit is large enough to be split into chunks.
#
# @param threshold [Integer] token count the estimate must strictly exceed
# @return [Boolean] true when estimated_tokens is greater than +threshold+
def needs_chunking?(threshold: 1500)
  token_count = estimated_tokens
  token_count > threshold
end

#to_h ⇒ Hash

Serialize to hash for JSON output

Returns:

  • (Hash)

    Complete unit data for indexing pipeline



48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# File 'lib/woods/extracted_unit.rb', line 48

# Serialize the unit to a Hash for JSON output.
#
# Besides the unit's attributes, the hash carries two derived fields:
# :extracted_at, the ISO 8601 timestamp of this call, and :source_hash, the
# SHA-256 hex digest of the source code (of the empty string when
# source_code is nil).
#
# @return [Hash] complete unit data for the indexing pipeline
def to_h
  {
    type: type,
    identifier: identifier,
    file_path: file_path,
    namespace: namespace,
    source_code: source_code,
    # Spelled out rather than using value-omission shorthand, which only
    # parses on Ruby 3.1+.
    metadata: metadata,
    dependencies: dependencies,
    dependents: dependents,
    chunks: chunks,
    extracted_at: Time.now.iso8601,
    source_hash: Digest::SHA256.hexdigest(source_code || '')
  }
end