Class: Exwiw::Adapter::MongodbAdapter

Inherits:
Base
  • Object
show all
Defined in:
lib/exwiw/adapter/mongodb_adapter.rb

Defined Under Namespace

Classes: EmbeddedMask, MaskPlan, StreamingResult

Constant Summary collapse

DEFAULT_BULK_INSERT_CHUNK_SIZE =

Bounds how many documents are serialized at once when a collection config carries no explicit bulk_insert_chunk_size. A MongoDB dump is one JSONL line per document and, without chunking, the Runner would materialize the entire collection's output as a single giant string while the full in-memory result set is still alive — doubling peak memory on large or embed-heavy collections. Chunking lets the Runner stream each slice to the file and release its serialized string (and the transient extended-JSON trees) before building the next.

1_000
INDEX_OPTION_ALLOWLIST =

Index options copied through to the emitted createIndex call. Anything else (`v`, `ns`, server-internal fields) is dropped — they would either be rejected by createIndex or are not portable across mongod versions.

%w[
  unique sparse hidden expireAfterSeconds collation
  partialFilterExpression wildcardProjection
].freeze
PLACEHOLDER_PATTERN =
/\{([^{}]+)\}/

Instance Attribute Summary

Attributes inherited from Base

#connection_config

Class Method Summary collapse

Instance Method Summary collapse

Methods inherited from Base

#commented_sql, #post_insert_sql, #pre_insert_sql, #query_comment_text, #sql_query_comment, #to_copy_from_stdin

Constructor Details

#initialize(connection_config, logger) ⇒ MongodbAdapter

Returns a new instance of MongodbAdapter.



68
69
70
71
# File 'lib/exwiw/adapter/mongodb_adapter.rb', line 68

# Builds the adapter and prepares its per-instance state store.
#
# @param connection_config forwarded unchanged to Base#initialize
# @param logger forwarded unchanged to Base#initialize
def initialize(connection_config, logger)
  super(connection_config, logger)

  # Handed to every StreamingResult built by #execute (as `state:`), which
  # publishes the captured propagation-key values into it.
  @state = {}
end

Class Method Details

.table_config_classObject



13
14
15
# File 'lib/exwiw/adapter/mongodb_adapter.rb', line 13

# Returns Exwiw::MongodbCollectionConfig.
#
# NOTE(review): presumably the class callers use to build the per-collection
# config objects passed to this adapter — inferred from the method name;
# confirm against the caller.
def self.table_config_class
  Exwiw::MongodbCollectionConfig
end

Instance Method Details

#build_query(config, dump_target, config_by_name) ⇒ Object



85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
# File 'lib/exwiw/adapter/mongodb_adapter.rb', line 85

# Builds the find query for one top-level collection.
#
# Besides returning the query, this stashes two pieces of per-collection
# state on the adapter for the calls that follow it:
#   * @embedded_children_by_parent — consumed by the matching to_bulk_insert
#     call, because the Adapter contract does not hand config_by_name to
#     to_bulk_insert (SQL adapters don't need it).
#   * @propagation_keys — consumed by the matching #execute call.
# Both rely on the Runner invariant that build_query runs before
# execute / to_bulk_insert for the same config.
#
# @param config the collection config to query; must not be embedded
# @param dump_target provides table_name, ids and the optional ids_field
# @param config_by_name all configs, keyed by name
# @return [Exwiw::MongoQuery::Find]
# @raise [NotImplementedError] when +config+ is an embedded config
def build_query(config, dump_target, config_by_name)
  if config.embedded?
    message = "MongodbAdapter#build_query was called with embedded config '#{config.name}'. " \
              "Embedded configs are masked through the parent collection."
    raise NotImplementedError, message
  end

  reject_filter!(config)

  @embedded_children_by_parent = index_embedded_children(config_by_name)

  # The fields of this collection that downstream children will `$in`-match
  # against (always including primary_key).
  @propagation_keys = propagation_keys_for(config, config_by_name)

  targets_this_collection = config.name == dump_target.table_name

  filter =
    if !targets_this_collection
      # Not the dump target: the filter comes from related_collection_filter.
      related_collection_filter(config, config_by_name)
    elsif dump_target.ids_field
      # `--ids-field` overrides which field --ids is matched against. The
      # stored type of a custom field is unknown, so the textual ids are
      # passed through as Strings rather than guessed at — the caller
      # supplies values matching the field's actual type. This only changes
      # the filter on the target collection; downstream propagation still
      # keys off each child's `references` field (see #execute).
      { dump_target.ids_field => { "$in" => dump_target.ids } }
    else
      # Default: match on the primary key, whose stored type is known
      # (Mongoid's default ObjectId), so the ids are coerced.
      { config.primary_key => { "$in" => coerce_ids(dump_target.ids) } }
    end

  Exwiw::MongoQuery::Find.new(
    collection: config.name,
    primary_key: config.primary_key,
    filter: filter,
    projection: build_projection(config, @propagation_keys),
  )
end

#default_bulk_insert_chunk_sizeObject



201
202
203
# File 'lib/exwiw/adapter/mongodb_adapter.rb', line 201

# Returns DEFAULT_BULK_INSERT_CHUNK_SIZE — the number of documents serialized
# at once when a collection config carries no explicit bulk_insert_chunk_size.
def default_bulk_insert_chunk_size
  DEFAULT_BULK_INSERT_CHUNK_SIZE
end

#describe_query(query) ⇒ Object



181
182
183
184
185
# File 'lib/exwiw/adapter/mongodb_adapter.rb', line 181

# Renders a one-line, human-readable summary of a find query.
#
# Never raises a StandardError: if reading the query fails, a placeholder
# naming the error class and message is returned instead.
#
# @param query an object responding to collection, filter and projection
# @return [String]
def describe_query(query)
  parts = [
    "collection=#{query.collection}",
    "filter=#{query.filter.inspect}",
    "projection=#{query.projection.inspect}",
  ]
  "find #{parts.join(' ')}"
rescue StandardError => e
  "<unavailable: #{e.class}: #{e.message}>"
end

#dump_schema(ordered_tables, output_path) ⇒ Object



217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
# File 'lib/exwiw/adapter/mongodb_adapter.rb', line 217

# Writes a JavaScript schema script (header comment says to apply it with
# mongosh) to +output_path+.
#
# The script contains, in order:
#   1. one guarded `createCollection` per non-embedded config — emitted for
#      every config, so the target schema is created in full; error code 48
#      is swallowed by the generated try/catch;
#   2. one `createIndex` per index found on the source collection, skipping
#      the `_id_` index and copying only INDEX_OPTION_ALLOWLIST options plus
#      the index name.
#
# Listing indexes on a collection that does not exist raises
# NamespaceNotFound (code 26), and the configured schema may name
# collections absent from this database (schema/DB drift, or a sparse dev
# DB). The set of existing collections is therefore resolved once up front
# and indexes are emitted only for those.
#
# @param ordered_tables configs responding to embedded? and name
# @param output_path [String] file to (over)write
def dump_schema(ordered_tables, output_path)
  require 'json'

  top_level = ordered_tables.reject { |table| table.embedded? }
  present = db.database.collection_names.to_set

  File.open(output_path, 'w') do |out|
    out.puts("// Auto-generated by exwiw. Apply with: mongosh \"$MONGODB_URI\" #{File.basename(output_path)}")
    out.puts

    top_level.each do |table|
      quoted_name = JSON.generate(table.name)
      out.puts(%(try { db.createCollection(#{quoted_name}); } catch (e) { if (e.code !== 48) throw e; }))
    end
    out.puts

    top_level.each do |table|
      collection_name = table.name

      if present.include?(collection_name)
        quoted_name = JSON.generate(collection_name)

        db[collection_name].indexes.to_a.each do |index|
          index_name = index['name']
          next if index_name == '_id_'

          options = index.slice(*INDEX_OPTION_ALLOWLIST)
          options['name'] = index_name if index_name
          out.puts(%(db.getCollection(#{quoted_name}).createIndex(#{JSON.generate(index['key'])}, #{JSON.generate(options)});))
        end
      else
        @logger.debug("  Collection '#{collection_name}' is not present in the source database; emitting no indexes.")
      end
    end
  end

  @logger.info("  Wrote schema for #{top_level.size} collection(s) to #{output_path}.")
end

#dumpable?(config) ⇒ Boolean

Returns:

  • (Boolean)


73
74
75
# File 'lib/exwiw/adapter/mongodb_adapter.rb', line 73

# Reports whether +config+ can be dumped on its own: true for every config
# that is not embedded. (#build_query raises for embedded configs, stating
# that they are masked through the parent collection.)
#
# @param config an object responding to embedded?
# @return [Boolean]
def dumpable?(config)
  !config.embedded?
end

#execute(query) ⇒ Object



137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
# File 'lib/exwiw/adapter/mongodb_adapter.rb', line 137

# Runs +query+ as a find on its collection and returns a streaming result.
#
# The result set is deliberately NOT `.to_a`-ed into memory: the Runner pulls
# documents through `each_slice`, so only one chunk's worth is resident at a
# time even for large / embed-heavy collections. The propagation-key values
# are captured as the cursor streams and published into @state once the pass
# completes (see StreamingResult).
#
# @param query an object responding to collection, filter, projection and
#   primary_key
# @return [StreamingResult]
def execute(query)
  collection_name = query.collection

  @logger.debug("  Executing Mongo find on '#{collection_name}': filter=#{query.filter.inspect} projection=#{query.projection.inspect}")

  cursor = db[collection_name].find(query.filter)
  cursor = cursor.projection(query.projection)
  cursor = cursor.comment(query_comment_text("collection=#{collection_name}"))

  # Fields whose values children will `$in`-match against. Normally stashed
  # by the build_query call for this same collection; when execute is driven
  # without one (e.g. in isolation from a test) only the primary key is used.
  propagated = @propagation_keys
  propagated = [query.primary_key] unless propagated

  StreamingResult.new(view: cursor, collection: collection_name, keys: propagated, state: @state)
end

#explain(_query) ⇒ Object

Raises:

  • (NotImplementedError)


177
178
179
# File 'lib/exwiw/adapter/mongodb_adapter.rb', line 177

# Query explanation is not available for this adapter; always raises.
#
# @param _query ignored
# @raise [NotImplementedError] unconditionally
def explain(_query)
  reason = "MongodbAdapter does not support explain yet"
  raise NotImplementedError, reason
end

#output_extensionObject



187
188
189
# File 'lib/exwiw/adapter/mongodb_adapter.rb', line 187

# File extension for dump output: 'jsonl'. #to_bulk_insert emits one JSON
# document per line.
#
# @return [String]
def output_extension
  'jsonl'
end

#schema_output_extensionObject



205
206
207
# File 'lib/exwiw/adapter/mongodb_adapter.rb', line 205

# File extension for schema output: 'js'. #dump_schema writes a JavaScript
# script whose header says to apply it with mongosh.
#
# @return [String]
def schema_output_extension
  'js'
end

#supports_bulk_delete?Boolean

Returns:

  • (Boolean)


259
260
261
# File 'lib/exwiw/adapter/mongodb_adapter.rb', line 259

# Always false: this adapter has no bulk-delete support (#to_bulk_delete
# raises NotImplementedError).
#
# @return [Boolean]
def supports_bulk_delete?
  false
end

#to_bulk_delete(_query, _config) ⇒ Object

Raises:

  • (NotImplementedError)


173
174
175
# File 'lib/exwiw/adapter/mongodb_adapter.rb', line 173

# Bulk delete is not available for this adapter; always raises.
#
# @param _query ignored
# @param _config ignored
# @raise [NotImplementedError] unconditionally
def to_bulk_delete(_query, _config)
  reason = "MongodbAdapter does not support bulk delete"
  raise NotImplementedError, reason
end

#to_bulk_insert(rows, config) ⇒ Object

NOTE: relies on @embedded_children_by_parent set by a prior build_query call for the same config. This implicit ordering exists because the Adapter contract intentionally does not thread config_by_name through to_bulk_insert (SQL adapters don’t need it). Safe in Runner, fragile in tests — call build_query first.



165
166
167
168
169
170
171
# File 'lib/exwiw/adapter/mongodb_adapter.rb', line 165

# Serializes +rows+ as newline-separated JSON, one document per line.
#
# Each document is masked IN PLACE (apply_mask_plan! mutates it) before being
# converted with extended_json and serialized. No trailing newline is added;
# an empty +rows+ yields "".
#
# @param rows the documents of one chunk
# @param config the collection config the mask plan is derived from
# @return [String]
def to_bulk_insert(rows, config)
  masking = mask_plan(config)

  lines = rows.map do |document|
    apply_mask_plan!(document, masking)
    JSON.generate(extended_json(document))
  end

  lines.join("\n")
end

#validate_as_dump_target!(config) ⇒ Object

Raises:

  • (NotImplementedError)


77
78
79
80
81
82
83
# File 'lib/exwiw/adapter/mongodb_adapter.rb', line 77

# Rejects embedded configs as a dump target; does nothing (returns nil) for
# any other config.
#
# @param config an object responding to embedded? and name
# @raise [NotImplementedError] when +config+ is embedded
def validate_as_dump_target!(config)
  if config.embedded?
    message = "dump_target '#{config.name}' is an embedded MongodbCollectionConfig; " \
              "specify a top-level collection instead."
    raise NotImplementedError, message
  end
end