Class: Pubid::Csa::Parser

Inherits:
Parslet::Parser
  • Object
show all
Defined in:
lib/pubid/csa/parser.rb

Instance Method Summary collapse

Instance Method Details

#parse(input) ⇒ Object

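Normalizes the input string (CONSOLIDATED markers, reaffirmation spacing, CEI/IEC naming, publisher prefix, whitespace) and then parses it.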

Raises:

  • (Parslet::ParseFailed)


368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
# File 'lib/pubid/csa/parser.rb', line 368

# Normalizes a raw identifier string and hands it to the parent parser.
#
# Preprocessing steps, in order:
# 1. reject comment lines (first non-blank character is "#");
# 2. drop "(CONSOLIDATED)" / "CONSOLIDATED" markers;
# 3. insert the missing space before a reaffirmation such as "(R2009)";
# 4. rewrite "CEI/IEC" and standalone "CEI" to "IEC";
# 5. record which publisher prefix the identifier starts with;
# 6. rewrite "CAN/CSA-" and "CAN3-" to "CSA ";
# 7. collapse whitespace runs and trim both ends.
#
# @param input [String] raw identifier text
# @return [Object] the value returned by the parent +parse+; when it is a
#   Hash and a publisher prefix was recognised, +inject_publisher_prefix+
#   is called with the result and the prefix first
# @raise [Parslet::ParseFailed] if the input is a comment line
#   (NOTE(review): the parent +parse+ presumably raises it as well on a
#   grammar mismatch — confirm)
def parse(input)
  # Skip comment lines
  raise Parslet::ParseFailed, "Comment line" if input.strip.start_with?("#")

  # Remove CONSOLIDATED notation FIRST (before other processing)
  normalized = input.gsub(/\s*\(\s*CONSOLIDATED\s*\)\s*/, " ")
  normalized = normalized.gsub(/\s*\bCONSOLIDATED\b\s*/, " ")

  # Fix missing space before reaffirmation
  # Matches: XX(RYYYY), XXX(RYY), or any non-space chars before (R
  # Examples: 04(R2009) -> 04 (R2009), 16(R24) -> 16 (R24)
  normalized = normalized.gsub(/(\S)\(R(\d{2,4})\)/, '\1 (R\2)')

  # DO NOT normalize NO., let it be parsed as a separate identifier component

  # Normalize CEI/IEC to IEC (CEI is French name for IEC)
  normalized = normalized.gsub("CEI/IEC", "IEC")
  normalized = normalized.gsub(/\bCEI\b/, "IEC")

  # Track original publisher prefix.
  # Detection runs on a left-trimmed view: the string may still carry leading
  # whitespace here (from the caller, or left behind when a leading
  # CONSOLIDATED marker was replaced by a space), and that whitespace is only
  # removed by the final cleanup below. Without trimming, such inputs would
  # parse but silently lose their prefix.
  leading = normalized.lstrip
  publisher_prefix = if leading.start_with?("CAN/CSA-")
                       "CAN/CSA-"
                     elsif leading.start_with?("CAN3-")
                       "CAN3-"
                     elsif leading.start_with?("CSA ")
                       "CSA"
                     end

  # Normalize CAN/CSA- and CAN3- to CSA (preserving space before code)
  normalized = normalized.gsub("CAN/CSA-", "CSA ")
  normalized = normalized.gsub("CAN3-", "CSA ")

  # Clean up extra spaces
  normalized = normalized.gsub(/\s+/, " ").strip

  # Parse the normalized text with the parent parser
  result = super(normalized)

  # Inject publisher_prefix if we have one
  if publisher_prefix && result.is_a?(Hash)
    inject_publisher_prefix(result, publisher_prefix)
  end

  result
end