Class: Pubid::Csa::Parser
- Inherits:
-
Parslet::Parser
- Object
- Parslet::Parser
- Pubid::Csa::Parser
- Defined in:
- lib/pubid/csa/parser.rb
Instance Method Summary
-
#parse(input) ⇒ Object
Preprocessing to normalize input.
Instance Method Details
#parse(input) ⇒ Object
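Preprocesses the input to normalize it before parsing: removes CONSOLIDATED notation, inserts the missing space before a reaffirmation such as (R2009), rewrites CEI/IEC and CEI to IEC, and rewrites the CAN/CSA- and CAN3- prefixes to CSA while remembering the original publisher prefix.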
368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 |
# File 'lib/pubid/csa/parser.rb', line 368

# Normalizes a raw identifier string and hands it to the superclass parser.
#
# Normalization steps, in order:
# 1. reject comment lines (first non-blank character is "#");
# 2. drop "CONSOLIDATED" / "(CONSOLIDATED)" notation;
# 3. insert the missing space before a reaffirmation, e.g. "04(R2009)" -> "04 (R2009)";
# 4. rewrite "CEI/IEC" and a standalone "CEI" to "IEC";
# 5. remember the original publisher prefix ("CAN/CSA-", "CAN3-" or "CSA");
# 6. rewrite "CAN/CSA-" and "CAN3-" to "CSA ";
# 7. collapse whitespace runs and trim both ends.
#
# "NO." is deliberately left untouched so it can be parsed as a separate
# identifier component.
#
# @param input [String] raw identifier text
# @return [Object] whatever the superclass +parse+ returns for the normalized
#   string; when that is a Hash and a publisher prefix was detected, the result
#   is first passed to +inject_publisher_prefix+ (defined outside this method)
# @raise [Parslet::ParseFailed] if the input is a comment line
def parse(input)
  # Skip comment lines
  raise Parslet::ParseFailed, "Comment line" if input.strip.start_with?("#")

  # Remove CONSOLIDATED notation FIRST (before other processing)
  normalized = input.gsub(/\s*\(\s*CONSOLIDATED\s*\)\s*/, " ")
  normalized = normalized.gsub(/\s*\bCONSOLIDATED\b\s*/, " ")

  # Fix missing space before reaffirmation
  # Matches: XX(RYYYY), XXX(RYY), or any non-space char before (R
  # Examples: 04(R2009) -> 04 (R2009), 16(R24) -> 16 (R24)
  normalized = normalized.gsub(/(\S)\(R(\d{2,4})\)/, '\1 (R\2)')

  # DO NOT normalize NO., let it be parsed as a separate identifier component

  # Normalize CEI/IEC to IEC (CEI is French name for IEC)
  normalized = normalized.gsub("CEI/IEC", "IEC")
  normalized = normalized.gsub(/\bCEI\b/, "IEC")

  # Track original publisher prefix.
  # Detection runs on a left-trimmed copy: leading whitespace in the input, or
  # the single space left behind by removing a leading "CONSOLIDATED", would
  # otherwise hide the prefix from start_with?.
  leading = normalized.lstrip
  publisher_prefix = if leading.start_with?("CAN/CSA-")
                       "CAN/CSA-"
                     elsif leading.start_with?("CAN3-")
                       "CAN3-"
                     elsif leading.start_with?("CSA ")
                       "CSA"
                     end

  # Normalize CAN/CSA- and CAN3- to CSA (preserving space before code)
  normalized = normalized.gsub("CAN/CSA-", "CSA ")
  normalized = normalized.gsub("CAN3-", "CSA ")

  # Clean up extra spaces
  normalized = normalized.gsub(/\s+/, " ").strip

  result = super(normalized)

  # Inject publisher_prefix if we have one
  inject_publisher_prefix(result, publisher_prefix) if publisher_prefix && result.is_a?(Hash)

  result
end