Class: Pubid::Ashrae::Parser

Inherits:
Parslet::Parser
  • Object
show all
Defined in:
lib/pubid/ashrae/parser.rb

Overview

Parser class for ASHRAE identifiers. Single responsibility: parsing ASHRAE identifier syntax.

Class Method Summary collapse

Class Method Details

.parse(string) ⇒ Object



652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
# File 'lib/pubid/ashrae/parser.rb', line 652

# Cleans up a raw ASHRAE identifier string and hands the result to an
# instance-level +parse+.
#
# Clean-up steps, in order:
# 1. strip surrounding whitespace;
# 2. for fixture input of the form "!original!canonical", keep only the
#    original part;
# 3. collapse every whitespace run to a single space (this also normalizes
#    spacing inside reaffirmation markers such as "(RA   2010)");
# 4. drop one trailing period or comma;
# 5. replace every "))" with ")" and append any missing closing parentheses;
# 6. remove whitespace that follows a dash;
# 7. normalize "and" inside addenda code lists (see
#    {.normalize_addenda_code_list}).
#
# @param string [String] raw identifier text
# @return [Object] whatever +new.parse+ returns for the cleaned string
def self.parse(string)
  cleaned = string.strip

  # Fixture format "!original!canonical": splitting on "!" yields
  # ["", original, canonical]; only the original identifier is parsed.
  if cleaned.start_with?("!")
    parts = cleaned.split("!")
    cleaned = parts[1] if parts.size >= 3
  end

  # Collapse whitespace; strip again because the extracted fixture part may
  # carry its own leading/trailing padding.
  cleaned = cleaned.gsub(/\s+/, " ").strip

  # Remove a single trailing period or comma.
  cleaned = cleaned.sub(/[,.]\z/, "")

  # Collapse doubled closing parentheses (typos in source data).
  cleaned = cleaned.gsub("))", ")")

  # Close parentheses left open by truncated source data.
  missing = cleaned.count("(") - cleaned.count(")")
  cleaned += ")" * missing if missing.positive?

  # "2010- 2013" -> "2010-2013": drop whitespace following a dash.
  cleaned = cleaned.gsub(/-\s+/, "-")

  new.parse(normalize_addenda_code_list(cleaned))
end

# Rewrites ", and" / " and" as ", " inside an addenda code list so that the
# word "and" is not consumed as an addendum code.
#
# The code list is taken to end at the first " to", " for" or " (" found at
# or after the "Addenda"/"Addendum" keyword; text from that boundary onward
# is left untouched so descriptive text keeps its "and". When no such
# boundary exists the replacement applies to the whole string. Afterwards,
# "Addendum x, ..." (singular keyword followed by a comma-separated list) is
# rewritten to "Addenda x, ...".
#
# Strings without an "Addenda "/"Addendum " keyword are returned unchanged.
#
# @api private
# @param text [String] whitespace-normalized identifier
# @return [String] identifier with the addenda code list normalized
def self.normalize_addenda_code_list(text)
  keyword_pos = text.index(/\bAddend(?:a|um)\s+/)
  return text unless keyword_pos

  # Start the search at the keyword: a "(", "to" or "for" that precedes it
  # (e.g. "(RA 2006) Addenda a and b") cannot terminate the code list.
  # The \b keeps "to"/"for" from matching inside longer words.
  boundary = text.match(/\s+(?:(?:to|for)\b|\()/, keyword_pos)
  split_at = boundary ? boundary.begin(0) : text.length

  code_part = text[0...split_at]
  remainder = text[split_at..]

  code_part = code_part.gsub(/,\s+and\s+/i, ", ").gsub(/\s+and\s+/i, ", ")

  # "Addendum a, b" -> "Addenda a, b"
  (code_part + remainder).gsub(/\bAddendum\s+([a-z]+)\s*,\s*/i, "Addenda \\1, ")
end