Class: Pubid::Ieee::Parser

Inherits:
Parslet::Parser
  • Object
show all
Defined in:
lib/pubid/ieee/parser.rb

Overview

Parser class for IEEE identifiers. Single responsibility: parsing IEEE identifier syntax. Note: IEEE identifier syntax is extremely complex, with many edge cases.

Class Method Summary collapse

Class Method Details

.parse(string) ⇒ Object



797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
# File 'lib/pubid/ieee/parser.rb', line 797

# Normalizes a raw IEEE identifier string and hands the result to a parser
# instance.
#
# The method is a fixed pipeline of text substitutions (typo repairs, spacing
# and punctuation normalization, publisher reordering, title stripping)
# followed by +new.parse+. The substitutions are ORDER-DEPENDENT: later rules
# see the output of earlier ones, so a rule must be written against the
# already-normalized text, not against the raw input.
#
# Whitespace (including newlines) is collapsed to single spaces early on, so
# from that point the +^+ / +$+ anchors effectively refer to the start / end
# of the whole string.
#
# @param string [String] raw identifier text (may carry a ".pdf" suffix,
#   HTML entities, typos, inconsistent spacing, ...)
# @return [Object] whatever the parser instance's +#parse+ returns for the
#   cleaned text
# @raise [NoMethodError] if +string+ does not respond to +sub+ (e.g. nil)
# NOTE(review): parse failures presumably surface as the parser library's own
#   error from +new.parse+ - confirm against callers.
def self.parse(string)
  # Strip a trailing ".pdf" extension (case-insensitive).
  cleaned = string.sub(/\.pdf$/i, "")

  # IEC and ANSI identifiers are deliberately NOT filtered here: they can be
  # IEEE co-publications or adoptions, and that decision is made elsewhere.

  # Joint-development drafts sometimes use "_" instead of "/" before an ISO
  # stage code: "P1234_FDIS" -> "P1234/FDIS".
  cleaned = cleaned.gsub(/_(FDIS|CDV|CD|DIS|WD|PWI|NP)/, '/\1')

  # Collapse every whitespace run (spaces, tabs, newlines) to one space.
  cleaned = cleaned.gsub(/\s+/, " ")

  # Conservative dash/year spacing repairs. " - " (space-dash-space) is left
  # alone here because it is valid formatting in titles.
  cleaned = cleaned.gsub(/(\d)\s+-(\d{4})\b/, '\1-\2')  # "C37.101 -2006" -> "C37.101-2006"
  cleaned = cleaned.gsub(/(\d)-\s+(\d{4})\b/, '\1-\2')  # "C62.35- 2010" -> "C62.35-2010"

  # En dash, either as the "&ndash;" entity or as the literal character
  # (U+2013), becomes a plain hyphen. An en dash directly followed by "-" is
  # handled by the second rule so that we never create "--".
  cleaned = cleaned.gsub(/(?:&ndash;|\u2013)(?!-)/, "-")
  cleaned = cleaned.gsub(/(?:&ndash;|\u2013)-/, "-")

  # Remove a stray "!" in front of the publisher.
  cleaned = cleaned.gsub(/^!IEEE /, "IEEE ")

  # "IEEE/ ASTM" -> "IEEE/ASTM" (extra space after slash).
  cleaned = cleaned.gsub("IEEE/ ASTM", "IEEE/ASTM")

  # HTML entities and their literal equivalents.
  # The ampersand rules used to replace "&" with "&" (a no-op), so encoded
  # ampersands were never decoded; they now decode "&amp;amp;" and "&amp;".
  cleaned = cleaned.gsub(/&trade;|\u2122/, "")   # trademark sign: "C57.110(tm)-2018" -> "C57.110-2018"
  cleaned = cleaned.gsub(/&rsquo;|\u2019/, "'")  # typographic apostrophe -> ASCII apostrophe
  cleaned = cleaned.gsub("&amp;amp;", "&")       # double-encoded ampersand
  cleaned = cleaned.gsub("&amp;", "&")           # single-encoded ampersand

  # Wrap a trailing "P&V" marker in parentheses:
  # "IEEE Std 500-1984 P&V" -> "IEEE Std 500-1984 (P&V)"
  cleaned = cleaned.gsub(/\s+(P&V)\s*$/, ' (\1)')

  # Space in the middle of a dotted number: "C57.1 2.25" -> "C57.12.25"
  cleaned = cleaned.gsub(/(\d+\.\d+)\s+(\d+\.)/, '\1\2')

  # Space inside a four-digit year: "1 996" -> "1996"
  cleaned = cleaned.gsub(/\b(1|2)\s+(\d{3})\b/, '\1\2')

  # Month name glued to a year: "March2016" -> "March 2016"
  cleaned = cleaned.gsub(
    /\b(January|February|March|April|May|June|July|August|September|October|November|December)(\d{4})\b/, '\1 \2'
  )
  # Same for abbreviated month names.
  cleaned = cleaned.gsub(
    /\b(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec)(\d{4})\b/, '\1 \2'
  )

  # "IEC <number> [First Edition YYYY-MM] IEEE <id>" -> insert "; " between
  # the IEC part and the IEEE part:
  # "IEC 62539 First Edition 2007-07 IEEE 930" -> "IEC 62539 First Edition 2007-07; IEEE 930"
  cleaned = cleaned.gsub(
    /(IEC\s+\d+(?:-\d+)?(?:\s+First?\s+Edition\s+\d{4}-\d{2})?)\s+(IEEE\s+\S+)/, '\1; \2'
  )

  # (A second "remove literal trademark symbol" statement used to live here
  # with an empty pattern, which made it a no-op. The trademark sign is
  # removed by the entity step above.)

  # Very specific year typo: "19969" -> "1969"
  cleaned = cleaned.gsub(/\b19969\b/, "1969")

  # Missing space after a comma between identifiers:
  # "802.3ch-2020,802.3ca-2020" -> "802.3ch-2020, 802.3ca-2020"
  cleaned = cleaned.gsub(/(\d{4}),(\d{3})/, '\1, \2')

  # Lowercase "l" typed for "I" / "1": "1003.1/2003.l/lNT" -> "1003.1/2003.1/INT"
  cleaned = cleaned.gsub(/\/lNT\b/, "/INT")
  cleaned = cleaned.gsub(".l/", ".1/")

  # Letters I and O typed for digits: "IEEE 1076-CONC-I99O" -> "IEEE 1076-CONC-1990"
  cleaned = cleaned.gsub(/\bI99O\b/, "1990")

  # Misspelled publisher prefix at the start of the string.
  cleaned = cleaned.gsub(/^EEE /, "IEEE ")
  cleaned = cleaned.gsub(/^I EEE /, "IEEE ")  # "I EEE" (stray space)
  cleaned = cleaned.gsub(/^lEEE /, "IEEE ")   # "lEEE" (lowercase L for I)

  # First parenthesis repair: exactly one unclosed "(" and the text does not
  # already end with ")" -> append one. This runs BEFORE the trailing
  # comma/colon cleanup below, so it must stay where it is.
  open_count = cleaned.count("(")
  close_count = cleaned.count(")")
  if open_count == close_count + 1 && !cleaned.end_with?(")")
    cleaned = "#{cleaned})"
  end

  # Trailing noise.
  cleaned = cleaned.gsub(/,\s*Standard\s*$/, "") # ", Standard" at end
  cleaned = cleaned.gsub(/[,:]\s*$/, "") # trailing comma/colon
  cleaned = cleaned.gsub(/,\s+and\s+IEEE\s+Std\s/, " and ") # ", and IEEE Std " -> " and "

  # Second parenthesis repair (general): close every unclosed "(" at the end,
  # or drop surplus ")" - but only when the surplus sits at the very end.
  open_count = cleaned.count("(")
  close_count = cleaned.count(")")

  if open_count > close_count
    missing = open_count - close_count
    cleaned = cleaned + (")" * missing)
  elsif close_count > open_count
    extra = close_count - open_count
    cleaned = cleaned.sub(/\){#{extra}}$/, "")
  end

  # === Simple normalizations ===

  # Missing dash before a year: "802.16 2007" -> "802.16-2007".
  # Requires a DIGIT before the space, which is what keeps "March 2016" (and
  # letter-suffixed numbers such as "802.16g 2007") untouched; the year must
  # be followed by "(" or the end of the string.
  cleaned = cleaned.gsub(/(\d)\s+(\d{4})(?=\s*\(|\s*$)/, '\1-\2')

  # Space-dash-space before a year: "802.1ag - 2007" -> "802.1ag-2007".
  # Every later rule therefore sees "-YYYY", never " - YYYY".
  cleaned = cleaned.gsub(/\s+-\s+(\d{4})\b/, '-\1')

  # Missing "Std": "IEEE 1070-1995" -> "IEEE Std 1070-1995" (start of string,
  # IEEE directly followed by a digit). Every later rule therefore sees
  # "IEEE Std <digits>", never "IEEE <digits>".
  cleaned = cleaned.gsub(/^IEEE\s+(?!Std\b)(\d)/, 'IEEE Std \1')

  # "IEEE No." / "IEEE No" -> "IEEE Std": "IEEE No. 264-1968" -> "IEEE Std 264-1968".
  # AIEE is intentionally not converted; "No" is its standard format.
  cleaned = cleaned.gsub(/^IEEE\s+No\.\s*/, "IEEE Std ")
  cleaned = cleaned.gsub(/^IEEE\s+No\s/, "IEEE Std ")

  # Whitespace before any slash is removed: "262-1973 /ANSI" -> "262-1973/ANSI".
  # Every later rule therefore never sees whitespace in front of "/".
  cleaned = cleaned.gsub(/\s+\//, "/")

  # ", 1998 Edition" -> "-1998"
  cleaned = cleaned.gsub(/,\s+(\d{4})\s+Edition/, '-\1')

  # "ISO/IEC15802" -> "ISO/IEC 15802"
  cleaned = cleaned.gsub(/(ISO\/IEC)(\d)/, '\1 \2')

  # === Publisher order ===

  # "IEEE Std ANSI/IEEE ..." -> "ANSI/IEEE Std ..."
  cleaned = cleaned.gsub(/^IEEE\s+Std\s+(ANSI\/IEEE)/, '\1 Std')

  # === Dual-published formats ===

  # Trailing "; ORG ..." becomes a parenthetical:
  # "IEEE Std 120-1955; ASME PTC 19.6-1955" -> "IEEE Std 120-1955 (ASME PTC 19.6-1955)"
  # Only when a semicolon is followed by at least two capital letters.
  if cleaned.match?(/;\s+[A-Z]{2,}/)
    cleaned = cleaned.sub(/;\s+([A-Z][^;]+)$/, ' (\1)')
  end

  # === Edition / reaffirmation abbreviations ===

  # ", 1999 Edn. (Reaff 2003)" -> "-1999 (R2003)"
  cleaned = cleaned.gsub(/,\s+(\d{4})\s+Edn\.\s+\(Reaff\s+(\d{4})\)/,
                         '-\1 (R\2)')
  # Same without the leading comma: "1999 Edn. (Reaff 2003)" -> "1999 (R2003)"
  cleaned = cleaned.gsub(/(\d{4})\s+Edn\.\s+\(Reaff\s+(\d{4})\)/,
                         '\1 (R\2)')

  # "(Reaffirmed 1980, 56 IRE 28.S2)" -> "(R1980) (56 IRE 28.S2)"
  cleaned = cleaned.gsub(/\(Reaffirmed\s+(\d{4}),\s+(\d+\s+IRE[^)]+)\)/,
                         '(R\1) (\2)')

  # "<year>/ANSI <id>" -> "<year> (ANSI <id>)", up to the next "(" or the end.
  cleaned = cleaned.gsub(%r{(\d{4})/ANSI\s+([^(]+)(?=\s*\(|$)},
                         '\1 (ANSI \2)')

  # "ISO/IEC TR11802" -> "ISO/IEC TR 11802"
  cleaned = cleaned.gsub(/(ISO\/IEC\s+TR)(\d)/, '\1 \2')

  # === AIEE dual numbers ===

  # "AIEE Nos 72 and 73 - 1932" -> "AIEE No 72-1932 and AIEE No 73-1932".
  # The space-dash-space rule above has already turned " - 1932" into
  # "-1932", so the whitespace around the dash must be optional here;
  # requiring it made this expansion unreachable.
  cleaned = cleaned.sub(/AIEE\s+Nos\s+(\d+)\s+and\s+(\d+)\s*-\s*(\d{4})/,
                        'AIEE No \1-\3 and AIEE No \2-\3')

  # === Typos, symbols, suffixes ===

  # "Stad" -> "Std"
  cleaned = cleaned.gsub(/\bStad\b/, "Std")

  # Lowercase "std" after a publisher -> "Std"
  cleaned = cleaned.gsub(/\b(IEEE|ANSI|AIEE)\s+std\b/, '\1 Std')

  # Textual trademark marker.
  cleaned = cleaned.gsub("(TM)", "")

  # Bare project number with draft: "62704-4/D4, 2020" -> "IEEE P62704-4/D4, 2020"
  if cleaned.match?(/^(\d+[-.]\d+)\/D\d+/)
    cleaned = "IEEE P#{cleaned}"
  end

  # "/Preprint" is dropped.
  cleaned = cleaned.gsub(/\/Preprint\b/, "")

  # "Proposed Revision of" -> "Revision of"
  cleaned = cleaned.gsub("Proposed Revision of", "Revision of")

  # "ammended" -> "amended" (any letter case is matched; replacement is lowercase)
  cleaned = cleaned.gsub(/\bammended\b/i, "amended")

  # Period directly after "/INT" or "/Cor N-YYYY" is removed.
  cleaned = cleaned.gsub(/(\/INT|\/Cor\s+\d+-\d{4})\./, '\1')

  # "/Conformance" spacing is intentionally NOT preprocessed: an earlier
  # attempt broke valid patterns such as "802.16/Conformance01-2003".

  # "1003.1/INT, March 1994 Edition" -> "1003.1/INT, March 1994"
  cleaned = cleaned.gsub(/(\/INT),\s+([A-Z][a-z]+)\s+(\d{4})\s+Edition/,
                         '\1, \2 \3')

  # Trailing " Ed." is removed: "Dec. 1994 Ed." -> "Dec. 1994"
  cleaned = cleaned.gsub(/\s+Ed\.\s*$/, "")

  # === Higher-impact preprocessing ===

  # "IEEE Std. 100" -> "IEEE Std 100"
  cleaned = cleaned.gsub(/\bStd\.\s+/, "Std ")

  # " - Redline..." and everything after it is removed.
  cleaned = cleaned.gsub(/\s+-\s+Redline\b.*$/, "")

  # Title after the year is removed: "YYYY [(...)] - IEEE Standard for ..."
  cleaned = cleaned.gsub(
    /(\d{4})(\s+\([^)]+\))?\s+-\s+IEEE\s+Standard\s+for.*$/, '\1\2'
  )

  # Fix 2A: "IEEE PC37.20.9/D7.3A" -> "IEEE Std PC37.20.9/D7.3A" so that the
  # "PC<digits>" project number is routed through the standard-identifier path.
  cleaned = cleaned.gsub(/^IEEE\s+PC(\d)/, 'IEEE Std PC\1')
  # Whitespace-only normalization of the "Unapproved Draft" prefix.
  cleaned = cleaned.gsub(/^IEEE\s+Unapproved\s+Draft\s+Std\s+PC(\d)/,
                         'IEEE Unapproved Draft Std PC\1')

  # Fix 2B: only the "IEEE P<n> and ASHRAE" co-publication form gets "Std"
  # inserted; plain "IEEE P..." needs no preprocessing.
  cleaned = cleaned.gsub(/^IEEE\s+P(\d+)\s+and\s+ASHRAE/,
                         'IEEE Std P\1 and ASHRAE')

  # Fix 2C: "ISO/IEC XXXX-YYYY: Title" -> title after the colon is removed.
  cleaned = cleaned.gsub(/^(ISO\/IEC \d+[-.]\d+-\d{4}):.*$/, '\1')
  cleaned = cleaned.gsub(/^(ISO\/IEC \d+-\d{4}):.*$/, '\1')

  # Fix 2D: "ISO/IEC XXXX : YYYY" -> "ISO/IEC XXXX:YYYY"
  cleaned = cleaned.gsub(/^(ISO\/IEC \d+[-.]\d*)\s*:\s*(\d{4})/, '\1:\2')
  cleaned = cleaned.gsub(/^(ISO\/IEC \d+)\s*:\s*(\d{4})/, '\1:\2')

  # Fix 2G: "IEC/IEEE PXXX_D5" -> "IEC/IEEE PXXX/D5"
  cleaned = cleaned.gsub(/^(IEC\/IEEE P[\w.-]+)_D/, '\1/D')

  # Fix 2I: IEEE-first joint publisher prefixes are reordered.
  cleaned = cleaned.gsub(/^IEEE\/ISO\/IEC\s+(P[\w.-]+)/,
                         'ISO/IEC/IEEE \1')
  cleaned = cleaned.gsub(/^IEEE\/IEC\/ISO\s+(P[\w.-]+)/,
                         'IEC/ISO/IEEE \1')

  # Fix 2J: "IEEE/IEC PXXX D5" -> "IEEE/IEC PXXX/D5" (also for stage codes).
  cleaned = cleaned.gsub(/^(IEEE\/IEC P[\w.-]+)\s+D(\d)/, '\1/D\2')
  cleaned = cleaned.gsub(
    /^(IEEE\/IEC P[\w.-]+)\s+(CDV|FDIS|CD|DIS|ED\d)/, '\1/\2'
  )

  # Fix 2K: "ISO /IEC/IEEE" -> "ISO/IEC/IEEE". The generic space-before-slash
  # rule above normally handles this already; kept as a safety net.
  cleaned = cleaned.gsub(/^ISO\s+\/IEC\/IEEE/, "ISO/IEC/IEEE")
  cleaned = cleaned.gsub(/^ISO\s+\/IEC/, "ISO/IEC")

  # Fix 2L: "IS0/" (digit zero) -> "ISO/"
  cleaned = cleaned.gsub(/^IS0\//, "ISO/")

  # Fix 2M: "IEEE-P15026-3-DIS-January 2015" -> "ISO/IEC/IEEE P15026-3/DIS, January 2015"
  cleaned = cleaned.gsub(/^IEEE-P(\d+)-(\d+)-DIS-(.*)/,
                         'ISO/IEC/IEEE P\1-\2/DIS, \3')

  # Fix 2N: "IEEE/CSA P844.1/293.1/D2" -> "IEEE/CSA P844.1/D2"
  # (the second, CSA-side number is dropped).
  cleaned = cleaned.gsub(/^IEEE\/CSA\s+(P[\d.]+)\/([\d.]+)\/D(\d+)/,
                         'IEEE/CSA \1/D\3')

  # Fix 2O: whitespace-only normalization of the "Approved Draft" prefix.
  cleaned = cleaned.gsub(/^IEEE\s+Approved\s+Draft\s+Std\s+(P\d)/,
                         'IEEE Approved Draft Std \1')
  # "IEEE Approved Draft Std P1234 / D12" -> "...P1234/D12". The space BEFORE
  # the slash has already been removed by the generic rule above, so only the
  # space after it may remain; whitespace is optional on both sides.
  cleaned = cleaned.gsub(/^(IEEE Approved Draft Std P[\w.-]+)\s*\/\s*D/,
                         '\1/D')

  # Fix 2Q: AIEE format variations.
  # "AIEE No.1C-1954" -> "AIEE No. 1C-1954"
  cleaned = cleaned.gsub(/^AIEE\s+No\.\s*(\d)/, 'AIEE No. \1')
  # "AIEE no 700-1945" -> "AIEE No 700-1945"
  cleaned = cleaned.gsub(/^AIEE\s+no\s/, "AIEE No ")
  # "AIEE Std No. 800" -> "AIEE Standard No 800"
  cleaned = cleaned.gsub(/^AIEE\s+Std\s+No\.\s*/, "AIEE Standard No ")

  # Fix 2R: "IEEE PSI 10/D2" -> "IEEE/ASTM PSI 10/D2"
  cleaned = cleaned.gsub(/^IEEE\s+PSI\s+(\d)/, 'IEEE/ASTM PSI \1')

  # Fix 2S: "IEEE/IEC P62271-111/PC37.60_D5" -> "..._D" becomes "/D"
  cleaned = cleaned.gsub(/^(IEEE\/IEC P[\d.-]+\/PC[\d.]+)_D/, '\1/D')

  # Fix 2T: "IEC P62271-111/IEEE PC37.60_D5" -> "IEC/IEEE PC37.60/D5".
  # NOTE(review): the IEC project number (first capture) is discarded by the
  # replacement - confirm that this is intended.
  cleaned = cleaned.gsub(/^IEC\s+(P[\d.-]+)\/IEEE\s+(PC[\d.]+)_D/,
                         'IEC/IEEE \2/D')

  # Fix 2U: "IEC/IEC P..." -> "IEC/IEEE P..."
  cleaned = cleaned.gsub(/^IEC\/IEC\s+(P\d)/, 'IEC/IEEE \1')

  # Fix 2V: "NACE SPXXXX-YYYY/IEEE Std NNNN-YYYY" -> "NACE SPXXXX-YYYY (IEEE Std NNNN-YYYY)"
  cleaned = cleaned.gsub(/^(NACE\s+SP\d+-\d+)\/(IEEE\s+Std\s+\d+-\d+)$/,
                         '\1 (\2)')

  # Fix 2W: any remaining "Edn." -> "Edition"
  cleaned = cleaned.gsub("Edn.", "Edition")

  # Fix 2Y: "ASHARE" -> "ASHRAE"
  cleaned = cleaned.gsub("ASHARE", "ASHRAE")

  # Fix 2Z: leading "PC<digit>" -> "P<digit>".
  # NOTE(review): this also strips the "C" from C-series numbers such as
  # "PC37.30.2" - confirm that this is intended.
  cleaned = cleaned.gsub(/^PC(\d)/, 'P\1')

  # Fix 2AA: "IEEE/ISO/IEC 8802-..." -> "ISO/IEC/IEEE 8802-..."
  cleaned = cleaned.gsub(/^IEEE\/ISO\/IEC\s+(8802[\w.-]+)/,
                         'ISO/IEC/IEEE \1')

  # Fix 2AB: "IEEE C57.139/D14June 2010" -> "IEEE C57.139/D14, June 2010"
  cleaned = cleaned.gsub(
    /^(IEEE\s+C?\d[\d.]*\/D\d+)([A-Z][a-z]+\s+\d{4})/, '\1, \2'
  )

  # Fix 2AC: "ANSI/IEEE Std: Title" -> "ANSI/IEEE Std"
  cleaned = cleaned.gsub(/^(ANSI\/IEEE Std):\s+.*$/, '\1')

  # Fix 2AD: "IEEE 1076 IEC 61691-1-1 First edition 2004-10" -> semicolon
  # between the IEEE and IEC parts. By this point "Std" has already been
  # inserted after "IEEE", so it is accepted (optionally) here; without that
  # the rule could never match.
  cleaned = cleaned.gsub(
    /^(IEEE\s+(?:Std\s+)?[\d.]+)\s+(IEC\s+\d+[-\d]*\s+.*edition\s+\d{4}-\d{2})$/i, '\1; \2'
  )

  # Fix 2AE: "IEEE No 29-1941 / ASA C77.1-1943" -> "IEEE Std 29-1941 (ASA C77.1-1943)".
  # Earlier rules have already rewritten that input to
  # "IEEE Std 29-1941/ ASA C77.1-1943" ("No" -> "Std", space before the slash
  # removed), so the rule matches that normalized shape. Whitespace after the
  # slash is required to keep the rule narrow.
  cleaned = cleaned.gsub(/^IEEE\s+Std\s+(\d+-\d+)\/\s+ASA\s+(.*)/,
                         'IEEE Std \1 (ASA \2)')

  new.parse(cleaned)
end