swiftlang · Azoy · Jan 3, 2026 · Dec 10, 2025 · Dec 15, 2025 · Dec 16, 2025
@@ -15,7 +15,7 @@
 #if _runtime(_ObjC)
 import Foundation
 
-// Cache of opened files 
+// Cache of opened files
 var cachedFiles: [String: String] = [:]
 
 func readInputFile(_ filename: String) -> String {
@@ -128,10 +128,10 @@ func parseBinaryProperties(
 
     let info = line.split(separator: "#")
     let components = info[0].split(separator: ";")
-    
+
     // Get the property first because we may not care about it.
     let filteredProperty = components[1].filter { !$0.isWhitespace }
-    
+
     guard availableBinaryProperties.contains(filteredProperty) else {
       continue
     }
@@ -187,12 +187,12 @@ func parseNumericTypes(
     guard !line.hasPrefix("#") else {
       continue
     }
-    
+
     let info = line.split(separator: "#")
     let components = info[0].split(separator: ";")
-    
+
     let filteredProperty = components[1].filter { !$0.isWhitespace }
-    
+
     let numericType: Unicode.NumericType
 
     switch filteredProperty {
@@ -205,9 +205,9 @@ func parseNumericTypes(
     default:
       continue
     }
-    
+
     let filteredScalars = components[0].filter { !$0.isWhitespace }
-    
+
     let scalars = parseScalars(String(filteredScalars))
 
     for scalar in scalars {
@@ -225,12 +225,12 @@ func parseNumericValues(
     guard !line.hasPrefix("#") else {
       continue
     }
-    
+
     let info = line.split(separator: "#")
     let components = info[0].split(separator: ";")
-    
+
     let filteredProperty = components[3].filter { !$0.isWhitespace }
-    
+
     let value: Double
 
     // If we have a division, split the numerator and denominator and perform
@@ -247,7 +247,7 @@ func parseNumericValues(
     }
 
     let filteredScalars = components[0].filter { !$0.isWhitespace }
-    
+
     let scalars = parseScalars(String(filteredScalars))
 
     for scalar in scalars {
@@ -286,7 +286,7 @@ func parseMappings(
 ) {
   for line in data.split(separator: "\n") {
     let components = line.split(separator: ";", omittingEmptySubsequences: false)
-    
+
     let scalarStr = components[0]
     guard let scalar = Unicode.Scalar(UInt32(scalarStr, radix: 16)!) else {
       continue
@@ -303,7 +303,7 @@ func parseMappings(
 
       result[scalar, default: [:]]["lower"] = mapping
     }
-    
+
     if let title = UInt32(components[14], radix: 16) {
       let mapping = String(Unicode.Scalar(title)!)
 
@@ -320,27 +320,27 @@ func parseSpecialMappings(
     guard !line.hasPrefix("#") else {
       continue
     }
-    
+
     let components = line.split(separator: ";", omittingEmptySubsequences: false)
-    
+
     // Conditional mappings have an extra component with the conditional name.
     // Ignore those.
     guard components.count == 5 else {
       continue
     }
-    
+
     guard let scalar = Unicode.Scalar(UInt32(components[0], radix: 16)!) else {
       continue
     }
-    
+
     let lower = components[1].split(separator: " ").map {
       Character(Unicode.Scalar(UInt32($0, radix: 16)!)!)
     }
-    
+
     let title = components[2].split(separator: " ").map {
       Character(Unicode.Scalar(UInt32($0, radix: 16)!)!)
     }
-    
+
     let upper = components[3].split(separator: " ").map {
       Character(Unicode.Scalar(UInt32($0, radix: 16)!)!)
     }
@@ -369,7 +369,7 @@ public let mappings: [Unicode.Scalar: [String: String]] = {
   #else
   let unicodeData = readInputFile("UnicodeData.txt")
   #endif
-  
+
   let specialCasing = readInputFile("SpecialCasing.txt")
 
   parseMappings(unicodeData, into: &result)
@@ -651,22 +651,22 @@ func parseCaseFoldings(
     guard !line.hasPrefix("#") else {
       continue
     }
-    
+
     let components = line.split(separator: ";")
-    
+
     let status = components[1].filter { !$0.isWhitespace }
-    
+
     // We only care about Common and Full case mappings.
     guard status == "C" || status == "F" else {
       continue
     }
-    
+
     let scalar = Unicode.Scalar(parseScalars(String(components[0])).lowerBound)!
-    
+
     let mapping = components[2].split(separator: " ").map {
       Unicode.Scalar(UInt32($0, radix: 16)!)!
     }
-    
+
     var mappingString = ""
 
     for scalar in mapping {
@@ -710,6 +710,7 @@ extension Unicode {
     case bassaVah = "Bassa_Vah"
     case batak = "Batak"
     case bengali = "Bengali"
+    case beriaErfe = "Beria_Erfe"
     case bhaiksuki = "Bhaiksuki"
     case bopomofo = "Bopomofo"
     case brahmi = "Brahmi"
@@ -835,6 +836,7 @@ extension Unicode {
     case sharada = "Sharada"
     case shavian = "Shavian"
     case siddham = "Siddham"
+    case sidetic = "Sidetic"
     case signWriting = "SignWriting"
     case sinhala = "Sinhala"
     case sogdian = "Sogdian"
@@ -849,6 +851,7 @@ extension Unicode {
     case taiLe = "Tai_Le"
     case taiTham = "Tai_Tham"
     case taiViet = "Tai_Viet"
+    case taiYo = "Tai_Yo"
     case takri = "Takri"
     case tamil = "Tamil"
     case tangsa = "Tangsa"
@@ -860,6 +863,7 @@ extension Unicode {
     case tifinagh = "Tifinagh"
     case tirhuta = "Tirhuta"
     case todhri = "Todhri"
+    case tolongSiki = "Tolong_Siki"
     case toto = "Toto"
     case tuluTigalari = "Tulu_Tigalari"
     case ugaritic = "Ugaritic"
@@ -922,6 +926,7 @@ func classifyScriptProperty(
     case "bass", "bassavah":              return .bassaVah
     case "batk", "batak":                 return .batak
     case "beng", "bengali":               return .bengali
+    case "berf", "beriaerfe":             return .beriaErfe
     case "bhks", "bhaiksuki":             return .bhaiksuki
     case "bopo", "bopomofo":              return .bopomofo
     case "brah", "brahmi":                return .brahmi
@@ -1041,6 +1046,7 @@ func classifyScriptProperty(
     case "shaw", "shavian":               return .shavian
     case "shrd", "sharada":               return .sharada
     case "sidd", "siddham":               return .siddham
+    case "sidt", "sidetic":               return .sidetic
     case "sind", "khudawadi":             return .khudawadi
     case "sinh", "sinhala":               return .sinhala
     case "sogd", "sogdian":               return .sogdian
@@ -1058,6 +1064,7 @@ func classifyScriptProperty(
     case "taml", "tamil":                 return .tamil
     case "tang", "tangut":                return .tangut
     case "tavt", "taiviet":               return .taiViet
+    case "tayo", "taiyo":                 return .taiYo
     case "telu", "telugu":                return .telugu
     case "tfng", "tifinagh":              return .tifinagh
     case "tglg", "tagalog":               return .tagalog
@@ -1067,6 +1074,7 @@ func classifyScriptProperty(
     case "tirh", "tirhuta":               return .tirhuta
     case "tnsa", "tangsa":                return .tangsa
     case "todr", "todhri":                return .todhri
+    case "tols", "tolongsiki":            return .tolongSiki
     case "toto":                          return .toto
     case "tutg", "tulutigalari":          return .tuluTigalari
     case "ugar", "ugaritic":              return .ugaritic

@@ -42,7 +42,7 @@ func parseWordBreakTests(
 
         // If this is a break, record the +1 count. Otherwise it is × which is
         // not a break.
-        if components[i] == "÷" {
+        if components[i].hasPrefix("÷") {
           words.append("")
         }
       }

@@ -221,7 +221,7 @@ extension _StringGuts {
         return 1
       }
     }
-    
+
     return _opaqueComplexCharacterStride(startingAt: i)
   }
 
@@ -733,12 +733,12 @@ extension _GraphemeBreakingState {
     }
 
     let x = Unicode._GraphemeBreakProperty(from: scalar1)
-    
+
     // GB4 handled here because we don't need to know `y` for this case
     if x == .control {
       return true
     }
-    
+
     // This variable and the defer statement help toggle the isInEmojiSequence
     // state variable to false after every decision of 'shouldBreak'. If we
     // happen to see a rhs .extend or .zwj, then it's a signal that we should
@@ -752,7 +752,7 @@ extension _GraphemeBreakingState {
       isInEmojiSequence = enterEmojiSequence
       isInIndicSequence = enterIndicSequence
     }
-    
+
     let y = Unicode._GraphemeBreakProperty(from: scalar2)
 
     switch (x, y) {
@@ -800,7 +800,7 @@ extension _GraphemeBreakingState {
       // sequence; the sequence continues through subsequent extend/extend and
       // extend/zwj pairs.
       if (
-        x == .extendedPictographic || (isInEmojiSequence && x == .extend)
+        scalar1._isExtendedPictographic || (isInEmojiSequence && x == .extend)
       ) {
         enterEmojiSequence = true
       }
@@ -859,7 +859,7 @@ extension _GraphemeBreakingState {
       return false
 
     // GB11
-    case (.zwj, .extendedPictographic):
+    case (.zwj, _) where scalar2._isExtendedPictographic:
       return !isInEmojiSequence
 
     // GB12 & GB13
@@ -952,7 +952,7 @@ fileprivate func _shouldBreakWithLookback(
     return false
 
   // GB11
-  case (.zwj, .extendedPictographic):
+  case (.zwj, _) where scalar2._isExtendedPictographic:
     return !_checkIfInEmojiSequence(at: index, with: previousScalar)
 
   // GB12 & GB13
@@ -1030,14 +1030,11 @@ fileprivate func _checkIfInEmojiSequence(
     i = prev.start
     let gbp = Unicode._GraphemeBreakProperty(from: prev.scalar)
 
-    switch gbp {
-    case .extend:
+    if gbp == .extend {
       continue
-    case .extendedPictographic:
-      return true
-    default:
-      return false
     }
+
+    return prev.scalar._isExtendedPictographic
   }
   return false
 }

@@ -265,11 +265,6 @@ extension Unicode._WordRecognizer {
     }
 
     switch (_prevCategory, nextCategory) {
-    case (.any, .any): // WB999
-      // Fast path: If we know our scalars have no properties then the decision
-      // is trivial and we don't need to crawl to the default statement.
-      return _accept()
-
     case (.newlineCRLF, _), // WB3a
          (_, .newlineCRLF): // WB3b
       if _prevScalar.value == 0xD, nextScalar.value == 0xA { // WB3
@@ -278,8 +273,10 @@ extension Unicode._WordRecognizer {
       }
       return _accept()
 
-    case (.zwj, .extendedPictographic), // WB3c
-         (.wSegSpace, .wSegSpace): // WB3d
+    case (.wSegSpace, .wSegSpace): // WB3d
+      return _reject()
+
+    case (.zwj, _) where nextScalar._isExtendedPictographic: // WB3c
       return _reject()
 
     case (_, .format), // WB4
@@ -363,6 +360,9 @@ extension Unicode._WordRecognizer {
       }
       return (setCandidate: false, breakAtCandidate: false, breakHere: breakHere)
 
+    case (.any, .any): // WB999
+      return _accept()
+
     default: // WB999
       return _accept()
     }
@@ -513,22 +513,21 @@ extension Unicode._RandomAccessWordRecognizer {
     }
 
     switch (prevCategory, _nextCategory) {
-    case (.any, .any): // WB999 shortcut
-      return _accept()
-
     case (.newlineCRLF, _), // WB3a
          (_, .newlineCRLF): // WB3b
       if previousScalar.value == 0xD, _nextScalar.value == 0xA { // WB3
         return _reject()
       }
       return _accept()
 
-    case (.zwj, .extendedPictographic), // WB3c
-         (.wSegSpace, .wSegSpace): // WB3d
+    case (.wSegSpace, .wSegSpace): // WB3d
       newBase = _baseCategory
       newState = _state
       return _reject()
 
+    case (.zwj, _) where _nextScalar._isExtendedPictographic: // WB3c
+      return _reject()
+
     case (.format, _), // WB4
          (.extend, _),
          (.zwj, _):
@@ -625,6 +624,9 @@ extension Unicode._RandomAccessWordRecognizer {
       newState = .initial
       return _reject()
 
+    case (.any, .any): // WB999
+      return _accept()
+
     default:
       if
         !_hasPendingCandidate,