Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
#if _runtime(_ObjC)
import Foundation

// Cache of opened files
// Cache of opened files
var cachedFiles: [String: String] = [:]

func readInputFile(_ filename: String) -> String {
Expand Down Expand Up @@ -128,10 +128,10 @@ func parseBinaryProperties(

let info = line.split(separator: "#")
let components = info[0].split(separator: ";")

// Get the property first because we may not care about it.
let filteredProperty = components[1].filter { !$0.isWhitespace }

guard availableBinaryProperties.contains(filteredProperty) else {
continue
}
Expand Down Expand Up @@ -187,12 +187,12 @@ func parseNumericTypes(
guard !line.hasPrefix("#") else {
continue
}

let info = line.split(separator: "#")
let components = info[0].split(separator: ";")

let filteredProperty = components[1].filter { !$0.isWhitespace }

let numericType: Unicode.NumericType

switch filteredProperty {
Expand All @@ -205,9 +205,9 @@ func parseNumericTypes(
default:
continue
}

let filteredScalars = components[0].filter { !$0.isWhitespace }

let scalars = parseScalars(String(filteredScalars))

for scalar in scalars {
Expand All @@ -225,12 +225,12 @@ func parseNumericValues(
guard !line.hasPrefix("#") else {
continue
}

let info = line.split(separator: "#")
let components = info[0].split(separator: ";")

let filteredProperty = components[3].filter { !$0.isWhitespace }

let value: Double

// If we have a division, split the numerator and denominator and perform
Expand All @@ -247,7 +247,7 @@ func parseNumericValues(
}

let filteredScalars = components[0].filter { !$0.isWhitespace }

let scalars = parseScalars(String(filteredScalars))

for scalar in scalars {
Expand Down Expand Up @@ -286,7 +286,7 @@ func parseMappings(
) {
for line in data.split(separator: "\n") {
let components = line.split(separator: ";", omittingEmptySubsequences: false)

let scalarStr = components[0]
guard let scalar = Unicode.Scalar(UInt32(scalarStr, radix: 16)!) else {
continue
Expand All @@ -303,7 +303,7 @@ func parseMappings(

result[scalar, default: [:]]["lower"] = mapping
}

if let title = UInt32(components[14], radix: 16) {
let mapping = String(Unicode.Scalar(title)!)

Expand All @@ -320,27 +320,27 @@ func parseSpecialMappings(
guard !line.hasPrefix("#") else {
continue
}

let components = line.split(separator: ";", omittingEmptySubsequences: false)

// Conditional mappings have an extra component with the conditional name.
// Ignore those.
guard components.count == 5 else {
continue
}

guard let scalar = Unicode.Scalar(UInt32(components[0], radix: 16)!) else {
continue
}

let lower = components[1].split(separator: " ").map {
Character(Unicode.Scalar(UInt32($0, radix: 16)!)!)
}

let title = components[2].split(separator: " ").map {
Character(Unicode.Scalar(UInt32($0, radix: 16)!)!)
}

let upper = components[3].split(separator: " ").map {
Character(Unicode.Scalar(UInt32($0, radix: 16)!)!)
}
Expand Down Expand Up @@ -369,7 +369,7 @@ public let mappings: [Unicode.Scalar: [String: String]] = {
#else
let unicodeData = readInputFile("UnicodeData.txt")
#endif

let specialCasing = readInputFile("SpecialCasing.txt")

parseMappings(unicodeData, into: &result)
Expand Down Expand Up @@ -651,22 +651,22 @@ func parseCaseFoldings(
guard !line.hasPrefix("#") else {
continue
}

let components = line.split(separator: ";")

let status = components[1].filter { !$0.isWhitespace }

// We only care about Common and Full case mappings.
guard status == "C" || status == "F" else {
continue
}

let scalar = Unicode.Scalar(parseScalars(String(components[0])).lowerBound)!

let mapping = components[2].split(separator: " ").map {
Unicode.Scalar(UInt32($0, radix: 16)!)!
}

var mappingString = ""

for scalar in mapping {
Expand Down Expand Up @@ -710,6 +710,7 @@ extension Unicode {
case bassaVah = "Bassa_Vah"
case batak = "Batak"
case bengali = "Bengali"
case beriaErfe = "Beria_Erfe"
case bhaiksuki = "Bhaiksuki"
case bopomofo = "Bopomofo"
case brahmi = "Brahmi"
Expand Down Expand Up @@ -835,6 +836,7 @@ extension Unicode {
case sharada = "Sharada"
case shavian = "Shavian"
case siddham = "Siddham"
case sidetic = "Sidetic"
case signWriting = "SignWriting"
case sinhala = "Sinhala"
case sogdian = "Sogdian"
Expand All @@ -849,6 +851,7 @@ extension Unicode {
case taiLe = "Tai_Le"
case taiTham = "Tai_Tham"
case taiViet = "Tai_Viet"
case taiYo = "Tai_Yo"
case takri = "Takri"
case tamil = "Tamil"
case tangsa = "Tangsa"
Expand All @@ -860,6 +863,7 @@ extension Unicode {
case tifinagh = "Tifinagh"
case tirhuta = "Tirhuta"
case todhri = "Todhri"
case tolongSiki = "Tolong_Siki"
case toto = "Toto"
case tuluTigalari = "Tulu_Tigalari"
case ugaritic = "Ugaritic"
Expand Down Expand Up @@ -922,6 +926,7 @@ func classifyScriptProperty(
case "bass", "bassavah": return .bassaVah
case "batk", "batak": return .batak
case "beng", "bengali": return .bengali
case "berf", "beriaerfe": return .beriaErfe
case "bhks", "bhaiksuki": return .bhaiksuki
case "bopo", "bopomofo": return .bopomofo
case "brah", "brahmi": return .brahmi
Expand Down Expand Up @@ -1041,6 +1046,7 @@ func classifyScriptProperty(
case "shaw", "shavian": return .shavian
case "shrd", "sharada": return .sharada
case "sidd", "siddham": return .siddham
case "sidt", "sidetic": return .sidetic
case "sind", "khudawadi": return .khudawadi
case "sinh", "sinhala": return .sinhala
case "sogd", "sogdian": return .sogdian
Expand All @@ -1058,6 +1064,7 @@ func classifyScriptProperty(
case "taml", "tamil": return .tamil
case "tang", "tangut": return .tangut
case "tavt", "taiviet": return .taiViet
case "tayo", "taiyo": return .taiYo
case "telu", "telugu": return .telugu
case "tfng", "tifinagh": return .tifinagh
case "tglg", "tagalog": return .tagalog
Expand All @@ -1067,6 +1074,7 @@ func classifyScriptProperty(
case "tirh", "tirhuta": return .tirhuta
case "tnsa", "tangsa": return .tangsa
case "todr", "todhri": return .todhri
case "tols", "tolongsiki": return .tolongSiki
case "toto": return .toto
case "tutg", "tulutigalari": return .tuluTigalari
case "ugar", "ugaritic": return .ugaritic
Expand Down
2 changes: 1 addition & 1 deletion stdlib/private/StdlibUnicodeUnittest/WordBreaking.swift
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ func parseWordBreakTests(

// If this is a break, record the +1 count. Otherwise it is × which is
// not a break.
if components[i] == "÷" {
if components[i].hasPrefix("÷") {
words.append("")
}
}
Expand Down
23 changes: 10 additions & 13 deletions stdlib/public/core/StringGraphemeBreaking.swift
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,7 @@ extension _StringGuts {
return 1
}
}

return _opaqueComplexCharacterStride(startingAt: i)
}

Expand Down Expand Up @@ -733,12 +733,12 @@ extension _GraphemeBreakingState {
}

let x = Unicode._GraphemeBreakProperty(from: scalar1)

// GB4 handled here because we don't need to know `y` for this case
if x == .control {
return true
}

// This variable and the defer statement help toggle the isInEmojiSequence
// state variable to false after every decision of 'shouldBreak'. If we
// happen to see a rhs .extend or .zwj, then it's a signal that we should
Expand All @@ -752,7 +752,7 @@ extension _GraphemeBreakingState {
isInEmojiSequence = enterEmojiSequence
isInIndicSequence = enterIndicSequence
}

let y = Unicode._GraphemeBreakProperty(from: scalar2)

switch (x, y) {
Expand Down Expand Up @@ -800,7 +800,7 @@ extension _GraphemeBreakingState {
// sequence; the sequence continues through subsequent extend/extend and
// extend/zwj pairs.
if (
x == .extendedPictographic || (isInEmojiSequence && x == .extend)
scalar1._isExtendedPictographic || (isInEmojiSequence && x == .extend)
) {
enterEmojiSequence = true
}
Expand Down Expand Up @@ -859,7 +859,7 @@ extension _GraphemeBreakingState {
return false

// GB11
case (.zwj, .extendedPictographic):
case (.zwj, _) where scalar2._isExtendedPictographic:
return !isInEmojiSequence

// GB12 & GB13
Expand Down Expand Up @@ -952,7 +952,7 @@ fileprivate func _shouldBreakWithLookback(
return false

// GB11
case (.zwj, .extendedPictographic):
case (.zwj, _) where scalar2._isExtendedPictographic:
return !_checkIfInEmojiSequence(at: index, with: previousScalar)

// GB12 & GB13
Expand Down Expand Up @@ -1030,14 +1030,11 @@ fileprivate func _checkIfInEmojiSequence(
i = prev.start
let gbp = Unicode._GraphemeBreakProperty(from: prev.scalar)

switch gbp {
case .extend:
if gbp == .extend {
continue
case .extendedPictographic:
return true
default:
return false
}

return prev.scalar._isExtendedPictographic
}
return false
}
Expand Down
26 changes: 14 additions & 12 deletions stdlib/public/core/StringWordBreaking.swift
Original file line number Diff line number Diff line change
Expand Up @@ -265,11 +265,6 @@ extension Unicode._WordRecognizer {
}

switch (_prevCategory, nextCategory) {
case (.any, .any): // WB999
// Fast path: If we know our scalars have no properties then the decision
// is trivial and we don't need to crawl to the default statement.
return _accept()

case (.newlineCRLF, _), // WB3a
(_, .newlineCRLF): // WB3b
if _prevScalar.value == 0xD, nextScalar.value == 0xA { // WB3
Expand All @@ -278,8 +273,10 @@ extension Unicode._WordRecognizer {
}
return _accept()

case (.zwj, .extendedPictographic), // WB3c
(.wSegSpace, .wSegSpace): // WB3d
case (.wSegSpace, .wSegSpace): // WB3d
return _reject()

case (.zwj, _) where nextScalar._isExtendedPictographic: // WB3c
return _reject()

case (_, .format), // WB4
Expand Down Expand Up @@ -363,6 +360,9 @@ extension Unicode._WordRecognizer {
}
return (setCandidate: false, breakAtCandidate: false, breakHere: breakHere)

case (.any, .any): // WB999
return _accept()

default: // WB999
return _accept()
}
Expand Down Expand Up @@ -513,22 +513,21 @@ extension Unicode._RandomAccessWordRecognizer {
}

switch (prevCategory, _nextCategory) {
case (.any, .any): // WB999 shortcut
return _accept()

case (.newlineCRLF, _), // WB3a
(_, .newlineCRLF): // WB3b
if previousScalar.value == 0xD, _nextScalar.value == 0xA { // WB3
return _reject()
}
return _accept()

case (.zwj, .extendedPictographic), // WB3c
(.wSegSpace, .wSegSpace): // WB3d
case (.wSegSpace, .wSegSpace): // WB3d
newBase = _baseCategory
newState = _state
return _reject()

case (.zwj, _) where _nextScalar._isExtendedPictographic: // WB3c
return _reject()

case (.format, _), // WB4
(.extend, _),
(.zwj, _):
Expand Down Expand Up @@ -625,6 +624,9 @@ extension Unicode._RandomAccessWordRecognizer {
newState = .initial
return _reject()

case (.any, .any): // WB999
return _accept()

default:
if
!_hasPendingCandidate,
Expand Down
Loading