Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .prettierrc
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{
"trailingComma": "es5"
}
8 changes: 6 additions & 2 deletions components/language-chooser/common/find-language/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -194,11 +194,15 @@ If you modify [langtagProcessing.ts](./langtagProcessing.ts), run `npm run find-

find-language searches languages included in the ISO-639-3 standard; every result returned will have a unique ISO-639-3 code. The entries listed in our source database, langtags.json, are combinations of languages, scripts, regions, and/or variants. [langtagProcessing.ts](./langtagProcessing.ts) consolidates these entries by their ISO-639-3 code and saves the result to [languageData.json](language-data/languageData.json) for searching. For example, langtags.json has separate entries for Abhaz with Cyrillic script, Abhaz with Georgian script, and Abhaz with Latin script. langtagProcessing.ts will combine these into a single entry which lists all three possible scripts and has the superset of the names, regions, etc. of the three entries from langtags.json. This way the search results will contain at most one entry for the language Abhaz.

[langtags.txt](https://github.com/silnrsi/langtags/blob/master/doc/tagging.md#langtagstxt) lists equivalent language tags, and langtagProcessing.ts reformats it into [equivalentTags.json](language-data/equivalentTags.json) which we use for mapping language tags to their shortest and maximal equivalents.

**Note: In this package we use individual language tags instead of the macrolanguage tags for individual languages, even if it is common/"canonical" to use a macrolanguage tag for that individual language. See [macrolanguageNotes.md](macrolanguageNotes.md) for details. So in [languageData.json](language-data/languageData.json), the `languageSubtag` field will always be a specifically individual language code for individual languages.** (There are a few exceptional cases, see [macrolanguageNotes.md](macrolanguageNotes.md).) **However, equivalentTags.json generally contains tags in their "canonical" form.** Use utilities in [languageTagUtils.ts](./languageTagUtils.ts) to convert between "canonical" and specifically individual language tags.

#### Language tag shortening

The [createTag](./languageTagUtils.ts) function in this package will return the shortest (and thus preferred) tag for a given language/script/region/dialect combination. For example, given language code "emm" (Mamulique), script code "Latn" (Latin) and region code "MX" (Mexico), `createTag` will return "emm" because it is the preferred equivalent tag for emm-Latn-MX.
The [createTagFromOrthography](./languageTagUtils.ts) function in this package will return the shortest (and thus preferred) tag for a given language/script/region/dialect combination. For example, given language code "emm" (Mamulique), script code "Latn" (Latin) and region code "MX" (Mexico), `createTagFromOrthography` will return "emm" because it is the preferred equivalent tag for emm-Latn-MX.

[langtags.txt](https://github.com/silnrsi/langtags/blob/master/doc/tagging.md#langtagstxt) lists equivalent language tags. langtagProcessing.ts reformats it into [equivalentTags.json](language-data/equivalentTags.json) which we use for mapping language tags to their shortest and maximal equivalents.
For languages that are representative for a macrolanguage, however, we use the individual language's ISO 639-3 code as the language subtag even if the macrolanguage code is considered the preferred equivalent tag for this language. See [macrolanguageNotes.md](macrolanguageNotes.md) for further explanation.

## Data sources

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ export interface ILanguage {
autonym?: string;
exonym: string;
iso639_3_code: string; // ISO 639-3 code
languageSubtag: string; // BCP-47 canonical code
languageSubtag: string; // BCP-47 canonical code, except for representative languages where the canonical code is actually the macrolanguage code, in which case we use a specifically individual language code instead
regionNamesForDisplay: string; // For macrolanguages, we display a region but don't want the macrolanguage to come up in searches for that region
regionNamesForSearch: string[];
names: string[];
Expand Down

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -519,3 +519,21 @@ describe("other language object types", async () => {
expect(allAsyncResults[1].id).toEqual("second_eng_result");
});
});

// alternativeTags[0] is expected to be a canonical tag for some form of the language, so the
// language subtag at its front is the canonical one used by ensureLangSubtagIsCanonicalForReps.
describe("canonical tag is first in alternativeTags", () => {
  it("should have the canonical tag first in alternativeTags", () => {
    // Each case pairs an individual language subtag with the canonical subtag expected at the
    // front of that language's first alternative tag.
    const cases = [
      { individual: "uzn", canonical: "uz" },
      { individual: "ave", canonical: "ae" },
      { individual: "ojg", canonical: "oj" },
    ];

    for (const { individual, canonical } of cases) {
      const language = getLanguageBySubtag(individual);
      expect(language).toBeDefined();
      expect(language?.alternativeTags[0].split("-")[0]).toBe(canonical);
    }
  });
});
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ import {
createTag,
createTagFromOrthography,
defaultRegionForLangTag,
ensureLangSubtagIsIndivForReps,
ensureLangSubtagIsCanonicalForReps,
formatDialectCode,
getMaximalLangtag,
getShortestSufficientLangtag,
Expand All @@ -20,6 +22,21 @@ import {
ICustomizableLanguageDetails,
} from "./findLanguageInterfaces";
import { getRegionBySubtag } from "./regionsAndScripts";
import {
NORTHERN_UZBEK_LANGUAGE,
STANDARD_ARABIC_LANGUAGE,
ENGLISH_LANGUAGE,
NORWEGIAN_MACROLANGUAGE,
SERBO_CROATIAN_MACROLANGUAGE,
NORWEGIAN_BOKMAL_LANGUAGE,
NORWEGIAN_NYNORSK_LANGUAGE,
BOSNIAN_LANGUAGE,
MONTENEGRIN_LANGUAGE,
CROATIAN_LANGUAGE,
SERBIAN_LANGUAGE,
ARABIC_MACROLANGUAGE,
AYMARA_MACROLANGUAGE,
} from "./testUtils";

describe("Tag creation", () => {
it("should create the correct language tag for a language", () => {
Expand Down Expand Up @@ -56,10 +73,37 @@ describe("Tag creation", () => {
createTag({
dialectCode: "foobar",
scriptCode: "Latn",
regionCode: "US"
regionCode: "US",
})
).toEqual("qaa-Latn-US-x-foobar");
});

it("should shorten tags when appropriate", () => {
expect(
createTag({
languageCode: "en",
scriptCode: "Latn",
regionCode: "US",
})
).toEqual("en");
});

it("should shorten tags even when the language code is individual rather than canonical for a representative language", () => {
// uzn is the individual code for Northern Uzbek, which is the representative for the uz macrolanguage
expect(
createTag(
{
languageCode: "uzn",
scriptCode: "Latn",
regionCode: "UZ",
},
NORTHERN_UZBEK_LANGUAGE
)
// The canonical short tag for Northern Uzbek, Latin script, Uzbekistan is `uz`. Currently, createTag is a naive
// helper function which will return `uz`, and then we need to call ensureLangSubtagIsIndivForReps to transform it
// to `uzn`.
).toEqual("uz");
});
});

describe("get shortest equivalent version of langtag", () => {
Expand Down Expand Up @@ -145,6 +189,17 @@ describe("get shortest equivalent version of langtag", () => {
expect(getShortestSufficientLangtag("ta-Arab-PK")).toBeUndefined();
expect(getShortestSufficientLangtag("sr-Cyrl-RO")).toBeUndefined();
});

// Currently, getShortestSufficientLangtag will return canonical tags like `uz` and then we need to
// call ensureLangSubtagIsIndivForReps to make them use individual language subtags (like `uzn`) instead.
it("should handle representative languages with language parameter", () => {
expect(
getShortestSufficientLangtag("uzn-Latn-UZ", NORTHERN_UZBEK_LANGUAGE)
).toEqual("uz");
expect(
getShortestSufficientLangtag("uzn-Cyrl-x-foobar", NORTHERN_UZBEK_LANGUAGE)
).toEqual("uz-Cyrl-x-foobar");
});
});

describe("get maximal equivalent version of langtag", () => {
Expand All @@ -158,7 +213,7 @@ describe("get maximal equivalent version of langtag", () => {
createTag({
languageCode: "dtp",
regionCode: "MY",
scriptCode: "Latn"
scriptCode: "Latn",
})
)
).toEqual("dtp-Latn-MY");
Expand All @@ -181,6 +236,28 @@ describe("get maximal equivalent version of langtag", () => {
expect(getMaximalLangtag("")).toBeUndefined();
expect(getMaximalLangtag("frm-Cyrl")).toBeUndefined();
});

it("should handle representative languages with language parameter", () => {
// Currently, getMaximalLangtag will return canonical tags like `uz` and then we need to
// call ensureLangSubtagIsIndivForReps to make them use individual language subtags (like `uzn`) instead.
// But what we want to test is that getMaximalLangtag has found all the additional implied subtags
expect(getMaximalLangtag("uzn", NORTHERN_UZBEK_LANGUAGE)).toEqual(
"uz-Latn-UZ"
);
expect(getMaximalLangtag("uzn-Latn", NORTHERN_UZBEK_LANGUAGE)).toEqual(
"uz-Latn-UZ"
);
expect(getMaximalLangtag("uzn-Cyrl", NORTHERN_UZBEK_LANGUAGE)).toEqual(
"uz-Cyrl-UZ"
);
// ar (Standard Arabic) defaults to Arab script
expect(getMaximalLangtag("arb", STANDARD_ARABIC_LANGUAGE)).toEqual(
"ar-Arab-EG"
);
expect(getMaximalLangtag("arb-Arab", STANDARD_ARABIC_LANGUAGE)).toEqual(
"ar-Arab-EG"
);
});
});

describe("Tag parsing", () => {
Expand Down Expand Up @@ -238,6 +315,7 @@ describe("Tag parsing", () => {
).toEqual("Tibetan");

expect(parseLangtagFromLangChooser("ce")?.script?.name).toEqual("Cyrillic");
expect(parseLangtagFromLangChooser("uzn")?.script?.name).toEqual("Latin"); // even with an individual rather than canonical tag for a representative language
});

it("should put private use subtags into dialect field", () => {
Expand Down Expand Up @@ -329,6 +407,13 @@ describe("Tag parsing", () => {
parseLangtagFromLangChooser("en", modifier)?.language?.exonym
).toEqual(foobar);
});

it("should be able to find implied scripts even for representative langs with individual rather than canonical language subtags", () => {
// uzn is Northern Uzbek, representative for uz macrolanguage
const uznResult = parseLangtagFromLangChooser("uzn-x-foobar");
expect(uznResult?.language?.iso639_3_code).toEqual("uzn");
expect(uznResult?.script?.name).toEqual("Latin");
});
});

describe("defaultRegionForLangTag", () => {
Expand All @@ -350,6 +435,110 @@ describe("defaultRegionForLangTag", () => {
"Uzbekistan"
);
});

it("should handle representative languages with language parameter", () => {
// When passing uzn tag with language parameter, should use canonical uz for lookup
expect(
defaultRegionForLangTag("uzn", NORTHERN_UZBEK_LANGUAGE)?.name
).toEqual("Uzbekistan");
expect(
defaultRegionForLangTag("uzn-Sogd", NORTHERN_UZBEK_LANGUAGE)?.code
).toEqual("CN");
expect(
defaultRegionForLangTag("uzn-Latn-IN", NORTHERN_UZBEK_LANGUAGE)?.name
).toEqual("India");

expect(
defaultRegionForLangTag("arb", STANDARD_ARABIC_LANGUAGE)?.name
).toEqual("Egypt");
expect(
defaultRegionForLangTag("arb-Arab-x-foo", STANDARD_ARABIC_LANGUAGE)?.name
).toEqual("Egypt");
});
});

// Tests for ensureLangSubtagIsIndivForReps: per the expectations below, tags for non-representative
// languages come back unchanged, while a canonical macrolanguage subtag (e.g. `ar`, `uz`) at the front
// of a representative language's tag is swapped for the individual code (e.g. `arb`, `uzn`).
describe("ensureLangSubtagIsIndivForReps", () => {
  it("should return the original tag when not representative", () => {
    // Every case here expects the input tag back verbatim.
    const unchangedCases = [
      { tag: "en", language: undefined },
      { tag: "en", language: ENGLISH_LANGUAGE },
      { tag: "en-Latn-US-x-foo", language: ENGLISH_LANGUAGE },
    ];

    for (const { tag, language } of unchangedCases) {
      expect(ensureLangSubtagIsIndivForReps(tag, language)).toEqual(tag);
    }
  });

  it("should replace the canonical macrolang subtag for representative languages", () => {
    // Only the leading language subtag is expected to change; script, region and
    // private-use subtags must come through untouched.
    const replacedCases = [
      { tag: "ar", language: STANDARD_ARABIC_LANGUAGE, expected: "arb" },
      {
        tag: "ar-Arab-EG-x-foo",
        language: STANDARD_ARABIC_LANGUAGE,
        expected: "arb-Arab-EG-x-foo",
      },
      {
        tag: "uz-Latn-UZ",
        language: NORTHERN_UZBEK_LANGUAGE,
        expected: "uzn-Latn-UZ",
      },
    ];

    for (const { tag, language, expected } of replacedCases) {
      expect(ensureLangSubtagIsIndivForReps(tag, language)).toEqual(expected);
    }
  });
});

// Tests for ensureLangSubtagIsCanonicalForReps: per the expectations below, the leading language
// subtag of a tag is rewritten to the canonical code (e.g. `arb` -> `ar`, `uzn` -> `uz`) and all
// remaining subtags are carried over unchanged.
describe("ensureLangSubtagIsCanonicalForReps", () => {
  it("should return the languageSubtag for non-representative languages", () => {
    // A three-letter ISO 639-3 code for English is expected to come back as `en`.
    expect(ensureLangSubtagIsCanonicalForReps("eng", ENGLISH_LANGUAGE)).toEqual(
      "en"
    );
    // A tag that already starts with `en` is expected to come back unchanged.
    expect(
      ensureLangSubtagIsCanonicalForReps("en-Latn-US", ENGLISH_LANGUAGE)
    ).toEqual("en-Latn-US");
  });

  it("should replace first subtag with canonical macrolanguage code for representative languages", () => {
    expect(
      ensureLangSubtagIsCanonicalForReps("arb", STANDARD_ARABIC_LANGUAGE)
    ).toEqual("ar");
    expect(
      ensureLangSubtagIsCanonicalForReps(
        "arb-Arab-EG",
        STANDARD_ARABIC_LANGUAGE
      )
    ).toEqual("ar-Arab-EG");
    expect(
      ensureLangSubtagIsCanonicalForReps("uzn", NORTHERN_UZBEK_LANGUAGE)
    ).toEqual("uz");
    expect(
      ensureLangSubtagIsCanonicalForReps("uzn-Latn-UZ", NORTHERN_UZBEK_LANGUAGE)
    ).toEqual("uz-Latn-UZ");
  });

  it("should fall back to languageSubtag if alternativeTags is empty", () => {
    // Flagged as representative for a macrolanguage but with no alternative tags, so the
    // expected result is simply the language's own languageSubtag ("arb").
    const repLanguageNoAltTags = {
      iso639_3_code: "arb",
      languageSubtag: "arb",
      isRepresentativeForMacrolanguage: true,
      alternativeTags: [],
      isMacrolanguage: false,
      exonym: "Standard Arabic",
      scripts: [{ code: "Arab", name: "Arabic" }],
      regionNamesForDisplay: "",
      regionNamesForSearch: [],
      names: [],
      languageType: LanguageType.Living,
    } as ILanguage;
    expect(
      ensureLangSubtagIsCanonicalForReps("arb", repLanguageNoAltTags)
    ).toEqual("arb");
  });

  it("should preserve all other subtags when replacing first subtag", () => {
    // Multiple private-use sections after the region must survive the rewrite verbatim.
    expect(
      ensureLangSubtagIsCanonicalForReps(
        "arb-Arab-EG-x-foo-x-bar",
        STANDARD_ARABIC_LANGUAGE
      )
    ).toEqual("ar-Arab-EG-x-foo-x-bar");
  });
});

describe("createTagFromOrthography", () => {
Expand Down Expand Up @@ -440,6 +629,82 @@ describe("createTagFromOrthography", () => {
})
).toEqual("en-x-ai-newFancy");
});

it("should return indiv iso code when language is representative for a macrolanguage", () => {
expect(
createTagFromOrthography({ language: STANDARD_ARABIC_LANGUAGE })
).toEqual("arb");
expect(
createTagFromOrthography({
language: NORTHERN_UZBEK_LANGUAGE,
script: { code: "Cyrl", name: "Cyrillic" },
})
).toEqual("uzn-Cyrl");

expect(
createTagFromOrthography({
language: NORTHERN_UZBEK_LANGUAGE,
script: { code: "Latn", name: "Latin" },
})
).toEqual("uzn");

expect(
createTagFromOrthography({
language: STANDARD_ARABIC_LANGUAGE,
customDetails: {
region: { name: "Egypt", code: "EG" },
dialect: "foobar",
},
})
).toEqual("arb-x-foobar");
});

it("should create tags for macrolanguages with the preferred code", () => {
// Pure macrolanguages (without isRepresentativeForMacrolanguage) should use their language subtag
expect(
createTagFromOrthography({ language: ARABIC_MACROLANGUAGE })
).toEqual("ar");
expect(
createTagFromOrthography({ language: AYMARA_MACROLANGUAGE })
).toEqual("ay");
});

// See macrolanguageNotes.md regarding the special cases. Note, however, that our desired behavior for Akan (aka/ak) and
// Sanskrit (san/sa) is achieved by the search result modifier (searchResultModifier.ts).
it("should handle special cases", () => {
// Bosnian should give "bs"
expect(createTagFromOrthography({ language: BOSNIAN_LANGUAGE })).toEqual(
"bs"
);
// Montenegrin should give "cnr"
expect(
createTagFromOrthography({ language: MONTENEGRIN_LANGUAGE })
).toEqual("cnr");
// Croatian should give "hr"
expect(createTagFromOrthography({ language: CROATIAN_LANGUAGE })).toEqual(
"hr"
);
// Serbian should give "sr"
expect(createTagFromOrthography({ language: SERBIAN_LANGUAGE })).toEqual(
"sr"
);
// Serbo-Croatian macrolanguage should use "sh"
expect(
createTagFromOrthography({ language: SERBO_CROATIAN_MACROLANGUAGE })
).toEqual("sh");
// Norwegian Bokmål should use "nb"
expect(
createTagFromOrthography({ language: NORWEGIAN_BOKMAL_LANGUAGE })
).toEqual("nb");
// Norwegian Nynorsk should use "nn"
expect(
createTagFromOrthography({ language: NORWEGIAN_NYNORSK_LANGUAGE })
).toEqual("nn");
// Norwegian macrolanguage should use "no"
expect(
createTagFromOrthography({ language: NORWEGIAN_MACROLANGUAGE })
).toEqual("no");
});
});

describe("isValidBcp47Tag checking is sane", () => {
Expand Down
Loading