sillsdev · andrew-polk · Feb 13, 2026 · Feb 13, 2026
diff --git a/.prettierrc b/.prettierrc
@@ -0,0 +1,3 @@
+{
+  "trailingComma": "es5"
+}
diff --git a/components/language-chooser/common/find-language/README.md b/components/language-chooser/common/find-language/README.md
@@ -194,11 +194,15 @@ If you modify [langtagProcessing.ts](./langtagProcessing.ts), run `npm run find-
 
 find-language searches languages included in the ISO-639-3 standard; every result returned will have a unique ISO-639-3 code. The entries listed in our source database, langtags.json, are combinations of languages, scripts, regions, and/or variants. [langtagProcessing.ts](./langtagProcessing.ts) consolidates these entries by their ISO-639-3 code and saves the result to [languageData.json](language-data/languageData.json) for searching. For example, langtags.json has separate entries for Abhaz with Cyrillic script, Abhaz with Georgian script, and Abhaz with Latin script. langtagProcessing.ts will combine these into a single entry which lists all three possible scripts and has the superset of the names, regions, etc. of the three entries from langtags.json. This way the search results will contain at most one entry for the language Abhaz.
 
+[langtags.txt](https://github.com/silnrsi/langtags/blob/master/doc/tagging.md#langtagstxt) lists equivalent language tags, and langtagProcessing.ts reformats it into [equivalentTags.json](language-data/equivalentTags.json) which we use for mapping language tags to their shortest and maximal equivalents.
+
+**Note: In this package we use individual language tags instead of the macrolanguage tags for individual languages, even if it is common/"canonical" to use a macrolanguage tag for that individual language. See [macrolanguageNotes.md](macrolanguageNotes.md) for details. So in [languageData.json](language-data/languageData.json), the `languageSubtag` field will always be a specifically individual language code for individual languages.** (There are a few exceptional cases; see [macrolanguageNotes.md](macrolanguageNotes.md).) **However, equivalentTags.json generally contains tags in their "canonical" form.** Use utilities in [languageTagUtils.ts](./languageTagUtils.ts) to convert between "canonical" and specifically individual language tags.
+
 #### Language tag shortening
 
-The [createTag](./languageTagUtils.ts) function in this package will return the shortest (and thus preferred) tag for a given language/script/region/dialect combination. For example, given language code "emm" (Mamulique), script code "Latn" (Latin) and region code "MX" (Mexico), `createTag` will return "emm" because it is the preferred equivalent tag for emm-Latn-MX.
+The [createTagFromOrthography](./languageTagUtils.ts) function in this package will return the shortest (and thus preferred) tag for a given language/script/region/dialect combination. For example, given language code "emm" (Mamulique), script code "Latn" (Latin) and region code "MX" (Mexico), `createTagFromOrthography` will return "emm" because it is the preferred equivalent tag for emm-Latn-MX.
 
-[langtags.txt](https://github.com/silnrsi/langtags/blob/master/doc/tagging.md#langtagstxt) lists equivalent language tags. langtagProcessing.ts reformats it into [equivalentTags.json](language-data/equivalentTags.json) which we use for mapping language tags to their shortest and maximal equivalents.
+For languages that are representative for a macrolanguage, however, we use the individual language's ISO 639-3 code as the language subtag even if the macrolanguage code is considered the preferred equivalent tag for this language. See [macrolanguageNotes.md](macrolanguageNotes.md) for further explanation.
 
 ## Data sources
 

diff --git a/components/language-chooser/common/find-language/findLanguageInterfaces.ts b/components/language-chooser/common/find-language/findLanguageInterfaces.ts
@@ -24,7 +24,7 @@ export interface ILanguage {
   autonym?: string;
   exonym: string;
   iso639_3_code: string; // ISO 639-3 code
-  languageSubtag: string; // BCP-47 canonical code
+  languageSubtag: string; // BCP-47 canonical code, except for representative languages where the canonical code is actually the macrolanguage code, in which case we use a specifically individual language code instead
   regionNamesForDisplay: string; // For macrolanguages, we display a region but don't want the macrolanguage to come up in searches for that region
   regionNamesForSearch: string[];
   names: string[];

diff --git a/components/language-chooser/common/find-language/language-data/languageData.json b/components/language-chooser/common/find-language/language-data/languageData.json
diff --git a/components/language-chooser/common/find-language/languageSearch.spec.ts b/components/language-chooser/common/find-language/languageSearch.spec.ts
@@ -519,3 +519,21 @@ describe("other language object types", async () => {
     expect(allAsyncResults[1].id).toEqual("second_eng_result");
   });
 });
+
+// The first entry of alternativeTags should be a canonical tag for some form of the language, so its language subtag
+// (the part before the first "-") should be the canonical one that ensureLangSubtagIsCanonicalForReps relies on
+describe("canonical tag is first in alternativeTags", () => {
+  it("should have the canonical tag first in alternativeTags", () => {
+    const uzn = getLanguageBySubtag("uzn");
+    expect(uzn).toBeDefined();
+    expect(uzn?.alternativeTags[0].split("-")[0]).toBe("uz");
+
+    const ave = getLanguageBySubtag("ave");
+    expect(ave).toBeDefined();
+    expect(ave?.alternativeTags[0].split("-")[0]).toBe("ae");
+
+    const ojg = getLanguageBySubtag("ojg");
+    expect(ojg).toBeDefined();
+    expect(ojg?.alternativeTags[0].split("-")[0]).toBe("oj");
+  });
+});
diff --git a/components/language-chooser/common/find-language/languageTagUtils.spec.ts b/components/language-chooser/common/find-language/languageTagUtils.spec.ts
@@ -3,6 +3,8 @@ import {
   createTag,
   createTagFromOrthography,
   defaultRegionForLangTag,
+  ensureLangSubtagIsIndivForReps,
+  ensureLangSubtagIsCanonicalForReps,
   formatDialectCode,
   getMaximalLangtag,
   getShortestSufficientLangtag,
@@ -20,6 +22,21 @@ import {
   ICustomizableLanguageDetails,
 } from "./findLanguageInterfaces";
 import { getRegionBySubtag } from "./regionsAndScripts";
+import {
+  NORTHERN_UZBEK_LANGUAGE,
+  STANDARD_ARABIC_LANGUAGE,
+  ENGLISH_LANGUAGE,
+  NORWEGIAN_MACROLANGUAGE,
+  SERBO_CROATIAN_MACROLANGUAGE,
+  NORWEGIAN_BOKMAL_LANGUAGE,
+  NORWEGIAN_NYNORSK_LANGUAGE,
+  BOSNIAN_LANGUAGE,
+  MONTENEGRIN_LANGUAGE,
+  CROATIAN_LANGUAGE,
+  SERBIAN_LANGUAGE,
+  ARABIC_MACROLANGUAGE,
+  AYMARA_MACROLANGUAGE,
+} from "./testUtils";
 
 describe("Tag creation", () => {
   it("should create the correct language tag for a language", () => {
@@ -56,10 +73,37 @@ describe("Tag creation", () => {
       createTag({
         dialectCode: "foobar",
         scriptCode: "Latn",
-        regionCode: "US"
+        regionCode: "US",
       })
     ).toEqual("qaa-Latn-US-x-foobar");
   });
+
+  it("should shorten tags when appropriate", () => {
+    expect(
+      createTag({
+        languageCode: "en",
+        scriptCode: "Latn",
+        regionCode: "US",
+      })
+    ).toEqual("en");
+  });
+
+  it("should shorten tags even when the language code is individual rather than canonical for a representative language", () => {
+    // uzn is the individual code for Northern Uzbek, which is the representative for the uz macrolanguage
+    expect(
+      createTag(
+        {
+          languageCode: "uzn",
+          scriptCode: "Latn",
+          regionCode: "UZ",
+        },
+        NORTHERN_UZBEK_LANGUAGE
+      )
+      // The canonical short tag for Northern Uzbek, Latin script, Uzbekistan is uz. Currently, createTag is a naive
+      // helper function which will return `uz` and then we need to call ensureLangSubtagIsIndivForReps to transform it
+      // to `uzn`
+    ).toEqual("uz");
+  });
 });
 
 describe("get shortest equivalent version of langtag", () => {
@@ -145,6 +189,17 @@ describe("get shortest equivalent version of langtag", () => {
     expect(getShortestSufficientLangtag("ta-Arab-PK")).toBeUndefined();
     expect(getShortestSufficientLangtag("sr-Cyrl-RO")).toBeUndefined();
   });
+
+  // Currently, getShortestSufficientLangtag will return canonical tags like `uz` and then we need to
+  // call ensureLangSubtagIsIndivForReps to make them use individual language subtags (like `uzn`) instead.
+  it("should handle representative languages with language parameter", () => {
+    expect(
+      getShortestSufficientLangtag("uzn-Latn-UZ", NORTHERN_UZBEK_LANGUAGE)
+    ).toEqual("uz");
+    expect(
+      getShortestSufficientLangtag("uzn-Cyrl-x-foobar", NORTHERN_UZBEK_LANGUAGE)
+    ).toEqual("uz-Cyrl-x-foobar");
+  });
 });
 
 describe("get maximal equivalent version of langtag", () => {
@@ -158,7 +213,7 @@ describe("get maximal equivalent version of langtag", () => {
         createTag({
           languageCode: "dtp",
           regionCode: "MY",
-          scriptCode: "Latn"
+          scriptCode: "Latn",
         })
       )
     ).toEqual("dtp-Latn-MY");
@@ -181,6 +236,28 @@ describe("get maximal equivalent version of langtag", () => {
     expect(getMaximalLangtag("")).toBeUndefined();
     expect(getMaximalLangtag("frm-Cyrl")).toBeUndefined();
   });
+
+  it("should handle representative languages with language parameter", () => {
+    // Currently, getMaximalLangtag will return canonical tags like `uz` and then we need to
+    // call ensureLangSubtagIsIndivForReps to make them use individual language subtags (like `uzn`) instead.
+    // But what we want to test is that getMaximalLangtag has found all the additional implied subtags
+    expect(getMaximalLangtag("uzn", NORTHERN_UZBEK_LANGUAGE)).toEqual(
+      "uz-Latn-UZ"
+    );
+    expect(getMaximalLangtag("uzn-Latn", NORTHERN_UZBEK_LANGUAGE)).toEqual(
+      "uz-Latn-UZ"
+    );
+    expect(getMaximalLangtag("uzn-Cyrl", NORTHERN_UZBEK_LANGUAGE)).toEqual(
+      "uz-Cyrl-UZ"
+    );
+    // ar (Standard Arabic) defaults to Arab script
+    expect(getMaximalLangtag("arb", STANDARD_ARABIC_LANGUAGE)).toEqual(
+      "ar-Arab-EG"
+    );
+    expect(getMaximalLangtag("arb-Arab", STANDARD_ARABIC_LANGUAGE)).toEqual(
+      "ar-Arab-EG"
+    );
+  });
 });
 
 describe("Tag parsing", () => {
@@ -238,6 +315,7 @@ describe("Tag parsing", () => {
     ).toEqual("Tibetan");
 
     expect(parseLangtagFromLangChooser("ce")?.script?.name).toEqual("Cyrillic");
+    expect(parseLangtagFromLangChooser("uzn")?.script?.name).toEqual("Latin"); // even with an individual rather than canonical tag for a representative language
   });
 
   it("should put private use subtags into dialect field", () => {
@@ -329,6 +407,13 @@ describe("Tag parsing", () => {
       parseLangtagFromLangChooser("en", modifier)?.language?.exonym
     ).toEqual(foobar);
   });
+
+  it("should be able to find implied scripts even for representative langs with individual rather than canonical language subtags", () => {
+    // uzn is Northern Uzbek, representative for uz macrolanguage
+    const uznResult = parseLangtagFromLangChooser("uzn-x-foobar");
+    expect(uznResult?.language?.iso639_3_code).toEqual("uzn");
+    expect(uznResult?.script?.name).toEqual("Latin");
+  });
 });
 
 describe("defaultRegionForLangTag", () => {
@@ -350,6 +435,110 @@ describe("defaultRegionForLangTag", () => {
       "Uzbekistan"
     );
   });
+
+  it("should handle representative languages with language parameter", () => {
+    // When passing uzn tag with language parameter, should use canonical uz for lookup
+    expect(
+      defaultRegionForLangTag("uzn", NORTHERN_UZBEK_LANGUAGE)?.name
+    ).toEqual("Uzbekistan");
+    expect(
+      defaultRegionForLangTag("uzn-Sogd", NORTHERN_UZBEK_LANGUAGE)?.code
+    ).toEqual("CN");
+    expect(
+      defaultRegionForLangTag("uzn-Latn-IN", NORTHERN_UZBEK_LANGUAGE)?.name
+    ).toEqual("India");
+
+    expect(
+      defaultRegionForLangTag("arb", STANDARD_ARABIC_LANGUAGE)?.name
+    ).toEqual("Egypt");
+    expect(
+      defaultRegionForLangTag("arb-Arab-x-foo", STANDARD_ARABIC_LANGUAGE)?.name
+    ).toEqual("Egypt");
+  });
+});
+
+describe("ensureLangSubtagIsIndivForReps", () => {
+  it("should return the original tag when not representative", () => {
+    expect(ensureLangSubtagIsIndivForReps("en", undefined)).toEqual("en");
+    expect(ensureLangSubtagIsIndivForReps("en", ENGLISH_LANGUAGE)).toEqual(
+      "en"
+    );
+    expect(
+      ensureLangSubtagIsIndivForReps("en-Latn-US-x-foo", ENGLISH_LANGUAGE)
+    ).toEqual("en-Latn-US-x-foo");
+  });
+
+  it("should replace the canonical macrolang subtag for representative languages", () => {
+    expect(
+      ensureLangSubtagIsIndivForReps("ar", STANDARD_ARABIC_LANGUAGE)
+    ).toEqual("arb");
+    expect(
+      ensureLangSubtagIsIndivForReps(
+        "ar-Arab-EG-x-foo",
+        STANDARD_ARABIC_LANGUAGE
+      )
+    ).toEqual("arb-Arab-EG-x-foo");
+
+    expect(
+      ensureLangSubtagIsIndivForReps("uz-Latn-UZ", NORTHERN_UZBEK_LANGUAGE)
+    ).toEqual("uzn-Latn-UZ");
+  });
+});
+
+describe("ensureLangSubtagIsCanonicalForReps", () => {
+  it("should return the languageSubtag for non-representative languages", () => {
+    expect(ensureLangSubtagIsCanonicalForReps("eng", ENGLISH_LANGUAGE)).toEqual(
+      "en"
+    );
+    expect(
+      ensureLangSubtagIsCanonicalForReps("en-Latn-US", ENGLISH_LANGUAGE)
+    ).toEqual("en-Latn-US");
+  });
+
+  it("should replace first subtag with canonical macrolanguage code for representative languages", () => {
+    expect(
+      ensureLangSubtagIsCanonicalForReps("arb", STANDARD_ARABIC_LANGUAGE)
+    ).toEqual("ar");
+    expect(
+      ensureLangSubtagIsCanonicalForReps(
+        "arb-Arab-EG",
+        STANDARD_ARABIC_LANGUAGE
+      )
+    ).toEqual("ar-Arab-EG");
+    expect(
+      ensureLangSubtagIsCanonicalForReps("uzn", NORTHERN_UZBEK_LANGUAGE)
+    ).toEqual("uz");
+    expect(
+      ensureLangSubtagIsCanonicalForReps("uzn-Latn-UZ", NORTHERN_UZBEK_LANGUAGE)
+    ).toEqual("uz-Latn-UZ");
+  });
+
+  it("should fall back to languageSubtag if alternativeTags is empty", () => {
+    const repLanguageNoAltTags = {
+      iso639_3_code: "arb",
+      languageSubtag: "arb",
+      isRepresentativeForMacrolanguage: true, // representative language, so a canonical subtag would normally be looked up
+      alternativeTags: [], // deliberately empty: there is no first alternative tag to take a canonical subtag from
+      isMacrolanguage: false,
+      exonym: "Standard Arabic",
+      scripts: [{ code: "Arab", name: "Arabic" }],
+      regionNamesForDisplay: "",
+      regionNamesForSearch: [],
+      names: [],
+      languageType: LanguageType.Living,
+    } as ILanguage;
+    expect(
+      ensureLangSubtagIsCanonicalForReps("arb", repLanguageNoAltTags)
+    ).toEqual("arb"); // the tag keeps the fixture's languageSubtag ("arb") rather than becoming "ar"
+  });
+  it("should preserve all other subtags when replacing first subtag", () => {
+    expect(
+      ensureLangSubtagIsCanonicalForReps(
+        "arb-Arab-EG-x-foo-x-bar",
+        STANDARD_ARABIC_LANGUAGE
+      )
+    ).toEqual("ar-Arab-EG-x-foo-x-bar");
+  });
 });
 
 describe("createTagFromOrthography", () => {
@@ -440,6 +629,82 @@ describe("createTagFromOrthography", () => {
       })
     ).toEqual("en-x-ai-newFancy");
   });
+
+  it("should return indiv iso code when language is representative for a macrolanguage", () => {
+    expect(
+      createTagFromOrthography({ language: STANDARD_ARABIC_LANGUAGE })
+    ).toEqual("arb");
+    expect(
+      createTagFromOrthography({
+        language: NORTHERN_UZBEK_LANGUAGE,
+        script: { code: "Cyrl", name: "Cyrillic" },
+      })
+    ).toEqual("uzn-Cyrl");
+
+    expect(
+      createTagFromOrthography({
+        language: NORTHERN_UZBEK_LANGUAGE,
+        script: { code: "Latn", name: "Latin" },
+      })
+    ).toEqual("uzn");
+
+    expect(
+      createTagFromOrthography({
+        language: STANDARD_ARABIC_LANGUAGE,
+        customDetails: {
+          region: { name: "Egypt", code: "EG" },
+          dialect: "foobar",
+        },
+      })
+    ).toEqual("arb-x-foobar");
+  });
+
+  it("should create tags for macrolanguages with the preferred code", () => {
+    // Pure macrolanguages (without isRepresentativeForMacrolanguage) should use their language subtag
+    expect(
+      createTagFromOrthography({ language: ARABIC_MACROLANGUAGE })
+    ).toEqual("ar");
+    expect(
+      createTagFromOrthography({ language: AYMARA_MACROLANGUAGE })
+    ).toEqual("ay");
+  });
+
+  // See macrolanguageNotes.md regarding the special cases. Note however that our desired behavior for Akan (aka/ak) and
+  // Sanskrit (san/sa) is achieved by the search result modifier (searchResultModifier.ts)
+  it("should handle special cases", () => {
+    // Bosnian should give "bs"
+    expect(createTagFromOrthography({ language: BOSNIAN_LANGUAGE })).toEqual(
+      "bs"
+    );
+    // Montenegrin should give "cnr"
+    expect(
+      createTagFromOrthography({ language: MONTENEGRIN_LANGUAGE })
+    ).toEqual("cnr");
+    // Croatian should give "hr"
+    expect(createTagFromOrthography({ language: CROATIAN_LANGUAGE })).toEqual(
+      "hr"
+    );
+    // Serbian should give "sr"
+    expect(createTagFromOrthography({ language: SERBIAN_LANGUAGE })).toEqual(
+      "sr"
+    );
+    // Serbo-Croatian macrolanguage should use "sh"
+    expect(
+      createTagFromOrthography({ language: SERBO_CROATIAN_MACROLANGUAGE })
+    ).toEqual("sh");
+    // Norwegian Bokmål should use "nb"
+    expect(
+      createTagFromOrthography({ language: NORWEGIAN_BOKMAL_LANGUAGE })
+    ).toEqual("nb");
+    // Norwegian Nynorsk should use "nn"
+    expect(
+      createTagFromOrthography({ language: NORWEGIAN_NYNORSK_LANGUAGE })
+    ).toEqual("nn");
+    // Norwegian macrolanguage should use "no"
+    expect(
+      createTagFromOrthography({ language: NORWEGIAN_MACROLANGUAGE })
+    ).toEqual("no");
+  });
 });
 
 describe("isValidBcp47Tag checking is sane", () => {