From bb18120ddbfa5f9d7d9f406de325e319e664d63c Mon Sep 17 00:00:00 2001 From: Ilya Simpson Date: Tue, 6 May 2025 20:18:06 +1200 Subject: [PATCH] Denote Markdown code blocks as Python. --- README.md | 468 +++++++++++++++++++++++++++++++----------------------- 1 file changed, 272 insertions(+), 196 deletions(-) diff --git a/README.md b/README.md index 2c1af732..fa9c3356 100644 --- a/README.md +++ b/README.md @@ -82,52 +82,66 @@ This function standardizes tags, as strings, in several ways. It replaces overlong tags with their shortest version, and also formats them according to the conventions of BCP 47: - >>> from langcodes import * - >>> standardize_tag('eng_US') - 'en-US' +```python +>>> from langcodes import * +>>> standardize_tag('eng_US') +'en-US' +``` It removes script subtags that are redundant with the language: - >>> standardize_tag('en-Latn') - 'en' +```python +>>> standardize_tag('en-Latn') +'en' +``` It replaces deprecated values with their correct versions, if possible: - >>> standardize_tag('en-uk') - 'en-GB' +```python +>>> standardize_tag('en-uk') +'en-GB' +``` Sometimes this involves complex substitutions, such as replacing Serbo-Croatian (`sh`) with Serbian in Latin script (`sr-Latn`), or the entire tag `sgn-US` with `ase` (American Sign Language). - >>> standardize_tag('sh-QU') - 'sr-Latn-EU' +```python +>>> standardize_tag('sh-QU') +'sr-Latn-EU' - >>> standardize_tag('sgn-US') - 'ase' +>>> standardize_tag('sgn-US') +'ase' +``` If *macro* is True, it uses macrolanguage codes as a replacement for the most common standardized language within that macrolanguage. - >>> standardize_tag('arb-Arab', macro=True) - 'ar' +```python +>>> standardize_tag('arb-Arab', macro=True) +'ar' +``` Even when *macro* is False, it shortens tags that contain both the macrolanguage and the language: - >>> standardize_tag('zh-cmn-hans-cn') - 'zh-Hans-CN' +```python +>>> standardize_tag('zh-cmn-hans-cn') +'zh-Hans-CN' +``` If the tag can't be parsed according to BCP 47, this will raise a LanguageTagError (a subclass of ValueError): - >>> standardize_tag('spa-latn-mx') - 'es-MX' +```python +>>> standardize_tag('spa-latn-mx') +'es-MX' - >>> standardize_tag('spa-mx-latn') - Traceback (most recent call last): - ... - langcodes.tag_parser.LanguageTagError: This script subtag, 'latn', is out of place. Expected variant, extension, or end of string. +>>> standardize_tag('spa-mx-latn') +Traceback (most recent call last): + ... +langcodes.tag_parser.LanguageTagError: This script subtag, 'latn', is out of place. Expected variant, extension, or end of string. +``` ## Language objects @@ -157,34 +171,40 @@ By default, it will replace non-standard and overlong tags as it interprets them. To disable this feature and get the codes that literally appear in the language tag, use the *normalize=False* option. 
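A minimal sketch of working with the parsed object — it assumes the attributes are named `language`, `script`, and `territory`, mirroring the `Language.make(...)` fields shown in the examples just below, which this section does not spell out:

```python
# Sketch: read subtags back off a parsed Language object.
# Assumes attribute names matching the Language.make() fields shown below.
from langcodes import Language

tag = Language.get('en-Latn-US')
print(tag.language)   # expected: 'en'
print(tag.script)     # expected: 'Latn'
print(tag.territory)  # expected: 'US'
```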
- >>> Language.get('en-Latn-US') - Language.make(language='en', script='Latn', territory='US') +```python +>>> Language.get('en-Latn-US') +Language.make(language='en', script='Latn', territory='US') - >>> Language.get('sgn-US', normalize=False) - Language.make(language='sgn', territory='US') +>>> Language.get('sgn-US', normalize=False) +Language.make(language='sgn', territory='US') - >>> Language.get('und') - Language.make() +>>> Language.get('und') +Language.make() +``` Here are some examples of replacing non-standard tags: - >>> Language.get('sh-QU') - Language.make(language='sr', script='Latn', territory='EU') +```python +>>> Language.get('sh-QU') +Language.make(language='sr', script='Latn', territory='EU') - >>> Language.get('sgn-US') - Language.make(language='ase') +>>> Language.get('sgn-US') +Language.make(language='ase') - >>> Language.get('zh-cmn-Hant') - Language.make(language='zh', script='Hant') +>>> Language.get('zh-cmn-Hant') +Language.make(language='zh', script='Hant') +``` Use the `str()` function on a Language object to convert it back to its standard string form: - >>> str(Language.get('sh-QU')) - 'sr-Latn-EU' +```python +>>> str(Language.get('sh-QU')) +'sr-Latn-EU' - >>> str(Language.make(territory='IN')) - 'und-IN' +>>> str(Language.make(territory='IN')) +'und-IN' +``` ### Checking validity @@ -197,50 +217,66 @@ validity. We don't check other parts such as extlangs or Unicode extensions. For example, `ja` is a valid language code, and `jp` is not: - >>> Language.get('ja').is_valid() - True +```python +>>> Language.get('ja').is_valid() +True - >>> Language.get('jp').is_valid() - False +>>> Language.get('jp').is_valid() +False +``` The top-level function `tag_is_valid(tag)` is possibly more convenient to use, because it can return False even for tags that don't parse: - >>> tag_is_valid('C') - False +```python +>>> tag_is_valid('C') +False +``` If one subtag is invalid, the entire code is invalid: - >>> tag_is_valid('en-000') - False +```python +>>> tag_is_valid('en-000') +False +``` `iw` is valid, though it's a deprecated alias for `he`: - >>> tag_is_valid('iw') - True +```python +>>> tag_is_valid('iw') +True +``` The empty language tag (`und`) is valid: - >>> tag_is_valid('und') - True +```python +>>> tag_is_valid('und') +True +``` Private use codes are valid: - >>> tag_is_valid('x-other') - True +```python +>>> tag_is_valid('x-other') +True - >>> tag_is_valid('qaa-Qaai-AA-x-what-even-is-this') - True +>>> tag_is_valid('qaa-Qaai-AA-x-what-even-is-this') +True +``` Language tags that are very unlikely are still valid: - >>> tag_is_valid('fr-Cyrl') - True +```python +>>> tag_is_valid('fr-Cyrl') +True +``` Tags with non-ASCII characters are invalid, because they don't parse: - >>> tag_is_valid('zh-普通话') - False +```python +>>> tag_is_valid('zh-普通话') +False +``` ### Getting alpha3 codes @@ -266,33 +302,39 @@ bibliographic code. When this method returns, it always returns a 3-letter string. - >>> Language.get('fr').to_alpha3() - 'fra' - >>> Language.get('fr-CA').to_alpha3() - 'fra' - >>> Language.get('fr-CA').to_alpha3(variant='B') - 'fre' - >>> Language.get('de').to_alpha3() - 'deu' - >>> Language.get('no').to_alpha3() - 'nor' - >>> Language.get('un').to_alpha3() - Traceback (most recent call last): - ... - LookupError: 'un' is not a known language code, and has no alpha3 code. 
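If you need to convert many tags at once, one rough pattern is to catch the `LookupError` raised for codes with no alpha3 equivalent. A sketch — the helper name and the `None` fallback are illustrative, not part of langcodes; the raw behavior, including the `LookupError`, appears in the examples below:

```python
# Illustrative helper, not part of langcodes: convert a tag to its
# alpha3 code, returning None when no alpha3 code is known.
from langcodes import Language

def alpha3_or_none(tag):
    try:
        return Language.get(tag).to_alpha3()
    except LookupError:  # e.g. 'un', as shown in the examples below
        return None

print([alpha3_or_none(t) for t in ('fr', 'de', 'no', 'un')])
# expected: ['fra', 'deu', 'nor', None]
```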
+```python +>>> Language.get('fr').to_alpha3() +'fra' +>>> Language.get('fr-CA').to_alpha3() +'fra' +>>> Language.get('fr-CA').to_alpha3(variant='B') +'fre' +>>> Language.get('de').to_alpha3() +'deu' +>>> Language.get('no').to_alpha3() +'nor' +>>> Language.get('un').to_alpha3() +Traceback (most recent call last): + ... +LookupError: 'un' is not a known language code, and has no alpha3 code. +``` For many languages, the terminology and bibliographic alpha3 codes are the same. - >>> Language.get('en').to_alpha3(variant='T') - 'eng' - >>> Language.get('en').to_alpha3(variant='B') - 'eng' +```python +>>> Language.get('en').to_alpha3(variant='T') +'eng' +>>> Language.get('en').to_alpha3(variant='B') +'eng' +``` When you use any of these "overlong" alpha3 codes in langcodes, they normalize back to the alpha2 code: - >>> Language.get('zho') - Language.make(language='zh') +```python +>>> Language.get('zho') +Language.make(language='zh') +``` ## Working with language names @@ -315,77 +357,89 @@ English, plus CLDR, which names languages in many commonly-used languages. The default language for naming things is English: - >>> Language.make(language='fr').display_name() - 'French' +```python +>>> Language.make(language='fr').display_name() +'French' - >>> Language.make().display_name() - 'Unknown language' +>>> Language.make().display_name() +'Unknown language' - >>> Language.get('zh-Hans').display_name() - 'Chinese (Simplified)' +>>> Language.get('zh-Hans').display_name() +'Chinese (Simplified)' - >>> Language.get('en-US').display_name() - 'English (United States)' +>>> Language.get('en-US').display_name() +'English (United States)' +``` But you can ask for language names in numerous other languages: - >>> Language.get('fr').display_name('fr') - 'français' +```python +>>> Language.get('fr').display_name('fr') +'français' - >>> Language.get('fr').display_name('es') - 'francés' +>>> Language.get('fr').display_name('es') +'francés' - >>> Language.make().display_name('es') - 'lengua desconocida' +>>> Language.make().display_name('es') +'lengua desconocida' - >>> Language.get('zh-Hans').display_name('de') - 'Chinesisch (Vereinfacht)' +>>> Language.get('zh-Hans').display_name('de') +'Chinesisch (Vereinfacht)' - >>> Language.get('en-US').display_name('zh-Hans') - '英语(美国)' +>>> Language.get('en-US').display_name('zh-Hans') +'英语(美国)' +``` Why does everyone get Slovak and Slovenian confused? Let's ask them. 
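A loop like this sketch would generate the four answers shown below:

```python
# Sketch: ask Slovenian (sl) and Slovak (sk) what they call each other.
from langcodes import Language

for named in ('sl', 'sk'):
    for asked_in in ('sl', 'sk'):
        print(f"{named} in {asked_in}:",
              Language.get(named).display_name(asked_in))
```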
- >>> Language.get('sl').display_name('sl') - 'slovenščina' - >>> Language.get('sk').display_name('sk') - 'slovenčina' - >>> Language.get('sl').display_name('sk') - 'slovinčina' - >>> Language.get('sk').display_name('sl') - 'slovaščina' +```python +>>> Language.get('sl').display_name('sl') +'slovenščina' +>>> Language.get('sk').display_name('sk') +'slovenčina' +>>> Language.get('sl').display_name('sk') +'slovinčina' +>>> Language.get('sk').display_name('sl') +'slovaščina' +``` If the language has a script or territory code attached to it, these will be described in parentheses: - >>> Language.get('en-US').display_name() - 'English (United States)' +```python +>>> Language.get('en-US').display_name() +'English (United States)' +``` Sometimes these can be the result of tag normalization, such as in this case where the legacy tag 'sh' becomes 'sr-Latn': - >>> Language.get('sh').display_name() - 'Serbian (Latin)' +```python +>>> Language.get('sh').display_name() +'Serbian (Latin)' - >>> Language.get('sh', normalize=False).display_name() - 'Serbo-Croatian' +>>> Language.get('sh', normalize=False).display_name() +'Serbo-Croatian' +``` Naming a language in itself is sometimes a useful thing to do, so the `.autonym()` method makes this easy, providing the display name of a language in the language itself: - >>> Language.get('fr').autonym() - 'français' - >>> Language.get('es').autonym() - 'español' - >>> Language.get('ja').autonym() - '日本語' - >>> Language.get('en-AU').autonym() - 'English (Australia)' - >>> Language.get('sr-Latn').autonym() - 'srpski (latinica)' - >>> Language.get('sr-Cyrl').autonym() - 'српски (ћирилица)' +```python +>>> Language.get('fr').autonym() +'français' +>>> Language.get('es').autonym() +'español' +>>> Language.get('ja').autonym() +'日本語' +>>> Language.get('en-AU').autonym() +'English (Australia)' +>>> Language.get('sr-Latn').autonym() +'srpski (latinica)' +>>> Language.get('sr-Cyrl').autonym() +'српски (ћирилица)' +``` The names come from the Unicode CLDR data files, and in English they can also come from the IANA language subtag registry. Together, they can give @@ -399,12 +453,14 @@ You can get the parts of the name separately with the methods `.language_name()` that are present using the `.describe()` method. These methods also accept a language code for what language they should be described in. - >>> shaw = Language.get('en-Shaw-GB') - >>> shaw.describe('en') - {'language': 'English', 'script': 'Shavian', 'territory': 'United Kingdom'} +```python +>>> shaw = Language.get('en-Shaw-GB') +>>> shaw.describe('en') +{'language': 'English', 'script': 'Shavian', 'territory': 'United Kingdom'} - >>> shaw.describe('es') - {'language': 'inglés', 'script': 'shaviano', 'territory': 'Reino Unido'} +>>> shaw.describe('es') +{'language': 'inglés', 'script': 'shaviano', 'territory': 'Reino Unido'} +``` ### Recognizing language names in natural language @@ -415,18 +471,22 @@ its name, converting a natural language name such as "French" to a code such as The name can be in any language that CLDR supports (see "Ambiguity" below). 
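Seen this way, `find()` is roughly the inverse of `.display_name()`. A minimal sketch of that round trip, reusing only values that appear in examples elsewhere in this README:

```python
# Sketch: a display_name() -> find() round trip.
import langcodes

name = langcodes.Language.get('fr').display_name('es')  # 'francés'
lang = langcodes.find(name)
print(str(lang))  # expected: 'fr', since str() gives the standard tag
```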
- >>> import langcodes - >>> langcodes.find('french') - Language.make(language='fr') +```python +>>> import langcodes +>>> langcodes.find('french') +Language.make(language='fr') - >>> langcodes.find('francés') - Language.make(language='fr') +>>> langcodes.find('francés') +Language.make(language='fr') +``` However, this method currently ignores the parenthetical expressions that come from `.display_name()`: - >>> langcodes.find('English (Canada)') - Language.make(language='en') +```python +>>> langcodes.find('English (Canada)') +Language.make(language='en') +``` There is still room to improve the way that language names are matched, because some languages are not consistently named the same way. The method currently @@ -448,8 +508,10 @@ For example, no matter whether you decide "Tagalog" refers to the language code `tl` or the largely overlapping code `fil`, that distinction doesn't depend on the language you're saying "Tagalog" in. We can just return `tl` consistently. - >>> langcodes.find('tagalog') - Language.make(language='tl') +```python +>>> langcodes.find('tagalog') +Language.make(language='tl') +``` In the few cases of actual interlingual ambiguity, langcodes won't match a result. You can pass in a `language=` parameter to say what language the name is in. @@ -458,22 +520,24 @@ For example, there are two distinct languages called "Tonga" in various language They are `to`, the language of Tonga which is called "Tongan" in English; and `tog`, a language of Malawi that can be called "Nyasa Tonga" in English. - >>> langcodes.find('tongan') - Language.make(language='to') +```python +>>> langcodes.find('tongan') +Language.make(language='to') - >>> langcodes.find('nyasa tonga') - Language.make(language='tog') +>>> langcodes.find('nyasa tonga') +Language.make(language='tog') - >>> langcodes.find('tonga') - Traceback (most recent call last): - ... - LookupError: Can't find any language named 'tonga' +>>> langcodes.find('tonga') +Traceback (most recent call last): +... +LookupError: Can't find any language named 'tonga' - >>> langcodes.find('tonga', language='id') - Language.make(language='to') +>>> langcodes.find('tonga', language='id') +Language.make(language='to') - >>> langcodes.find('tonga', language='ca') - Language.make(language='tog') +>>> langcodes.find('tonga', language='ca') +Language.make(language='tog') +``` Other ambiguous names written in Latin letters are "Kiga", "Mbundu", "Roman", and "Ruanda". @@ -491,40 +555,46 @@ package to be installed. be limited to a particular territory with a territory code (such as a country code). - >>> Language.get('es').speaking_population() - 493528077 +```python +>>> Language.get('es').speaking_population() +493528077 - >>> Language.get('pt').speaking_population() - 237496885 +>>> Language.get('pt').speaking_population() +237496885 - >>> Language.get('es-BR').speaking_population() - 76218 +>>> Language.get('es-BR').speaking_population() +76218 - >>> Language.get('pt-BR').speaking_population() - 192661560 +>>> Language.get('pt-BR').speaking_population() +192661560 - >>> Language.get('vo').speaking_population() - 0 +>>> Language.get('vo').speaking_population() +0 +``` Script codes will be ignored, because the script is not involved in speaking: - >>> Language.get('es-Hant').speaking_population() ==\ - ... Language.get('es').speaking_population() - True +```python +>>> Language.get('es-Hant').speaking_population() ==\ +... Language.get('es').speaking_population() +True +``` `.writing_population()` estimates how many people write a language. 
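Both population methods return plain integers, so the two estimates are easy to compare side by side. A sketch — like the examples below, it needs the optional `language_data` package installed:

```python
# Sketch: compare the speaking and writing estimates for one language.
from langcodes import Language

zh = Language.get('zh')
print(zh.speaking_population())  # total estimated speakers
print(zh.writing_population())   # total estimated writers (often an overestimate)
```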
-
- >>> all = Language.get('zh').writing_population()
- >>> all
- 1240841517
- >>> traditional = Language.get('zh-Hant').writing_population()
- >>> traditional
- 36863340
+```python
+>>> all = Language.get('zh').writing_population()
+>>> all
+1240841517
+
+>>> traditional = Language.get('zh-Hant').writing_population()
+>>> traditional
+36863340

- >>> simplified = Language.get('zh-Hans').writing_population()
- >>> all == traditional + simplified
- True
+>>> simplified = Language.get('zh-Hans').writing_population()
+>>> all == traditional + simplified
+True
+```

The estimates for "writing population" are often overestimates, as described
in the [CLDR documentation on territory data][overestimates]. In most cases,
@@ -537,10 +607,12 @@ write in a _different_ language.

Like `.speaking_population()`, this can
be limited to a particular territory:

- >>> Language.get('zh-Hant-HK').writing_population()
- 6439733
- >>> Language.get('zh-Hans-HK').writing_population()
- 338933
+```python
+>>> Language.get('zh-Hant-HK').writing_population()
+6439733
+>>> Language.get('zh-Hans-HK').writing_population()
+338933
+```

## Comparing and matching languages

@@ -584,42 +656,46 @@ The `max_distance` parameter lets you set a cutoff on what counts as language
support. It has a default of 25, a value that is probably okay for simple
cases of i18n, but you might want to set it lower to require more precision.

- >>> closest_supported_match('fr', ['de', 'en', 'fr'])
- 'fr'
+```python
+>>> closest_supported_match('fr', ['de', 'en', 'fr'])
+'fr'

- >>> closest_supported_match('pt', ['pt-BR', 'pt-PT'])
- 'pt-BR'
+>>> closest_supported_match('pt', ['pt-BR', 'pt-PT'])
+'pt-BR'

- >>> closest_supported_match('en-AU', ['en-GB', 'en-US'])
- 'en-GB'
+>>> closest_supported_match('en-AU', ['en-GB', 'en-US'])
+'en-GB'

- >>> closest_supported_match('af', ['en', 'nl', 'zu'])
- 'nl'
+>>> closest_supported_match('af', ['en', 'nl', 'zu'])
+'nl'

- >>> closest_supported_match('und', ['en', 'und'])
- 'und'
+>>> closest_supported_match('und', ['en', 'und'])
+'und'

- >>> print(closest_supported_match('af', ['en', 'nl', 'zu'], max_distance=10))
- None
+>>> print(closest_supported_match('af', ['en', 'nl', 'zu'], max_distance=10))
+None
+```

A similar function is `closest_match(desired_language, supported_languages)`,
which returns both the best matching language tag and the distance. If there
is no match, it returns ('und', 1000).

- >>> closest_match('fr', ['de', 'en', 'fr'])
- ('fr', 0)
+```python
+>>> closest_match('fr', ['de', 'en', 'fr'])
+('fr', 0)

- >>> closest_match('sh', ['hr', 'bs', 'sr-Latn', 'sr-Cyrl'])
- ('sr-Latn', 0)
+>>> closest_match('sh', ['hr', 'bs', 'sr-Latn', 'sr-Cyrl'])
+('sr-Latn', 0)

- >>> closest_match('id', ['zsm', 'mhp'])
- ('zsm', 14)
+>>> closest_match('id', ['zsm', 'mhp'])
+('zsm', 14)

- >>> closest_match('ja', ['ja-Latn-hepburn', 'en'])
- ('und', 1000)
+>>> closest_match('ja', ['ja-Latn-hepburn', 'en'])
+('und', 1000)

- >>> closest_match('ja', ['ja-Latn-hepburn', 'en'], max_distance=60)
- ('ja-Latn-hepburn', 50)
+>>> closest_match('ja', ['ja-Latn-hepburn', 'en'], max_distance=60)
+('ja-Latn-hepburn', 50)
+```

## Further API documentation