From f0c625b2ad69c84281e7a2c426235d376741d8ae Mon Sep 17 00:00:00 2001 From: Michael Hansen Date: Wed, 12 Apr 2023 20:27:09 -0500 Subject: [PATCH] Add language util (#91290) * Add language util * Add no match tests * Update tests/util/test_language.py Co-authored-by: Paulus Schoutsen --------- Co-authored-by: Paulus Schoutsen --- homeassistant/util/language.py | 145 +++++++++++++++++++++++++++++++++ tests/util/test_language.py | 123 ++++++++++++++++++++++++++++ 2 files changed, 268 insertions(+) create mode 100644 homeassistant/util/language.py create mode 100644 tests/util/test_language.py diff --git a/homeassistant/util/language.py b/homeassistant/util/language.py new file mode 100644 index 00000000000..06c4915c902 --- /dev/null +++ b/homeassistant/util/language.py @@ -0,0 +1,145 @@ +"""Helper methods for language selection in Home Assistant.""" +from __future__ import annotations + +from collections.abc import Iterable +from dataclasses import dataclass +import operator +import re + +SEPARATOR_RE = re.compile(r"[-_]") + + +def preferred_regions( + language: str, + country: str | None = None, + code: str | None = None, +) -> Iterable[str | None]: + """Yield preferred regions for a language based on country/code hints.""" + if country is not None: + yield country.upper() + + if language == "en": + # Prefer U.S. English if no country + if country is None: + yield "US" + elif language == "zh": + if code == "Hant": + yield "HK" + elif code == "Hans": + yield "TW" + else: + # Prefer China if no matching code + yield "CN" + + # fr -> fr-FR + yield language.upper() + + +def is_region(language: str, region: str | None) -> bool: + """Return true if region is not known to be a script/code instead.""" + if language == "es": + return region != "419" + + if language == "sr": + return region != "Latn" + + if language == "zh": + return region not in ("Hans", "Hant") + + return True + + +@dataclass +class Dialect: + """Language with optional region and script/code.""" + + language: str + region: str | None + code: str | None = None + + def __post_init__(self) -> None: + """Fix casing of language/region.""" + # Languages are lower-cased + self.language = self.language.casefold() + + if self.region is not None: + # Regions are upper-cased + self.region = self.region.upper() + + def score(self, dialect: Dialect, country: str | None = None) -> int: + """Return score for match with another dialect where higher is better. + + Score < 0 indicates a failure to match. + """ + if self.language != dialect.language: + # Not a match + return -1 + + if self.region == dialect.region: + # Language + region match + return 1 + + pref_regions: set[str | None] = set() + if (self.region is None) or (dialect.region is None): + # Generate a set of preferred regions + pref_regions = set( + preferred_regions( + self.language, + country=country, + code=self.code, + ) + ) + + # Replace missing regions with preferred + regions = pref_regions if self.region is None else {self.region} + other_regions = pref_regions if dialect.region is None else {dialect.region} + + # Better match if there is overlap in regions + return 1 if regions.intersection(other_regions) else 0 + + @staticmethod + def parse(tag: str) -> Dialect: + """Parse language tag into language/region/code.""" + parts = SEPARATOR_RE.split(tag, maxsplit=1) + language = parts[0] + region: str | None = None + code: str | None = None + + if len(parts) > 1: + region_or_code = parts[1] + if is_region(language, region_or_code): + # US, GB, etc. + region = region_or_code + else: + # Hant, 419, etc. + code = region_or_code + + return Dialect( + language=language, + region=region, + code=code, + ) + + +def matches( + target: str, supported: Iterable[str], country: str | None = None +) -> list[str]: + """Return a sorted list of matching language tags based on a target tag and country hint.""" + target_dialect = Dialect.parse(target) + + # Higher score is better + scored = sorted( + ( + ( + dialect := Dialect.parse(tag), + target_dialect.score(dialect, country=country), + tag, + ) + for tag in supported + ), + key=operator.itemgetter(1), + reverse=True, + ) + + # Score < 0 is not a match + return [tag for _dialect, score, tag in scored if score >= 0] diff --git a/tests/util/test_language.py b/tests/util/test_language.py new file mode 100644 index 00000000000..4dfaa8307d3 --- /dev/null +++ b/tests/util/test_language.py @@ -0,0 +1,123 @@ +"""Test Home Assistant language util methods.""" +from __future__ import annotations + +from homeassistant.util import language + + +def test_region_match() -> None: + """Test that an exact language/region match is preferred.""" + assert language.matches("en-GB", ["fr-Fr", "en-US", "en-GB"]) == [ + "en-GB", + "en-US", + ] + + +def test_no_match() -> None: + """Test that an empty list is returned when there is no match.""" + assert ( + language.matches( + "en-US", + ["de-DE", "fr-FR", "zh"], + ) + == [] + ) + + assert ( + language.matches( + "en", + ["de-DE", "fr-FR", "zh"], + ) + == [] + ) + + assert language.matches("en", []) == [] + + +def test_prefer_us_english() -> None: + """Test that U.S. English is preferred when no region is provided.""" + assert language.matches("en", ["en-GB", "en-US", "fr-FR"]) == [ + "en-US", + "en-GB", + ] + + +def test_country_preferred() -> None: + """Test that country hint disambiguates.""" + assert language.matches( + "en", + ["fr-Fr", "en-US", "en-GB"], + country="GB", + ) == [ + "en-GB", + "en-US", + ] + + +def test_language_as_region() -> None: + """Test that the language itself can be interpreted as a region.""" + assert language.matches( + "fr", + ["en-US", "en-GB", "fr-CA", "fr-FR"], + ) == [ + "fr-FR", + "fr-CA", + ] + + +def test_zh_hant() -> None: + """Test that the zh-Hant defaults to HK.""" + assert language.matches( + "zh-Hant", + ["en-US", "en-GB", "zh-CN", "zh-HK", "zh-TW"], + ) == [ + "zh-HK", + "zh-CN", + "zh-TW", + ] + + +def test_zh_hans() -> None: + """Test that the zh-Hans defaults to TW.""" + assert language.matches( + "zh-Hans", + ["en-US", "en-GB", "zh-CN", "zh-HK", "zh-TW"], + ) == [ + "zh-TW", + "zh-CN", + "zh-HK", + ] + + +def test_zh_no_code() -> None: + """Test that the zh defaults to CN.""" + assert language.matches( + "zh", + ["en-US", "en-GB", "zh-CN", "zh-HK", "zh-TW"], + ) == [ + "zh-CN", + "zh-HK", + "zh-TW", + ] + + +def test_es_419() -> None: + """Test that the es-419 matches es dialects.""" + assert language.matches( + "es-419", + ["en-US", "en-GB", "es-CL", "es-US", "es-ES"], + ) == [ + "es-ES", + "es-CL", + "es-US", + ] + + +def test_sr_latn() -> None: + """Test that the sr_Latn matches sr dialects.""" + assert language.matches( + "sr-Latn", + ["en-US", "en-GB", "sr-CS", "sr-RS"], + ) == [ + "sr-CS", + "sr-RS", + ]